In [1]:
import numpy as np
import pandas as pd

First, let's read in data and create appropriate variables for the different types of IPV and the weight variable.

Codebook notes: 

- d005 = weight
- d114 = how often partner drinks
- d107 = experience of severe physical violence
- d106 = experience of less severe physical violence
- d104 = experience of emotional violence
- d108 = experience of sexual violence

In [2]:
# Location of the DHS Stata file. Edit this single line to point at your local copy.
DHS_PATH = '/Users/allisontowey/Downloads/UA_2007_DHS_03022023_024_188135/UAIR51DT/UAIR51FL.DTA'

# Read data from Stata file. With convert_missing=True, Stata missing codes (e.g. ".z")
# are kept as StataMissingValue objects instead of NaN, so d005 is coerced explicitly below.
data = pd.read_stata(DHS_PATH, convert_categoricals=False, convert_missing=True)

# Coerce d005 to numeric; non-numeric entries such as ".z" become NaN
data['d005'] = pd.to_numeric(data['d005'], errors='coerce')

# Generate weights (d005 / 1,000,000). Use concat to reduce fragmentation warning.
# The new Series is given data's own index so the concat always aligns row-for-row.
data = pd.concat(
    [data, pd.Series(data['d005'].values / 1000000, index=data.index, name='wgt')],
    axis=1,
)

# Subset our data to if the respondent answered how often their husband drinks (d114).
# .copy() makes the subset an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning or write into a view of the full dataset.
data = data[data['d114'].isin([0, 1, 2])].copy()

# Combine severe and less severe physical abuse.
# NOTE(review): every value other than 0 or 9 counts as violence here, which includes any
# missing codes/NaN, whereas emo/sexual below use `== 1` -- confirm against the codebook.
data['severephys'] = ~data['d107'].isin([0, 9])
data['lessseverephys'] = ~data['d106'].isin([0, 9])
data['anyphys'] = data[['severephys', 'lessseverephys']].any(axis=1)

# Grab emotional and sexual abuse
data['emo'] = data['d104'] == 1
data['sexual'] = data['d108'] == 1

# Create combinations using logical operations. Each flag is a distinct pattern of
# (emo, sexual, anyphys), so a respondent falls into at most one of the seven groups.
data['sexual_only'] = ~data['emo'] & data['sexual'] & ~data['anyphys']
data['physical_only'] = ~data['emo'] & ~data['sexual'] & data['anyphys']
data['emotional_only'] = data['emo'] & ~data['sexual'] & ~data['anyphys']

data['all_types'] = data['emo'] & data['sexual'] & data['anyphys']
data['phys_sex'] = ~data['emo'] & data['sexual'] & data['anyphys']
data['phys_emo'] = data['emo'] & ~data['sexual'] & data['anyphys']
data['sex_emo'] = data['emo'] & data['sexual'] & ~data['anyphys']

In [3]:
# Swap the numeric d114 codes for readable labels (used as the row index of later tables).
data['d114'] = data['d114'].replace(
    {
        0: 'never',
        1: 'often',
        2: 'sometimes',
    }
)

Let's explore what we have for data by cross tabulating the d114 variable, denoting how often their partner drinks, with their experiences of IPV.

First, create a function and an input list that contains all of our created variables.

In [4]:
def create_crosstab(row_vars, df=None, col_var='d114', weight_var='wgt'):
    """Weighted totals of each boolean flag being True, split by a grouping column.

    Parameters
    ----------
    row_vars : list of str
        Names of boolean columns to tabulate; each becomes one output column.
    df : pandas.DataFrame, optional
        Frame holding the flags, the grouping column and the weights.
        Defaults to the notebook-level ``data`` frame.
    col_var : str, default 'd114'
        Column whose categories become the rows (index) of the result.
    weight_var : str, default 'wgt'
        Column of weights that is summed within each cell.

    Returns
    -------
    pandas.DataFrame
        Indexed by the categories of ``col_var`` with one column per entry of
        ``row_vars``. A cell is NaN when no row in that category has the flag
        set to True. An empty ``row_vars`` gives an empty DataFrame.
    """
    if df is None:
        df = data

    weighted_true = []
    for row_var in row_vars:
        # The string 'sum' selects pandas' own sum; passing the builtin `sum`
        # is deprecated in recent pandas releases.
        table = pd.crosstab(df[row_var], df[col_var], values=df[weight_var], aggfunc='sum', dropna=False)
        if True in table.index:
            true_row = table.loc[True]
        else:
            # No row has this flag set: report NaN per category instead of a KeyError.
            true_row = pd.Series(float('nan'), index=table.columns, dtype=float)
        weighted_true.append(true_row.rename(row_var))

    if not weighted_true:
        return pd.DataFrame()
    return pd.concat(weighted_true, axis=1)


In [5]:
# Names of the seven IPV-combination flag columns to tabulate.
lst = [
    'all_types',
    'phys_sex',
    'phys_emo',
    'sex_emo',
    'sexual_only',
    'physical_only',
    'emotional_only',
]
lst

['all_types',
 'phys_sex',
 'phys_emo',
 'sex_emo',
 'sexual_only',
 'physical_only',
 'emotional_only']

In [6]:
# Weighted total of each IPV combination (columns) per partner-drinking category (rows).
crosstab = create_crosstab(lst)

In [7]:
# Display the weighted cross-tabulation.
crosstab

Unnamed: 0_level_0,all_types,phys_sex,phys_emo,sex_emo,sexual_only,physical_only,emotional_only
d114,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
never,,,8.164097,,,2.333086,20.912317
often,37.136268,5.620653,86.59257,1.725297,0.770897,9.892501,38.826943
sometimes,17.134439,2.402444,96.649553,0.331862,1.452883,18.583953,151.426117


Now we have the weighted frequencies for every combination of IPV types, separated by how often the partner drinks. Next, we want the total weighted count of respondents in each partner-drinking category.

In [8]:
# Sum the weights within each d114 category and keep the result as a one-column frame named 'total'.
weighted_counts = data.groupby('d114')['wgt'].sum().to_frame(name='total')
weighted_counts

Unnamed: 0_level_0,total
d114,Unnamed: 1_level_1
never,436.457147
often,248.805814
sometimes,1163.525497


Concatenate that with our values for all of the IPV types individually.

In [9]:
# Place the per-category totals next to the IPV columns; both frames are indexed by d114.
df_withtotal = pd.concat([crosstab, weighted_counts], axis=1)

In [10]:
# Cells left as NaN by the cross-tabulation are replaced with a weighted count of 0.
df_withtotal = df_withtotal.fillna(0)
df_withtotal

Unnamed: 0_level_0,all_types,phys_sex,phys_emo,sex_emo,sexual_only,physical_only,emotional_only,total
d114,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
never,0.0,0.0,8.164097,0.0,0.0,2.333086,20.912317,436.457147
often,37.136268,5.620653,86.59257,1.725297,0.770897,9.892501,38.826943,248.805814
sometimes,17.134439,2.402444,96.649553,0.331862,1.452883,18.583953,151.426117,1163.525497


Now we want to see how many women in each 'partner drink' category do not experience any violence. Add that to the dataframe.

In [11]:
# Weighted count with no reported IPV: the category total minus the seven mutually
# exclusive IPV groups. Only the IPV columns listed in `lst` are subtracted, so re-running
# this cell after 'none' (or any later helper column) exists still gives the same answer.
df_withtotal['none'] = df_withtotal['total'] - df_withtotal[lst].sum(axis=1)
df_withtotal

Unnamed: 0_level_0,all_types,phys_sex,phys_emo,sex_emo,sexual_only,physical_only,emotional_only,total,none
d114,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
never,0.0,0.0,8.164097,0.0,0.0,2.333086,20.912317,436.457147,405.047647
often,37.136268,5.620653,86.59257,1.725297,0.770897,9.892501,38.826943,248.805814,68.240685
sometimes,17.134439,2.402444,96.649553,0.331862,1.452883,18.583953,151.426117,1163.525497,875.544246


We have 1000 boxes to display this information in our visualization. Let's see how many boxes per IPV/'partner drinks' group we need.

In [12]:
# Make sure every column is float before the share and box calculations below.
df_withtotal = df_withtotal.astype(float)

In [13]:
# Create an independent copy to hold the box counts. It is taken at this point, so it has
# only the columns df_withtotal has now; columns added to df_withtotal later are not copied.
df_percent = df_withtotal.copy()

Now we want to work out how many dots (squares) each group gets in our visualization. First, we calculate each crosstab value's share of the total weighted sample, and then multiply that share by our total number of boxes (1000).

In [14]:
# Share of the whole weighted sample that falls in each partner-drinking category
# (a proportion between 0 and 1, despite the column name), and the matching number of
# boxes out of 1000. The denominator is taken from the data instead of a hand-typed,
# rounded constant, and each column is assigned once rather than inside a loop.
df_withtotal['total_percent'] = df_withtotal['total'] / df_withtotal['total'].sum()
df_withtotal['total_dots'] = df_withtotal['total_percent'] * 1000

In [15]:
# Convert every group column (all except the three total-related ones) into a number of boxes:
# the group's share of its row total, times the row's share of the sample, times 1000.
share_cols = [
    name for name in df_withtotal.columns
    if name not in ('total', 'total_dots', 'total_percent')
]
df_percent[share_cols] = (
    df_withtotal[share_cols]
    .div(df_withtotal['total'], axis=0)
    .mul(df_withtotal['total_percent'], axis=0)
    .mul(1000)
)


In [16]:
# Display the unrounded box counts (the 'total' column still holds the weighted totals).
df_percent

Unnamed: 0_level_0,all_types,phys_sex,phys_emo,sex_emo,sexual_only,physical_only,emotional_only,total,none
d114,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
never,0.0,0.0,4.415914,0.0,0.0,1.261953,11.311353,436.457147,219.087969
often,20.086796,3.040179,46.837429,0.933203,0.416974,5.350798,21.001273,248.805814,36.910999
sometimes,9.267921,1.299468,52.277194,0.179502,0.785856,10.051955,81.905526,1163.525497,473.576905


Now we have a split of how many boxes we need for each group. Let's round to whole numbers. When a group represents more than 0 women but fewer than half a box (so ordinary rounding would give 0), we still represent it with 1 box.

In [17]:
# Round every value to the nearest whole box, except that values strictly between 0 and 0.5
# (which would round down to 0) are bumped up to 1 so small non-empty groups stay visible.
# Vectorised replacement for the deprecated DataFrame.applymap; the results are the same.
small_nonzero = (df_percent > 0) & (df_percent < .5)
df = df_percent.round().mask(small_nonzero, 1.0)


In [19]:
# Cast the box counts to integers and drop the 'total' column, which is not a group of boxes.
# errors='ignore' keeps the cell re-runnable once 'total' has already been removed.
df = df.astype(int).drop(columns=['total'], errors='ignore')
df

Unnamed: 0_level_0,all_types,phys_sex,phys_emo,sex_emo,sexual_only,physical_only,emotional_only,none
d114,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
never,0,0,4,0,0,1,11,219
often,20,3,47,1,1,5,21,37
sometimes,9,1,52,1,1,10,82,474


Done! Now we have the number of boxes needed to visualize the data. (See the HTML/JS documentation for replication code of viz)