In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Location of the survey extract that every later cell works from.
file_path = 'hcmst2017to2022.csv'

# Load the CSV and push every column through pd.to_numeric. With
# errors='coerce', any value that cannot be parsed as a number becomes NaN,
# so purely textual columns end up entirely NaN.
df = pd.read_csv(file_path).apply(pd.to_numeric, errors='coerce')

# Columns this notebook focuses on: the case id, the section recorded in each
# of the three waves, and the two wave-1 party columns compared further down.
columns_to_print = [
    'caseid_new',
    'w1_section', 'w2_section', 'w3_section',
    'w1_partyid7', 'w1_q12',
]

# Show only those columns.
print(df.loc[:, columns_to_print])

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
0          53001           1         2.0         3.0          6.0     1.0
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
3         121759           1         1.0         NaN          2.0     4.0
4         158083           2         6.0         6.0          3.0     2.0
...          ...         ...         ...         ...          ...     ...
3505     2967957           1         NaN         NaN          7.0     6.0
3506     2968357           1         NaN         NaN          3.0     4.0
3507     2968971           1         NaN         NaN          5.0     4.0
3508     2969933           1         NaN         NaN          6.0     7.0
3509     2972135           1         NaN         NaN          3.0     5.0

[3510 rows x 6 columns]


In [15]:
# Survey the distinct codes present in each column of interest, printed in
# this order:
#   w1_section  - relationship status in wave 1
#   w2_section  - relationship status in wave 2
#   w3_section  - relationship status in wave 3
#   w1_partyid7 - respondent's political party
#   w1_q12      - partner's political party
for column_name in ('w1_section', 'w2_section', 'w3_section', 'w1_partyid7', 'w1_q12'):
    print(df[column_name].unique())


[1 2 3]
[ 2.  1.  6.  3. nan  5.  4.]
[ 3.  1. nan  6.  5.  4.  2.]
[ 6.  3.  7.  2.  1.  5. nan  4.]
[ 1.  3.  7.  4.  2.  6.  5. nan -1.]


In [16]:
# Start with the couples who stayed together through waves 1, 2 and 3.
# Section codes relied on below:
#   w1_section == 1         -> partnered in 2017
#   w2_/w3_section == 1     -> still married to that same partner
#   w2_/w3_section == 3     -> still with that same partner, but not married
# Respondents with a new partner in wave 2 or wave 3 are deliberately left out.
partnered_w1 = df['w1_section'] == 1
same_partner_w2 = df['w2_section'].isin([1, 3])
same_partner_w3 = df['w3_section'].isin([1, 3])

# All three must hold. A NaN section never matches, so respondents missing
# from a later wave are excluded.
combined_condition = partnered_w1 & same_partner_w2 & same_partner_w3

# Keep the matching rows, restricted to the columns of interest, and show them.
filtered_rows = df.loc[combined_condition, columns_to_print]
print(filtered_rows)

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
8         214227           1         1.0         1.0          3.0     6.0
...          ...         ...         ...         ...          ...     ...
3438     2958163           1         1.0         1.0          3.0     3.0
3448     2959699           1         1.0         1.0          5.0     6.0
3459     2961175           1         1.0         1.0          2.0     6.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[1096 rows x 6 columns]


In [17]:
# Homophily check: among the couples who stayed together through all three
# waves, which ones have the exact same 'score' on the survey's political
# spectrum for the respondent (w1_partyid7) and the partner (w1_q12)?

condition1 = df['w1_section'] == 1 # those who were partnered in 2017
condition2 = df['w2_section'] == 1 # still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still with that same partner, unmarried, in wave 2
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still with that same partner, unmarried, in wave 3
condition6 = df['w1_partyid7'] == df['w1_q12'] # same political leaning 'score'

# Combine conditions using boolean AND (&)
combined_condition = condition1 & (condition2 | condition3) & (condition4 | condition5) & condition6

# Filter rows based on the combined condition, keeping the columns of interest
filtered_rowsHomophily = df.loc[combined_condition, columns_to_print]

# Print the filtered rows
print(filtered_rowsHomophily)

# Case ids of the same-score couples. These get their own name rather than
# `outlierArr`: that name is what the export cell reads to write the
# opposite-ends "outliers" CSV, and reusing it here meant that running the
# cells out of order could export these couples as outliers instead.
homophilyCaseIds = filtered_rowsHomophily['caseid_new'].values
print(np.array2string(homophilyCaseIds, separator=','))

# TODO: the frequency export below is disabled. It refers to
# filtered_rowsHomophilyAllCols, which is not defined in the cells shown here;
# build that frame first before re-enabling.
#column_frequenciesHomophily = filtered_rowsHomophilyAllCols.apply(pd.Series.value_counts)
#column_frequenciesHomophily.to_csv('homophilValuesFrequencies_politicalparty.csv', index=False)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
9         218351           1         1.0         1.0          2.0     2.0
...          ...         ...         ...         ...          ...     ...
3338     2943301           1         3.0         3.0          5.0     5.0
3402     2951689           1         1.0         1.0          7.0     7.0
3438     2958163           1         1.0         1.0          3.0     3.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[408 rows x 6 columns]
[  71609, 106983, 164061, 212249, 218351, 369975, 428211, 589881, 608697,
  634833, 6435

In [18]:
# From above: 408 (about 37%) of the 1096 couples that stayed together through
# the 3 waves scored exactly the same.
#
# Now look at the couples who are furthest away from each other on the
# political spectrum, i.e. one partner at 1 and the other at 7.

condition1 = df['w1_section'] == 1 # those who were partnered in 2017
condition2 = df['w2_section'] == 1 # still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still with that same partner, unmarried, in wave 2
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still with that same partner, unmarried, in wave 3

respondent_party = df['w1_partyid7']
partner_party = df['w1_q12']

# Only compare scores that sit on the 1-7 scale. w1_q12 also contains -1
# (presumably a refused/missing code - confirm against the codebook); without
# this guard a respondent at 5 with a partner coded -1 would give 5 - (-1) = 6
# and be counted as an opposite-ends couple. NaN scores fail `between` too.
both_on_scale = respondent_party.between(1, 7) & partner_party.between(1, 7)

# Six steps apart in either direction: (7, 1) or (1, 7).
party_gap = respondent_party - partner_party
opposite_ends = both_on_scale & ((party_gap == 6) | (party_gap == -6))

# Combine conditions using boolean AND (&)
combined_condition = condition1 & (condition2 | condition3) & (condition4 | condition5) & opposite_ends

# Filter rows based on the combined condition, keeping the columns of interest
filtered_rowsOutliers = df.loc[combined_condition, columns_to_print]

# Print the filtered rows
print(filtered_rowsOutliers)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1027     1908725           1         1.0         1.0          7.0     1.0
1244     1991077           1         1.0         1.0          7.0     1.0
1415     2101713           1         3.0         3.0          7.0     1.0
2046     2437991           1         1.0         1.0          7.0     1.0
2391     2634995           1         1.0         1.0          1.0     7.0
2599     2737695           1         3.0         3.0          7.0     1.0
2620     2753941           1         1.0         1.0          7.0     1.0
2678     2774941           1         1.0         1.0          7.0     1.0
3050     2883019           1         3.0         3.0          1.0     7.0


In [13]:
# Pull the case ids of the opposite-ends couples; the export step reads them
# back through `outlierArr`.
outlierArr = filtered_rowsOutliers['caseid_new'].values

# Report how many such couples there are, then list their ids comma-separated.
print(len(filtered_rowsOutliers))
print(np.array2string(outlierArr, separator=','))

9
[1908725,1991077,2101713,2437991,2634995,2737695,2753941,2774941,2883019]


In [7]:
# Only 9 of the 1096 couples that stayed together through the 3 waves sat on
# opposite sides of the spectrum. Export every column for those respondents so
# their individual attributes can be inspected.

# Rows whose case id is one of the ids collected in `outlierArr`.
is_outlier_case = df['caseid_new'].isin(outlierArr)
filtered_rowsOutliersAllCols = df[is_outlier_case]

# Show the full-width rows, then write them out without the index column.
print(filtered_rowsOutliersAllCols)
filtered_rowsOutliersAllCols.to_csv('outliers_politicalparty_1_1.csv', index=False)

      caseid_new  w3_Weight  w3_Weight_LGB  w3_combo_weight  \
1027     1908725     2.1750            NaN         2.436219   
1244     1991077        NaN         1.4987         0.527574   
1415     2101713     0.9913         0.7212         0.348959   
2046     2437991     0.7224            NaN         0.809161   
2391     2634995     1.4355            NaN         1.607904   
2599     2737695     4.1723            NaN         4.673395   
2620     2753941     0.7006            NaN         0.784742   
2678     2774941        NaN         0.6802         0.239445   
3050     2883019     0.5122            NaN         0.573716   

      w3_attrition_adj_weight  w2_weight_genpop  w2_weight_LGB  \
1027                 2.737787            1.1976            NaN   
1244                 0.455181               NaN         1.0887   
1415                 0.737223            0.8002         0.4231   
2046                 0.679246            0.5829            NaN   
2391                 1.567201          