In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Survey data file (path is relative to the notebook's working directory)
file_path = 'hcmst2017to2022.csv'

# Load the CSV and convert every column to a numeric dtype in one pass.
# errors='coerce' means any value that cannot be parsed as a number becomes NaN.
df = pd.read_csv(file_path).apply(pd.to_numeric, errors='coerce')

# Columns of interest: case id, relationship section for waves 1-3,
# respondent's political party and partner's political party
columns_to_print = [
    'caseid_new',
    'w1_section', 'w2_section', 'w3_section',
    'w1_partyid7', 'w1_q12',
]

# Preview just those columns
print(df[columns_to_print])

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
0          53001           1         2.0         3.0          6.0     1.0
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
3         121759           1         1.0         NaN          2.0     4.0
4         158083           2         6.0         6.0          3.0     2.0
...          ...         ...         ...         ...          ...     ...
3505     2967957           1         NaN         NaN          7.0     6.0
3506     2968357           1         NaN         NaN          3.0     4.0
3507     2968971           1         NaN         NaN          5.0     4.0
3508     2969933           1         NaN         NaN          6.0     7.0
3509     2972135           1         NaN         NaN          3.0     5.0

[3510 rows x 6 columns]


In [10]:
# Show the distinct values that occur in each column of interest
columns_to_inspect = [
    'w1_section',   # relationship status in wave 1
    'w2_section',   # relationship status in wave 2
    'w3_section',   # relationship status in wave 3
    'w1_partyid7',  # respondent's political party
    'w1_q12',       # partner's political party
]
for column_name in columns_to_inspect:
    print(df[column_name].unique())


[1 2 3]
[ 2.  1.  6.  3. nan  5.  4.]
[ 3.  1. nan  6.  5.  4.  2.]
[ 6.  3.  7.  2.  1.  5. nan  4.]
[ 1.  3.  7.  4.  2.  6.  5. nan -1.]


In [11]:
# Start with the couples that stayed together through waves 1, 2 and 3.
#
# Section codes used below:
#   w1_section == 1        -> partnered in 2017 (wave 1)
#   w2/w3_section == 1     -> still married to the same partner
#   w2/w3_section == 3     -> still partnered with the same partner, but not married
# Respondents with a new partner in wave 2 or 3 are deliberately left out.
partnered_w1 = df['w1_section'].eq(1)
together_w2 = df['w2_section'].isin([1, 3])
together_w3 = df['w3_section'].isin([1, 3])

# A couple counts as "stayed together" only if all three wave checks hold
combined_condition = partnered_w1 & together_w2 & together_w3

# Keep the matching rows, restricted to the columns of interest, and show them
filtered_rows = df.loc[combined_condition, columns_to_print]
print(filtered_rows)

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
8         214227           1         1.0         1.0          3.0     6.0
...          ...         ...         ...         ...          ...     ...
3438     2958163           1         1.0         1.0          3.0     3.0
3448     2959699           1         1.0         1.0          5.0     6.0
3459     2961175           1         1.0         1.0          2.0     6.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[1096 rows x 6 columns]


In [12]:
# Homophily: among couples that stayed together through all three waves,
# which ones have exactly the same 'score' on the survey's political spectrum?
#
# Section codes used below:
#   w1_section == 1        -> partnered in 2017 (wave 1)
#   w2/w3_section == 1     -> still married to the same partner
#   w2/w3_section == 3     -> still partnered with the same partner, but not married
# Respondents with a new partner in wave 2 or 3 are deliberately left out.
partnered_w1 = df['w1_section'].eq(1)
together_w2 = df['w2_section'].isin([1, 3])
together_w3 = df['w3_section'].isin([1, 3])

# Respondent and partner gave the identical political leaning score
same_party_score = df['w1_partyid7'].eq(df['w1_q12'])

# All four checks must hold
combined_condition = partnered_w1 & together_w2 & together_w3 & same_party_score

# Keep the matching rows, restricted to the columns of interest, and show them
filtered_rowsHomophily = df.loc[combined_condition, columns_to_print]
print(filtered_rowsHomophily)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
9         218351           1         1.0         1.0          2.0     2.0
...          ...         ...         ...         ...          ...     ...
3338     2943301           1         3.0         3.0          5.0     5.0
3402     2951689           1         1.0         1.0          7.0     7.0
3438     2958163           1         1.0         1.0          3.0     3.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[408 rows x 6 columns]


In [15]:
# From the previous result: 408 of the 1096 couples that stayed together through
# all 3 waves (about 37%) gave the exact same political score.
#
# Next: couples that stayed together and whose scores differ by exactly one step.
# NOTE(review): the "Outliers" naming suggests the goal was couples furthest apart
# on the political spectrum, but this filter keeps pairs that are only one step
# apart - confirm which selection was intended.
#
# Section codes used below:
#   w1_section == 1        -> partnered in 2017 (wave 1)
#   w2/w3_section == 1     -> still married to the same partner
#   w2/w3_section == 3     -> still partnered with the same partner, but not married
# Respondents with a new partner in wave 2 or 3 are deliberately left out.
partnered_w1 = df['w1_section'].eq(1)
together_w2 = df['w2_section'].isin([1, 3])
together_w3 = df['w3_section'].isin([1, 3])

# Absolute gap between respondent and partner score; a gap of 1 covers both
# directions (respondent one above the partner, or one below)
party_gap = (df['w1_partyid7'] - df['w1_q12']).abs()
one_step_apart = party_gap.eq(1)

# All four checks must hold
combined_condition = partnered_w1 & together_w2 & together_w3 & one_step_apart

# Keep the matching rows, restricted to the columns of interest, and show them
filtered_rowsOutliers = df.loc[combined_condition, columns_to_print]
print(filtered_rowsOutliers)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
20        516823           1         1.0         1.0          2.0     1.0
42        632253           1         1.0         1.0          5.0     4.0
44        637531           1         1.0         1.0          6.0     5.0
50        646157           1         1.0         1.0          3.0     4.0
53        650237           1         1.0         1.0          3.0     2.0
...          ...         ...         ...         ...          ...     ...
3344     2943697           1         3.0         3.0          3.0     4.0
3358     2945353           1         1.0         1.0          7.0     6.0
3362     2946347           1         1.0         1.0          6.0     5.0
3387     2948891           1         1.0         1.0          6.0     7.0
3448     2959699           1         1.0         1.0          5.0     6.0

[323 rows x 6 columns]


In [16]:
# Number of selected couples
print(len(filtered_rowsOutliers))

# Case ids of the selected couples, printed as a comma-separated array
outlierArr = filtered_rowsOutliers['caseid_new'].to_numpy()
print(np.array2string(outlierArr, separator=','))

323
[ 516823, 632253, 637531, 646157, 650237, 651259, 709075, 727775, 730391,
  731249, 783399, 783679, 811175, 814285, 815831, 854869, 859445, 867567,
  874193, 892115, 900463, 908663, 948417, 959621, 962359, 981251,1000407,
 1001637,1119259,1123417,1129859,1145895,1150579,1168011,1181279,1182387,
 1183381,1196757,1224777,1230191,1234663,1239479,1246677,1246869,1250651,
 1256141,1262965,1267611,1272909,1282581,1283019,1287847,1300409,1301087,
 1344091,1348915,1372419,1374661,1377765,1385783,1394211,1409151,1422915,
 1433383,1445757,1465133,1513043,1513047,1519185,1527623,1548291,1550311,
 1550999,1551703,1572635,1572839,1587197,1593839,1604541,1607815,1626763,
 1630669,1631303,1634959,1638615,1645953,1648699,1677291,1692681,1698683,
 1712517,1718337,1730087,1731569,1761763,1763355,1765905,1768233,1780607,
 1785549,1788973,1791495,1793997,1807189,1814173,1816407,1825937,1831591,
 1834561,1846375,1850929,1851973,1856209,1857323,1858747,1860057,1860737,
 1860835,1869479,1873259,1877643,1

In [7]:
# 323 of the 1096 couples that stayed together through all 3 waves were exactly
# one step apart on the political spectrum (the filter keeps score differences of 1).
# Pull every column for those cases and export them to inspect individual attributes.
filtered_rowsOutliersAllCols = df[df['caseid_new'].isin(outlierArr)]
print(filtered_rowsOutliersAllCols)

# index=False: the row index carries no information here, caseid_new identifies each row
filtered_rowsOutliersAllCols.to_csv('outliers_politicalparty_1_6.csv', index=False)

      caseid_new  w3_Weight  w3_Weight_LGB  w3_combo_weight  \
20        516823     0.6230            NaN         0.697823   
42        632253     0.7803            NaN         0.874014   
44        637531     2.0142            NaN         2.256106   
50        646157     0.6193            NaN         0.693678   
53        650237     0.9995            NaN         1.119540   
...          ...        ...            ...              ...   
3344     2943697     1.2699            NaN         1.422416   
3358     2945353     1.8949            NaN         2.122478   
3362     2946347     0.7681            NaN         0.860349   
3387     2948891     0.4924            NaN         0.551538   
3448     2959699     0.7411            NaN         0.830106   

      w3_attrition_adj_weight  w2_weight_genpop  w2_weight_LGB  \
20                   0.745254            0.5923            NaN   
42                   0.657640            1.0205            NaN   
44                   1.977244            1.55

In [8]:
# Per-column median of the exported outlier rows.
# The result is a Series: its index holds the column names and its values the medians.
# (The variable keeps its historical "averages" name so later cells still work.)
column_averagesOutliers = filtered_rowsOutliersAllCols.median()

# Label the output as what it is: a median, not a mean
print("Median of each column:")
print(column_averagesOutliers)

# Write the index as well: for this Series the index IS the column name, so
# dropping it (index=False) would leave a file of unlabeled numbers.
column_averagesOutliers.to_csv(
    'outliersMedianValues_politicalparty.csv',
    index=True,
    index_label='column',
    header=['median'],
)

# Optional (disabled): per-column value frequencies.
#column_frequenciesOutliers = filtered_rowsOutliersAllCols.apply(pd.Series.value_counts)
#column_frequenciesOutliers.to_csv('outliersValuesFrequencies_politicalparty.csv', index=False)

Average of each column:
caseid_new                 2.053645e+06
w3_Weight                  7.821000e-01
w3_Weight_LGB              4.163000e-01
w3_combo_weight            8.419796e-01
w3_attrition_adj_weight    6.892316e-01
                               ...     
p20_pppa1905               0.000000e+00
p20_pppa1648               2.000000e+00
p20_ppp20072               5.000000e+00
p20_ppp20071               2.000000e+00
p20_ppp2date2020           2.021051e+07
Length: 725, dtype: float64
