In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Survey extract to analyse (path is relative to the notebook's working directory)
file_path = 'hcmst2017to2022.csv'

# Load the CSV and coerce every column to a numeric dtype in one pass;
# any value that cannot be parsed as a number becomes NaN (errors='coerce').
df = pd.read_csv(file_path).apply(pd.to_numeric, errors='coerce')

# Columns used throughout the analysis: case id, the per-wave section codes,
# and the two party-score columns compared in later cells.
columns_to_print = ['caseid_new', 'w1_section', 'w2_section', 'w3_section', 'w1_partyid7', 'w1_q12']

# Preview just those columns
preview = df[columns_to_print]
print(preview)

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
0          53001           1         2.0         3.0          6.0     1.0
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
3         121759           1         1.0         NaN          2.0     4.0
4         158083           2         6.0         6.0          3.0     2.0
...          ...         ...         ...         ...          ...     ...
3505     2967957           1         NaN         NaN          7.0     6.0
3506     2968357           1         NaN         NaN          3.0     4.0
3507     2968971           1         NaN         NaN          5.0     4.0
3508     2969933           1         NaN         NaN          6.0     7.0
3509     2972135           1         NaN         NaN          3.0     5.0

[3510 rows x 6 columns]


In [10]:
# Inspect the set of distinct values found in each column of interest.
# The mapping records what each column encodes; only the keys are used below.
columns_to_inspect = {
    'w1_section': 'relationship status in wave 1',
    'w2_section': 'relationship status in wave 2',
    'w3_section': 'relationship status in wave 3',
    'w1_partyid7': "respondent's political party",
    'w1_q12': "partner's political party",
}

for survey_column in columns_to_inspect:
    print(df[survey_column].unique())


[1 2 3]
[ 2.  1.  6.  3. nan  5.  4.]
[ 3.  1. nan  6.  5.  4.  2.]
[ 6.  3.  7.  2.  1.  5. nan  4.]
[ 1.  3.  7.  4.  2.  6.  5. nan -1.]


In [11]:
# Start with the couples that stayed together through waves 1, 2 and 3.
#
# Section codes relied on here:
#   w1_section == 1      -> partnered in 2017
#   w2/w3_section == 1   -> still married to the same partner in that wave
#   w2/w3_section == 3   -> still with the same partner, but not married
# Respondents with a new partner in wave 2 or 3 are deliberately excluded.
same_partner_codes = [1, 3]

partnered_w1 = df['w1_section'].eq(1)
together_w2 = df['w2_section'].isin(same_partner_codes)
together_w3 = df['w3_section'].isin(same_partner_codes)

# A couple qualifies only if all three wave checks hold
combined_condition = partnered_w1 & together_w2 & together_w3

# Keep the matching rows, restricted to the columns of interest, and show them
filtered_rows = df.loc[combined_condition, columns_to_print]
print(filtered_rows)

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
8         214227           1         1.0         1.0          3.0     6.0
...          ...         ...         ...         ...          ...     ...
3438     2958163           1         1.0         1.0          3.0     3.0
3448     2959699           1         1.0         1.0          5.0     6.0
3459     2961175           1         1.0         1.0          2.0     6.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[1096 rows x 6 columns]


In [12]:
# Homophily: couples who share the same political leaning.
# This cell keeps only couples with the exact same 'score' on the survey's
# political spectrum, among those who stayed together through all three waves.
#
# Section codes relied on here:
#   w1_section == 1      -> partnered in 2017
#   w2/w3_section == 1   -> still married to the same partner in that wave
#   w2/w3_section == 3   -> still with the same partner, but not married
# Respondents with a new partner in wave 2 or 3 are deliberately excluded.
same_partner_codes = [1, 3]

stayed_together = (
    df['w1_section'].eq(1)
    & df['w2_section'].isin(same_partner_codes)
    & df['w3_section'].isin(same_partner_codes)
)

# Respondent and partner gave the same party score
# (a missing score never compares equal, so those rows drop out)
same_party_score = df['w1_partyid7'].eq(df['w1_q12'])

combined_condition = stayed_together & same_party_score

# Keep the matching rows, restricted to the columns of interest, and show them
filtered_rowsHomophily = df.loc[combined_condition, columns_to_print]
print(filtered_rowsHomophily)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
9         218351           1         1.0         1.0          2.0     2.0
...          ...         ...         ...         ...          ...     ...
3338     2943301           1         3.0         3.0          5.0     5.0
3402     2951689           1         1.0         1.0          7.0     7.0
3438     2958163           1         1.0         1.0          3.0     3.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[408 rows x 6 columns]


In [13]:
# From the previous result: 408 of the 1096 couples that stayed together through
# the 3 waves (about 37%) gave the exact same party score.

# Now look at couples who sit far apart from each other on the political spectrum.

condition1 = df['w1_section'] == 1 # those who were partnered in 2017
condition2 = df['w2_section'] == 1 # who are still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still with that same partner but not married, in wave 2
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # who are still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still with that same partner but not married, in wave 3

# Party scores are compared as points on the 1..7 scale. w1_q12 also contains
# the code -1 (visible in its unique values), which is not a scale point --
# presumably a refused/non-answer code; TODO confirm against the codebook.
# Without this guard, respondent == 2 with partner == -1 gives 2 - (-1) == 3
# and would be miscounted as a 3-step gap. NaN scores fail `between` as well.
PARTY_SCALE_MIN, PARTY_SCALE_MAX = 1, 7
PARTY_GAP_STEPS = 3
respondent_party = df['w1_partyid7']
partner_party = df['w1_q12']
valid_party_scores = (
    respondent_party.between(PARTY_SCALE_MIN, PARTY_SCALE_MAX)
    & partner_party.between(PARTY_SCALE_MIN, PARTY_SCALE_MAX)
)

# respondent is exactly 3 steps above the partner on the scale
# (the original note describes 3 steps as the next gap examined after the 4-step one)
condition6 = valid_party_scores & ((respondent_party - partner_party) == PARTY_GAP_STEPS)
# and vice versa: partner is exactly 3 steps above the respondent
condition7 = valid_party_scores & ((partner_party - respondent_party) == PARTY_GAP_STEPS)

# Combine conditions using boolean AND (&)
combined_condition = condition1 & (condition2 | condition3) & (condition4 | condition5) & (condition6 | condition7)

# Filter rows based on the combined condition and keep the specified columns
filtered_rowsOutliers = df.loc[combined_condition, columns_to_print]

# Print the filtered rows
print(filtered_rowsOutliers)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
8         214227           1         1.0         1.0          3.0     6.0
22        582849           1         1.0         1.0          1.0     4.0
25        587125           1         1.0         1.0          6.0     3.0
64        662355           1         1.0         1.0          7.0     4.0
165       844189           1         3.0         3.0          2.0     5.0
...          ...         ...         ...         ...          ...     ...
3158     2910231           1         1.0         1.0          6.0     3.0
3162     2910943           1         1.0         1.0          3.0     6.0
3203     2917189           1         1.0         1.0          7.0     4.0
3398     2950149           1         3.0         1.0          5.0     2.0
3401     2951535           1         3.0         3.0          7.0     4.0

[77 rows x 6 columns]


In [14]:
# Case ids of the outlier couples, as a NumPy array (reused by the export cell)
outlierArr = filtered_rowsOutliers['caseid_new'].to_numpy()

# How many outlier couples there are
print(len(filtered_rowsOutliers))

# The ids in NumPy's default layout, then comma-separated for easy copy/paste
print(outlierArr)
print(np.array2string(outlierArr, separator=','))

77
[ 214227  582849  587125  662355  844189  855297  867449  964713  981685
  982323  983511 1152229 1170887 1193613 1203831 1236545 1269605 1338065
 1374845 1378293 1418329 1439067 1612001 1679339 1682591 1692049 1718309
 1734941 1740893 1754559 1761743 1808717 1809325 1835975 1890779 1938115
 1961301 1966851 2013981 2017641 2027307 2071877 2086637 2111275 2148015
 2166521 2210355 2227047 2232397 2233103 2271349 2289087 2294693 2423007
 2440417 2461283 2480385 2555355 2630193 2642425 2649847 2658923 2700049
 2751707 2765643 2832811 2838801 2845837 2846681 2854677 2893071 2909343
 2910231 2910943 2917189 2950149 2951535]
[ 214227, 582849, 587125, 662355, 844189, 855297, 867449, 964713, 981685,
  982323, 983511,1152229,1170887,1193613,1203831,1236545,1269605,1338065,
 1374845,1378293,1418329,1439067,1612001,1679339,1682591,1692049,1718309,
 1734941,1740893,1754559,1761743,1808717,1809325,1835975,1890779,1938115,
 1961301,1966851,2013981,2017641,2027307,2071877,2086637,2111275,2148015,
 

In [7]:
# The previous result shows that 77 of the 1096 couples that stayed together
# through the 3 waves were 3 steps apart on the political spectrum.
# Export every column for those respondents so their individual attributes
# can be examined.
# NOTE(review): the "_1_5" suffix in the output file name does not obviously
# match the 3-step filter used to build outlierArr -- confirm the intended name.

# Rows whose case id appears in the outlier id list
condition1 = df['caseid_new'].isin(outlierArr)
filtered_rowsOutliersAllCols = df[condition1]

print(filtered_rowsOutliersAllCols)
filtered_rowsOutliersAllCols.to_csv('outliers_politicalparty_1_5.csv', index=False)

      caseid_new  w3_Weight  w3_Weight_LGB  w3_combo_weight  \
8         214227     0.8591            NaN         0.962278   
22        582849     1.1820            NaN         1.323959   
25        587125     0.6449            NaN         0.722353   
64        662355        NaN         0.8700         0.306258   
165       844189     0.6965            NaN         0.780150   
...          ...        ...            ...              ...   
3158     2910231     0.7536            NaN         0.844108   
3162     2910943     0.9197            NaN         1.030156   
3203     2917189     0.6086         0.8533         0.214240   
3398     2950149     1.0976            NaN         1.229422   
3401     2951535        NaN         1.6789         0.591008   

      w3_attrition_adj_weight  w2_weight_genpop  w2_weight_LGB  \
8                    0.886445            0.7040            NaN   
22                   1.105244            1.1743            NaN   
25                   0.502589            0.63

In [8]:
# Median of every column across the outlier rows.
# The result is a Series indexed by column name.
column_averagesOutliers = filtered_rowsOutliersAllCols.median()

# Label matches the statistic actually computed (median, not mean)
print("Median of each column:")
print(column_averagesOutliers)

# Write the index as well: it holds the column names. With index=False the
# file would contain only a bare list of numbers that cannot be mapped back
# to their survey columns.
column_averagesOutliers.to_csv(
    'outliersMedianValues_politicalparty.csv',
    index_label='column',
    header=['median'],
)

# TODO: a per-column value-frequency export (value_counts on each column) was
# sketched here but never enabled; if revived, keep the index in that CSV too.

Average of each column:
caseid_new                 2.013981e+06
w3_Weight                  7.859000e-01
w3_Weight_LGB              6.555000e-01
w3_combo_weight            8.433237e-01
w3_attrition_adj_weight    6.909198e-01
                               ...     
p20_pppa1905               0.000000e+00
p20_pppa1648               2.000000e+00
p20_ppp20072               5.000000e+00
p20_ppp20071               2.000000e+00
p20_ppp2date2020           2.021053e+07
Length: 725, dtype: float64
