In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Survey export to analyse (path is relative to the notebook's working directory).
file_path = 'hcmst2017to2022.csv'

# Load the CSV and coerce every column to a numeric dtype in a single pass.
# Entries that cannot be parsed as numbers become NaN (errors='coerce').
df = pd.read_csv(file_path).apply(pd.to_numeric, errors='coerce')

# Columns used throughout the notebook: the case id, the section code for each
# of the three waves, and the wave-1 party-id columns for respondent and partner.
columns_to_print = [
    'caseid_new',
    'w1_section',
    'w2_section',
    'w3_section',
    'w1_partyid7',
    'w1_q12',
]

# Preview only the columns of interest.
print(df[columns_to_print])

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
0          53001           1         2.0         3.0          6.0     1.0
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
3         121759           1         1.0         NaN          2.0     4.0
4         158083           2         6.0         6.0          3.0     2.0
...          ...         ...         ...         ...          ...     ...
3505     2967957           1         NaN         NaN          7.0     6.0
3506     2968357           1         NaN         NaN          3.0     4.0
3507     2968971           1         NaN         NaN          5.0     4.0
3508     2969933           1         NaN         NaN          6.0     7.0
3509     2972135           1         NaN         NaN          3.0     5.0

[3510 rows x 6 columns]


In [10]:
# Show the distinct codes that actually occur in each column of interest,
# in the same order as before (one printed array per column).
columns_to_inspect = (
    'w1_section',   # relationship status in wave 1
    'w2_section',   # relationship status in wave 2
    'w3_section',   # relationship status in wave 3
    'w1_partyid7',  # respondent's political party
    'w1_q12',       # partner's political party
)
for inspected_column in columns_to_inspect:
    print(df[inspected_column].unique())


[1 2 3]
[ 2.  1.  6.  3. nan  5.  4.]
[ 3.  1. nan  6.  5.  4.  2.]
[ 6.  3.  7.  2.  1.  5. nan  4.]
[ 1.  3.  7.  4.  2.  6.  5. nan -1.]


In [11]:
# Couples that stayed together through waves 1, 2 and 3.
# Couples who report a new partner in wave 2 or 3 are intentionally not included.

condition1 = df['w1_section'].eq(1)  # partnered in 2017 (wave 1)
condition2 = df['w2_section'].eq(1)  # wave 2: still married to the same partner
condition3 = df['w2_section'].eq(3)  # wave 2: still with the same partner, not married
condition4 = df['w3_section'].eq(1)  # wave 3: still married to the same partner
condition5 = df['w3_section'].eq(3)  # wave 3: still with the same partner, not married

# Partnered in wave 1 AND together in wave 2 AND together in wave 3.
together_in_wave2 = condition2 | condition3
together_in_wave3 = condition4 | condition5
combined_condition = condition1 & together_in_wave2 & together_in_wave3

# Keep only the matching rows and the columns of interest, then show them.
filtered_rows = df.loc[combined_condition, columns_to_print]
print(filtered_rows)

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
8         214227           1         1.0         1.0          3.0     6.0
...          ...         ...         ...         ...          ...     ...
3438     2958163           1         1.0         1.0          3.0     3.0
3448     2959699           1         1.0         1.0          5.0     6.0
3459     2961175           1         1.0         1.0          2.0     6.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[1096 rows x 6 columns]


In [12]:
# Homophily: among couples that stayed together through all three waves, keep
# those where respondent and partner have exactly the same 'score' on the
# survey's political spectrum.

condition1 = df['w1_section'].eq(1)  # partnered in 2017 (wave 1)
condition2 = df['w2_section'].eq(1)  # wave 2: still married to the same partner
condition3 = df['w2_section'].eq(3)  # wave 2: still with the same partner, not married
condition4 = df['w3_section'].eq(1)  # wave 3: still married to the same partner
condition5 = df['w3_section'].eq(3)  # wave 3: still with the same partner, not married
# (couples with a new partner in wave 2 / wave 3 are not considered)
condition6 = df['w1_partyid7'].eq(df['w1_q12'])  # identical political 'score'

# Stayed together across the waves AND share the same score.
together_in_wave2 = condition2 | condition3
together_in_wave3 = condition4 | condition5
combined_condition = condition1 & together_in_wave2 & together_in_wave3 & condition6

# Keep only the matching rows and the columns of interest, then show them.
filtered_rowsHomophily = df.loc[combined_condition, columns_to_print]
print(filtered_rowsHomophily)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
9         218351           1         1.0         1.0          2.0     2.0
...          ...         ...         ...         ...          ...     ...
3338     2943301           1         3.0         3.0          5.0     5.0
3402     2951689           1         1.0         1.0          7.0     7.0
3438     2958163           1         1.0         1.0          3.0     3.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[408 rows x 6 columns]


In [13]:
# From the previous cell: 408 of the 1096 couples that stayed together through
# the 3 waves (about 37%) gave exactly the same political score.

# Now look at couples whose scores are exactly 2 steps apart on the political spectrum.

condition1 = df['w1_section'] == 1 # those who were partnered in 2017
condition2 = df['w2_section'] == 1 # who are still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still with that same partner but not married in wave 2
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # who are still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still with that same partner but not married in wave 3

# Only compare scores that are real points on the scale. The unique-value
# listing shows -1 among the w1_q12 values (presumably a refused/missing code --
# confirm against the codebook). Without this guard a respondent scoring 1 with
# a partner coded -1 has a difference of exactly 2 and would be wrongly counted
# as "2 steps apart". NaN compares False here, so missing values stay excluded.
valid_party = (df['w1_partyid7'] >= 1) & (df['w1_q12'] >= 1)

# respondent is 2 steps above the partner
condition6 = valid_party & ((df['w1_partyid7'] - df['w1_q12']) == 2)
# and vice versa: partner is 2 steps above the respondent
condition7 = valid_party & ((df['w1_q12'] - df['w1_partyid7']) == 2)

# Combine conditions using boolean AND (&)
combined_condition = condition1 & (condition2 | condition3) & (condition4 | condition5) & (condition6 | condition7)

# Filter rows based on the combined condition and keep the specified columns
filtered_rowsOutliers = df.loc[combined_condition, columns_to_print]

# Print the filtered rows
print(filtered_rowsOutliers)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
10        220655           1         1.0         1.0          6.0     4.0
46        643423           1         1.0         1.0          3.0     5.0
49        646023           1         1.0         1.0          3.0     1.0
71        698117           1         1.0         1.0          2.0     4.0
73        704661           1         1.0         1.0          6.0     4.0
...          ...         ...         ...         ...          ...     ...
3044     2882059           1         1.0         1.0          3.0     1.0
3045     2882207           1         1.0         1.0          1.0     3.0
3111     2897303           1         1.0         1.0          3.0     5.0
3252     2926595           1         1.0         1.0          1.0     3.0
3259     2928317           1         1.0         1.0          5.0     3.0

[231 rows x 6 columns]


In [14]:
# Number of couples selected by the 2-steps-apart filter.
print(len(filtered_rowsOutliers))

# Their case ids as a NumPy array, shown as one comma-separated listing.
outlierArr = filtered_rowsOutliers['caseid_new'].to_numpy()
outlier_ids_text = np.array2string(outlierArr, separator=',')
print(outlier_ids_text)

231
[ 220655, 643423, 646023, 698117, 704661, 709173, 726183, 729319, 731629,
  755761, 769207, 775045, 784141, 792153, 806423, 821909, 822315, 842437,
  844865, 876181, 985963,1002587,1049013,1097625,1105539,1116971,1127787,
 1130879,1140491,1150713,1179591,1181125,1226941,1243163,1244455,1251293,
 1276181,1311313,1333997,1335167,1336987,1348777,1376491,1407429,1423457,
 1425571,1435195,1437339,1437691,1439945,1473855,1478863,1552387,1568833,
 1570547,1587583,1604059,1631841,1633263,1635641,1656987,1660265,1682355,
 1697675,1726295,1738617,1760107,1761371,1767375,1788911,1794231,1795553,
 1812501,1828311,1844841,1850681,1852051,1861177,1870439,1874783,1884543,
 1898367,1900989,1910803,1913143,1915085,1920851,1920877,1924649,1925505,
 1926463,1928207,1940691,1952879,1954581,1974861,1978449,1980005,1984375,
 2011465,2014211,2016911,2025287,2025455,2052461,2083123,2084273,2086775,
 2090459,2116717,2118341,2125135,2138067,2141839,2146471,2153601,2163859,
 2168137,2179295,2206813,2214763,2

In [7]:
# 231 of the 1096 couples that stayed together through the 3 waves were 2 steps
# apart on the political spectrum. Export every column for those couples so
# their individual attributes can be examined.

# Select the full rows (all columns) whose case id is in the outlier id array.
condition1 = df['caseid_new'].isin(outlierArr)
filtered_rowsOutliersAllCols = df[condition1]

print(filtered_rowsOutliersAllCols)

# Write the selection to disk without the DataFrame's row index.
filtered_rowsOutliersAllCols.to_csv('outliers_politicalparty_1_6.csv', index=False)

      caseid_new  w3_Weight  w3_Weight_LGB  w3_combo_weight  \
10        220655     0.4449            NaN         0.498333   
46        643423     0.6002            NaN         0.672284   
49        646023     1.0451            NaN         1.170617   
71        698117     0.7767            NaN         0.869982   
73        704661     0.7006            NaN         0.784742   
...          ...        ...            ...              ...   
3044     2882059     0.5803            NaN         0.649994   
3045     2882207     0.9092            NaN         1.018395   
3111     2897303     0.9016            NaN         1.009883   
3252     2926595     0.5780            NaN         0.647418   
3259     2928317     1.0726            NaN         1.201420   

      w3_attrition_adj_weight  w2_weight_genpop  w2_weight_LGB  \
10                   0.407055            0.2980            NaN   
46                   0.561683            0.6051            NaN   
49                   0.949547            0.83

In [8]:
# Per-column median over the outlier couples (one value per DataFrame column).
column_averagesOutliers = filtered_rowsOutliersAllCols.median()

# The statistic is the median, so label it as such.
print("Median of each column:")
print(column_averagesOutliers)

# The Series index holds the column names. Writing with index=False would drop
# them and leave a file of unlabeled numbers, so keep the index and give both
# CSV columns an explicit header.
column_averagesOutliers.to_csv(
    'outliersMedianValues_politicalparty.csv',
    index_label='column',
    header=['median'],
)

#column_frequenciesOutliers = filtered_rowsOutliersAllCols.apply(pd.Series.value_counts)
#column_frequenciesOutliers.to_csv('outliersValuesFrequencies_politicalparty.csv', index=False)

Average of each column:
caseid_new                 2.153601e+06
w3_Weight                  7.804000e-01
w3_Weight_LGB              5.370000e-01
w3_combo_weight            8.268582e-01
w3_attrition_adj_weight    7.014951e-01
                               ...     
p20_pppa1905               0.000000e+00
p20_pppa1648               2.000000e+00
p20_ppp20072               5.000000e+00
p20_ppp20071               2.000000e+00
p20_ppp2date2020           2.021052e+07
Length: 725, dtype: float64
