In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Specify the file path
file_path = 'hcmst2017to2022.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Change string numeric values to actual numeric values
for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Display the DataFrame
#print(df.columns)

# Specify the column names you want to print
columns_to_print = ['caseid_new', 'w1_section', 'w2_section', 'w3_section', 'w1_partyid7', 'w1_q12']

# Print the specified columns
print(df[columns_to_print])

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
0          53001           1         2.0         3.0          6.0     1.0
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
3         121759           1         1.0         NaN          2.0     4.0
4         158083           2         6.0         6.0          3.0     2.0
...          ...         ...         ...         ...          ...     ...
3505     2967957           1         NaN         NaN          7.0     6.0
3506     2968357           1         NaN         NaN          3.0     4.0
3507     2968971           1         NaN         NaN          5.0     4.0
3508     2969933           1         NaN         NaN          6.0     7.0
3509     2972135           1         NaN         NaN          3.0     5.0

[3510 rows x 6 columns]


In [2]:
# looking at possible values for each col
print(df['w1_section'].unique())  # possible values for relationship status in wave 1
print(df['w2_section'].unique())  # possible values for relationship status in wave 2
print(df['w3_section'].unique())  # possible values for relationship status in wave 3
print(df['w1_partyid7'].unique()) # possible values for respondent's political party
print(df['w1_q12'].unique()) # possible values for partner's political party


[1 2 3]
[ 2.  1.  6.  3. nan  5.  4.]
[ 3.  1. nan  6.  5.  4.  2.]
[ 6.  3.  7.  2.  1.  5. nan  4.]
[ 1.  3.  7.  4.  2.  6.  5. nan -1.]


In [22]:
# lets start by looking at all the couples that have stayed together through wave 1, 2, and 3

condition1 = df['w1_section'] == 1 # those who were partenered in 2017
condition2 = df['w2_section'] == 1 # who are still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # who are still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 


# Combine conditions using boolean AND (&)
combined_condition = condition1 & (condition2 | condition3) & (condition4 | condition5)

# Filter rows based on the combined condition and print the specified columns
filtered_rowsStayedTogether = df.loc[combined_condition, columns_to_print]

# Print the filtered rows
print(filtered_rowsStayedTogether)

      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
8         214227           1         1.0         1.0          3.0     6.0
...          ...         ...         ...         ...          ...     ...
3438     2958163           1         1.0         1.0          3.0     3.0
3448     2959699           1         1.0         1.0          5.0     6.0
3459     2961175           1         1.0         1.0          2.0     6.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[1096 rows x 6 columns]


In [4]:
# let's look at homophily in couples based on sharing the same political ideology or leaning
# the following is for those who have the same exact 'score' on the survey's political spectrum

condition1 = df['w1_section'] == 1 # those who were partenered in 2017
condition2 = df['w2_section'] == 1 # who are still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # who are still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 
condition6 = df['w1_partyid7'] == df['w1_q12'] # same political leaning 'score'

# Combine conditions using boolean AND (&)
combined_condition = condition1 & (condition2 | condition3) & (condition4 | condition5) & condition6

# Filter rows based on the combined condition and print the specified columns
filtered_rowsHomophily = df.loc[combined_condition, columns_to_print]

# Print the filtered rows
print(filtered_rowsHomophily)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
1          71609           1         1.0         1.0          3.0     3.0
2         106983           1         1.0         1.0          7.0     7.0
5         164061           1         1.0         1.0          2.0     2.0
7         212249           1         1.0         1.0          1.0     1.0
9         218351           1         1.0         1.0          2.0     2.0
...          ...         ...         ...         ...          ...     ...
3338     2943301           1         3.0         3.0          5.0     5.0
3402     2951689           1         1.0         1.0          7.0     7.0
3438     2958163           1         1.0         1.0          3.0     3.0
3460     2961203           1         1.0         1.0          1.0     1.0
3486     2964987           1         1.0         1.0          1.0     1.0

[408 rows x 6 columns]


In [13]:
# from above - 408 (near half) of the 1096 couples that stayed together through the 3 waves scored the exact same

# so now let's look at couples who are the furthest away from each other on the political spectrum

condition1 = df['w1_section'] == 1 # those who were partenered in 2017
condition2 = df['w2_section'] == 1 # who are still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # who are still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 

# filtering for political party affiliation 1 step away
condition6 = df['w1_partyid7'] == -1
# and vice versa
condition7 = df['w1_q12'] == -1

# Combine conditions using boolean AND (&)
combined_condition = condition1 & (condition2 | condition3) & (condition4 | condition5) & (condition6 | condition7)

# Filter rows based on the combined condition and print the specified columns
filtered_rowsOutliers = df.loc[combined_condition, columns_to_print]

# Print the filtered rows
print(filtered_rowsOutliers)


      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
783      1765719           1         3.0         3.0          4.0    -1.0
823      1792531           1         3.0         3.0          4.0    -1.0
994      1891491           1         1.0         1.0          3.0    -1.0
1117     1938115           1         1.0         1.0          2.0    -1.0
1604     2210355           1         1.0         1.0          2.0    -1.0
1805     2302771           1         1.0         1.0          4.0    -1.0
2412     2642961           1         1.0         1.0          7.0    -1.0
2429     2647841           1         1.0         1.0          1.0    -1.0


In [14]:
print(filtered_rowsOutliers.shape[0])
outlierArr = filtered_rowsOutliers['caseid_new'].values
print(outlierArr)
print(np.array2string(outlierArr, separator=','))

8
[1765719 1792531 1891491 1938115 2210355 2302771 2642961 2647841]
[1765719,1792531,1891491,1938115,2210355,2302771,2642961,2647841]


In [11]:
# here we can see that 323 out of 1096 couples that stayed together through the 3 waves were on 2 steps away on the political spectrum
# let's export to look at the indiv attributes


condition1 = df['caseid_new'].isin(outlierArr)
filtered_rowsOutliersAllCols = df.loc[condition1]
print(filtered_rowsOutliersAllCols)
filtered_rowsOutliersAllCols.to_csv('outliers_politicalparty_1_6.csv', index=False)

Empty DataFrame
Columns: [caseid_new, w3_Weight, w3_Weight_LGB, w3_combo_weight, w3_attrition_adj_weight, w2_weight_genpop, w2_weight_LGB, w2_combo_weight, w2_attrition_adj_weights, w1_weight_combo, w1_weight_combo_freqwt, w3_xpartner_type, w3_xlast_contact_year, w3_xcohab, w3_xsamesex, w3_xlast_contact_mo, w3_xnamep_present, w3_duration, w3_gen_pop_sample, w3_ppage, w3_ppagecat, w3_ppagect4, w3_ppeduc, w3_ppeducat, w3_ppethm, w3_ppgender, w3_pphhhead, w3_pphhsize, w3_pphouse, w3_ppincimp, w3_real_inc, w3_log_real_inc, w3_ppmarit, w3_ppmsacat, w3_PPREG4, w3_ppreg9, w3_pprent, w3_PPT01, w3_PPT25, w3_PPT612, w3_total_kids_lt13, w3_PPT1317, w3_PPT18OV, w3_ppwork, w3_surveyed, w3_section, w3_partner_type, w3_married, w3_otherdate, w3_coronavirus_effect_combo, w3_corona_effect_combo_reversed, w3_live_w_partner, w3_same_sex_couple, w3_marriage_dp, w3_mar_dp_why_words, w3_mar_dp_why_len, w3_mar_dp_why_sum, w3_rel_qual, w3_Q23, w3_sex_frequency, w3_weekly_sex_frequency, w3_how_many, w3_how_man

In [12]:
column_averagesOutliers = filtered_rowsOutliersAllCols.median()

print("Average of each column:")
print(column_averagesOutliers)
column_averagesOutliers.to_csv('outliersMedianValues_politicalparty.csv', index=False)

#column_frequenciesOutliers = filtered_rowsOutliersAllCols.apply(pd.Series.value_counts)
#column_frequenciesOutliers.to_csv('outliersValuesFrequencies_politicalparty.csv', index=False)

Average of each column:
caseid_new                NaN
w3_Weight                 NaN
w3_Weight_LGB             NaN
w3_combo_weight           NaN
w3_attrition_adj_weight   NaN
                           ..
p20_pppa1905              NaN
p20_pppa1648              NaN
p20_ppp20072              NaN
p20_ppp20071              NaN
p20_ppp2date2020          NaN
Length: 725, dtype: float64


In [28]:
caseid_408_0steps = np.array([71609, 106983, 164061, 212249, 218351, 369975, 428211, 589881, 608697,
  634833, 643505, 648669, 651349, 654589, 657869, 694739, 705457, 724199,
  726431, 728327, 734689, 774843, 782789, 810569, 812769, 813467, 817367,
  846827, 861113, 882717, 916551, 924575, 957155, 999415,1004329,1011851,
 1033007,1059357,1070983,1073943,1077443,1080635,1088237,1100227,1101051,
 1107709,1115473,1126257,1141793,1151149,1156893,1157187,1161211,1163737,
 1167085,1172171,1177155,1181275,1193135,1218979,1221467,1222679,1225799,
 1230459,1234029,1238075,1258803,1271293,1276051,1284217,1318305,1323365,
 1325417,1349167,1349877,1352199,1371445,1372507,1373785,1374091,1382361,
 1385831,1411245,1420887,1421987,1425299,1430863,1441973,1445543,1449039,
 1466315,1483013,1490477,1492417,1505847,1508733,1518259,1524367,1570485,
 1571537,1575841,1596895,1597821,1621115,1638359,1644835,1658485,1665693,
 1678835,1688409,1689267,1691673,1692763,1707755,1712857,1723297,1727435,
 1736313,1738505,1739387,1740471,1746431,1756667,1757463,1757623,1757785,
 1758551,1762955,1768829,1770495,1774405,1785191,1786077,1786675,1786749,
 1790879,1791149,1791377,1794225,1801065,1806971,1815129,1815571,1823887,
 1824065,1825867,1828001,1831603,1845081,1848983,1851301,1854475,1870403,
 1883037,1885139,1888381,1899449,1899645,1900621,1902067,1902987,1910719,
 1911165,1912917,1914889,1916039,1926353,1930585,1935509,1935873,1942391,
 1943633,1947341,1958229,1958605,1974537,1975919,1979711,1979935,1982563,
 1984819,1988099,1989553,1994249,1996333,2014793,2015057,2018973,2028967,
 2032401,2033003,2043853,2044441,2054225,2054791,2065091,2065295,2072325,
 2074241,2075775,2076645,2089701,2109385,2132357,2142099,2143321,2144859,
 2146521,2150647,2150703,2152283,2155971,2161255,2162891,2163905,2167453,
 2169639,2170187,2174237,2177551,2205751,2207131,2228393,2231079,2232929,
 2237055,2237111,2237691,2243015,2247423,2254741,2259031,2267513,2270949,
 2275209,2283143,2283455,2283991,2287223,2295089,2296453,2296989,2300461,
 2308025,2310793,2312649,2317233,2320991,2325909,2327101,2333153,2336171,
 2344635,2345685,2346013,2346389,2354673,2355501,2356411,2362051,2363917,
 2409947,2411543,2411689,2413081,2413249,2417365,2419259,2419327,2422835,
 2427329,2429297,2430261,2431217,2431691,2436775,2436989,2440127,2441733,
 2443391,2445857,2447595,2449593,2453103,2458389,2458979,2460749,2478473,
 2478601,2483431,2484003,2494315,2530989,2531511,2535467,2535477,2550963,
 2551603,2554257,2555043,2555743,2559479,2564805,2567945,2578951,2583199,
 2587903,2588235,2605123,2606759,2611269,2613043,2630171,2630463,2635321,
 2641193,2644371,2646953,2656603,2664913,2669515,2673921,2679531,2684117,
 2686295,2689423,2689601,2690979,2693307,2699919,2700763,2700937,2702949,
 2705963,2712273,2717509,2718999,2722303,2724951,2729559,2735307,2740129,
 2744495,2749859,2756929,2762099,2763005,2775521,2776317,2777181,2779043,
 2780785,2781391,2785079,2788261,2807193,2808325,2817849,2826185,2826901,
 2838251,2840173,2842301,2842485,2844419,2845847,2849977,2850177,2853477,
 2853837,2854455,2855435,2856247,2856907,2857095,2860697,2866203,2870179,
 2870711,2875975,2876231,2882861,2893053,2898559,2900187,2905009,2905035,
 2908131,2912135,2912503,2914073,2914149,2917523,2923211,2926919,2929765,
 2930211,2930255,2930383,2933033,2934313,2942253,2942743,2943301,2951689,
 2958163,2961203,2964987])

caseid_323_1step = np.array([516823, 632253, 637531, 646157, 650237, 651259, 709075, 727775, 730391,
  731249, 783399, 783679, 811175, 814285, 815831, 854869, 859445, 867567,
  874193, 892115, 900463, 908663, 948417, 959621, 962359, 981251,1000407,
 1001637,1119259,1123417,1129859,1145895,1150579,1168011,1181279,1182387,
 1183381,1196757,1224777,1230191,1234663,1239479,1246677,1246869,1250651,
 1256141,1262965,1267611,1272909,1282581,1283019,1287847,1300409,1301087,
 1344091,1348915,1372419,1374661,1377765,1385783,1394211,1409151,1422915,
 1433383,1445757,1465133,1513043,1513047,1519185,1527623,1548291,1550311,
 1550999,1551703,1572635,1572839,1587197,1593839,1604541,1607815,1626763,
 1630669,1631303,1634959,1638615,1645953,1648699,1677291,1692681,1698683,
 1712517,1718337,1730087,1731569,1761763,1763355,1765905,1768233,1780607,
 1785549,1788973,1791495,1793997,1807189,1814173,1816407,1825937,1831591,
 1834561,1846375,1850929,1851973,1856209,1857323,1858747,1860057,1860737,
 1860835,1869479,1873259,1877643,1879743,1900951,1901639,1903389,1912151,
 1912339,1912879,1914313,1915405,1919359,1926181,1927049,1927273,1929653,
 1938201,1943525,1946509,1956243,1960543,1966321,1969389,1970615,1976539,
 1976605,1976731,1977353,1978599,1978889,1988745,2002441,2014573,2014647,
 2014807,2015377,2017035,2019003,2020911,2021701,2029467,2037837,2053645,
 2071775,2072903,2084743,2086473,2092107,2093615,2102733,2116829,2127023,
 2136629,2141677,2142051,2144713,2145315,2153591,2154415,2154761,2158831,
 2177139,2207753,2211239,2211773,2211779,2212677,2218005,2226649,2239179,
 2247535,2249131,2249677,2255167,2256865,2257097,2264059,2279685,2280829,
 2283987,2296169,2299489,2302735,2306549,2309889,2310821,2312343,2336267,
 2337841,2338241,2339573,2340595,2347263,2352239,2361493,2405443,2409887,
 2417997,2420829,2424537,2425091,2430941,2441833,2442527,2450529,2450675,
 2451027,2452097,2456225,2473663,2483557,2485815,2486709,2489613,2493475,
 2494345,2504621,2506415,2528583,2537029,2560735,2562183,2565397,2573527,
 2582069,2582177,2585983,2587161,2589615,2590055,2590127,2595771,2595819,
 2602801,2613965,2614657,2623489,2629427,2629823,2630805,2638587,2654369,
 2656113,2658325,2658787,2658993,2686395,2686427,2688009,2695851,2699851,
 2702683,2710813,2722059,2729357,2731005,2739675,2746227,2746583,2757001,
 2765841,2772065,2772737,2775795,2777453,2782309,2783229,2785857,2786595,
 2789327,2793267,2807523,2810979,2814059,2816715,2821129,2822251,2829273,
 2829661,2836833,2839127,2839181,2842843,2843515,2849189,2857579,2858443,
 2859539,2867911,2868501,2888089,2893229,2899293,2911331,2922615,2924057,
 2932727,2942729,2943311,2943697,2945353,2946347,2948891,2959699])

caseid_231_2steps = np.array([220655, 643423, 646023, 698117, 704661, 709173, 726183, 729319, 731629,
  755761, 769207, 775045, 784141, 792153, 806423, 821909, 822315, 842437,
  844865, 876181, 985963,1002587,1049013,1097625,1105539,1116971,1127787,
 1130879,1140491,1150713,1179591,1181125,1226941,1243163,1244455,1251293,
 1276181,1311313,1333997,1335167,1336987,1348777,1376491,1407429,1423457,
 1425571,1435195,1437339,1437691,1439945,1473855,1478863,1552387,1568833,
 1570547,1587583,1604059,1631841,1633263,1635641,1656987,1660265,1682355,
 1697675,1726295,1738617,1760107,1761371,1767375,1788911,1794231,1795553,
 1812501,1828311,1844841,1850681,1852051,1861177,1870439,1874783,1884543,
 1898367,1900989,1910803,1913143,1915085,1920851,1920877,1924649,1925505,
 1926463,1928207,1940691,1952879,1954581,1974861,1978449,1980005,1984375,
 2011465,2014211,2016911,2025287,2025455,2052461,2083123,2084273,2086775,
 2090459,2116717,2118341,2125135,2138067,2141839,2146471,2153601,2163859,
 2168137,2179295,2206813,2214763,2216625,2227011,2227745,2228681,2235583,
 2246951,2247021,2255089,2284105,2295955,2296791,2296879,2301575,2302963,
 2305135,2309557,2311713,2314949,2318729,2319615,2328849,2330119,2332637,
 2334663,2338101,2352995,2356133,2401785,2402977,2405521,2408297,2409221,
 2409689,2412127,2430011,2430731,2458037,2461289,2473823,2478365,2478637,
 2483979,2485757,2493771,2504749,2507307,2515925,2519453,2532753,2533723,
 2564393,2576171,2588415,2590343,2591515,2626745,2638829,2641849,2643123,
 2645161,2645965,2647841,2652193,2678879,2679207,2681949,2696891,2713609,
 2714381,2714823,2731849,2733771,2738737,2750953,2754001,2757067,2759363,
 2760291,2761561,2761753,2762767,2763861,2770011,2770541,2776643,2779987,
 2797135,2801077,2804089,2807337,2808641,2809787,2813935,2839815,2845201,
 2846721,2853717,2854889,2854953,2855333,2862779,2866103,2868523,2871939,
 2880739,2882059,2882207,2897303,2926595,2928317])

caseid_77_3steps = np.array([214227, 582849, 587125, 662355, 844189, 855297, 867449, 964713, 981685,
  982323, 983511,1152229,1170887,1193613,1203831,1236545,1269605,1338065,
 1374845,1378293,1418329,1439067,1612001,1679339,1682591,1692049,1718309,
 1734941,1740893,1754559,1761743,1808717,1809325,1835975,1890779,1938115,
 1961301,1966851,2013981,2017641,2027307,2071877,2086637,2111275,2148015,
 2166521,2210355,2227047,2232397,2233103,2271349,2289087,2294693,2423007,
 2440417,2461283,2480385,2555355,2630193,2642425,2649847,2658923,2700049,
 2751707,2765643,2832811,2838801,2845837,2846681,2854677,2893071,2909343,
 2910231,2910943,2917189,2950149,2951535])

caseid_25_4steps = np.array([291177, 621641, 653633, 799683, 894473,1223495,1374925,1673269,1891491,
 1932837,1993993,2171661,2171997,2177193,2293405,2331865,2446617,2658259,
 2759605,2815685,2851445,2866293,2887517,2929957,2961175])

caseid_5_5steps = np.array([497203, 703017,1214063,1253615,1532255,1536983,1760991,1765719,1792531,
 2065339,2125001,2148893,2215215,2219299,2239403,2261673,2265281,2292541,
 2302771,2345215,2687373,2889595])

caseid_9_6steps = np.array([1908725,1991077,2101713,2437991,2634995,2737695,2753941,2774941,2883019])

caseid8_noresp = np.array([1765719,1792531,1891491,1938115,2210355,2302771,2642961,2647841])

conditionCase0Steps = ~df['caseid_new'].isin(caseid_408_0steps)
conditionCase1Step = ~df['caseid_new'].isin(caseid_323_1step)
conditionCase2Steps = ~df['caseid_new'].isin(caseid_231_2steps)
conditionCase3Steps = ~df['caseid_new'].isin(caseid_77_3steps)
condition4Steps = ~df['caseid_new'].isin(caseid_25_4steps)
condition5Steps = ~df['caseid_new'].isin(caseid_5_5steps)
condition6Steps = ~df['caseid_new'].isin(caseid_9_6steps)
conditionNoResp = ~df['caseid_new'].isin(caseid8_noresp)

condition1 = df['w1_section'] == 1 # those who were partenered in 2017
condition2 = df['w2_section'] == 1 # who are still married to the same partner in wave 2
condition3 = df['w2_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 
# (note that we are not looking at those with new partners in w2/w3)
condition4 = df['w3_section'] == 1 # who are still married to the same partner in wave 3
condition5 = df['w3_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2

combined_condition2 = condition1 & (condition2 | condition3) & (condition4 | condition5) & (conditionCase0Steps & conditionCase1Step & conditionCase2Steps & conditionCase3Steps & condition4Steps & 
                          condition5Steps & condition6Steps)

# Filter rows based on the combined condition and print the specified columns
arrNotCaught = df.loc[combined_condition2, ['caseid_new', 'w1_section', 'w2_section', 'w3_section', 'w1_partyid7', 'w1_q12']]

# Print the filtered rows
print(arrNotCaught)
print(arrNotCaught.shape[0])




      caseid_new  w1_section  w2_section  w3_section  w1_partyid7  w1_q12
2412     2642961           1         1.0         1.0          7.0    -1.0
1
