In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Specify the file path
file_path = 'hcmst2017to2022.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Change string numeric values to actual numeric values
for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Display the DataFrame
#print(df.columns)

# Specify the column names you want to print
columns_to_print_section = ['caseid_new', 'w1_section', 'w2_section', 'w3_section', 'w3_partner_source']

# Print the specified columns
print(df[columns_to_print_section].shape)

(3510, 5)


In [41]:
# all the people that were in one relationship throughout all w1->w3
con_w1_partnered = df['w1_section'] == 1 # those who were partenered in 2017
con_w2_married_same = df['w2_section'] == 1 # who are still married to the same partner in wave 2
con_w2_part_same = df['w2_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 
# (note that we are not looking at those with new partners in w2/w3)
con_w3_married_same = df['w3_section'] == 1 # who are still married to the same partner in wave 3
con_w3_partnered_same = df['w3_section'] == 3 # or still partnered with that same partner but not married (but still together) in wave 2 
con_w3_same_part_fromw1 = df['w3_partner_source'] == 1

com_con_same_rel_all_waves = con_w1_partnered & (con_w2_married_same | con_w2_part_same) & ((con_w3_married_same | con_w3_partnered_same) & con_w3_same_part_fromw1)
same_rel_all_waves_df = df.loc[com_con_same_rel_all_waves]
print(len(same_rel_all_waves_df))

1096


In [42]:
# now creating dict 3 for those who had same rel w1->w3
rel_counter = 0
dict_same_rel = {}

df_dict_same_rel = same_rel_all_waves_df.to_dict(orient='records')

for record in df_dict_same_rel:
    dict_same_rel[rel_counter] = {
        'caseid_new': record['caseid_new'], #resp case id
        'w1_ppgender': record['w1_ppgender'],#resp gender
        'w1_same_sex_couple': record['w1_same_sex_couple'], #same sex couple? 0 no, 1 yes,
        'w1_ppage': record['w1_ppage'], #age
        'w1_q9': record['w1_q9'], #partner's age in 2017

        'w1_subject_race': record['w1_subject_race'], #resp race
        'w1_q6b': record['w1_q6b'], #partner's race
                
        'w1_ppeduc': record['w1_ppeduc'], #resp educ level
        'w1_q10': record['w1_q10'], #partner educ level

        'w1_partyid7':record['w1_partyid7'],
        'w1_q12':record['w1_q12'],
        'political_steps': np.abs(record['w1_partyid7']-record['w1_q12']),
        "rel_duration": record['w3_relationship_duration_yrs']
            }
    rel_counter += 1

print(dict_same_rel)
#print(len(dict_same_rel))



{0: {'caseid_new': 71609, 'w1_ppgender': 2, 'w1_same_sex_couple': 0.0, 'w1_ppage': 68, 'w1_q9': 71.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 10.0, 'w1_partyid7': 3.0, 'w1_q12': 3.0, 'political_steps': 0.0, 'rel_duration': 57.4166679382324}, 1: {'caseid_new': 106983, 'w1_ppgender': 1, 'w1_same_sex_couple': 0.0, 'w1_ppage': 39, 'w1_q9': 49.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 11, 'w1_q10': 10.0, 'w1_partyid7': 7.0, 'w1_q12': 7.0, 'political_steps': 0.0, 'rel_duration': 22.3333339691162}, 2: {'caseid_new': 164061, 'w1_ppgender': 1, 'w1_same_sex_couple': 0.0, 'w1_ppage': 59, 'w1_q9': 52.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 12.0, 'w1_partyid7': 2.0, 'w1_q12': 2.0, 'political_steps': 0.0, 'rel_duration': 28.25}, 3: {'caseid_new': 212249, 'w1_ppgender': 2, 'w1_same_sex_couple': 0.0, 'w1_ppage': 55, 'w1_q9': 55.0, 'w1_subject_race': 2.0, 'w1_q6b': 2.0, 'w1_ppeduc': 9, 'w1_q10': 10.0, 'w1_partyid7': 1.0, 'w1_q12': 1.0,

In [43]:
# partner dict 
partner_info = {}

rec_counter = 1096

# 1096 couples
for record in df_dict_same_rel:
    partner_info [rec_counter] = {
        "r_caseid":record["caseid_new"],
        "r_gender": record["w1_ppgender"], # resp gender
        "p_gender": record["w1_q4"], # partner's gender
        
        "r_race": record["w1_ppethm"], # resp race
        "p_race": record["w1_q6b"], #partner's race
        
        "r_age": record["w1_ppage"], # resp age
        "p_age": record["w1_q9"], #partner age in 2017,
        
        "r_edu": record["w1_ppeduc"], # resp education
        "p_edu": record["w1_q10"], # partner education
        
        "r_politic": record["w1_partyid7"], # resp political affiliation
        "p_politic": record["w1_q12"] # partner political affiliation
        
    }
    rec_counter +=1

print(partner_info)

#print(rec_counter)

{1096: {'r_caseid': 71609, 'r_gender': 2, 'p_gender': 1.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 68, 'p_age': 71.0, 'r_edu': 10, 'p_edu': 10.0, 'r_politic': 3.0, 'p_politic': 3.0}, 1097: {'r_caseid': 106983, 'r_gender': 1, 'p_gender': 2.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 39, 'p_age': 49.0, 'r_edu': 11, 'p_edu': 10.0, 'r_politic': 7.0, 'p_politic': 7.0}, 1098: {'r_caseid': 164061, 'r_gender': 1, 'p_gender': 2.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 59, 'p_age': 52.0, 'r_edu': 10, 'p_edu': 12.0, 'r_politic': 2.0, 'p_politic': 2.0}, 1099: {'r_caseid': 212249, 'r_gender': 2, 'p_gender': 1.0, 'r_race': 2, 'p_race': 2.0, 'r_age': 55, 'p_age': 55.0, 'r_edu': 9, 'p_edu': 10.0, 'r_politic': 1.0, 'p_politic': 1.0}, 1100: {'r_caseid': 214227, 'r_gender': 2, 'p_gender': 1.0, 'r_race': 1, 'p_race': 2.0, 'r_age': 73, 'p_age': 79.0, 'r_edu': 9, 'p_edu': 9.0, 'r_politic': 3.0, 'p_politic': 6.0}, 1101: {'r_caseid': 218351, 'r_gender': 1, 'p_gender': 2.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 46, 'p_

In [44]:
# partner dict - same as above but just the key-names differ
partner_info_dict = {}

rec_counter = 1096

# 1096 couples
for record in df_dict_same_rel:
    partner_info_dict [rec_counter] = {
        "caseid_new":record['caseid_new'],
        "w1_ppgender": record["w1_ppgender"], # resp gender
        "w1_q4": record["w1_q4"], # partner's gender

        "w1_ppage": record["w1_ppage"], # resp age
        "w1_q9": record["w1_q9"], #partner age in 2017,
        
        'w1_subject_race': record['w1_subject_race'], # resp race
        "w1_q6b": record["w1_q6b"], #partner's race
        
        "w1_ppeduc": record["w1_ppeduc"], # resp education
        "w1_q10": record["w1_q10"], # partner education
        
        "w1_partyid7": record["w1_partyid7"], # resp political affiliation
        "w1_q12": record["w1_q12"] # partner political affiliation
        
    }
    rec_counter +=1

print(partner_info_dict)

#print(rec_counter)

{1096: {'caseid_new': 71609, 'w1_ppgender': 2, 'w1_q4': 1.0, 'w1_ppage': 68, 'w1_q9': 71.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 10.0, 'w1_partyid7': 3.0, 'w1_q12': 3.0}, 1097: {'caseid_new': 106983, 'w1_ppgender': 1, 'w1_q4': 2.0, 'w1_ppage': 39, 'w1_q9': 49.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 11, 'w1_q10': 10.0, 'w1_partyid7': 7.0, 'w1_q12': 7.0}, 1098: {'caseid_new': 164061, 'w1_ppgender': 1, 'w1_q4': 2.0, 'w1_ppage': 59, 'w1_q9': 52.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 12.0, 'w1_partyid7': 2.0, 'w1_q12': 2.0}, 1099: {'caseid_new': 212249, 'w1_ppgender': 2, 'w1_q4': 1.0, 'w1_ppage': 55, 'w1_q9': 55.0, 'w1_subject_race': 2.0, 'w1_q6b': 2.0, 'w1_ppeduc': 9, 'w1_q10': 10.0, 'w1_partyid7': 1.0, 'w1_q12': 1.0}, 1100: {'caseid_new': 214227, 'w1_ppgender': 2, 'w1_q4': 1.0, 'w1_ppage': 73, 'w1_q9': 79.0, 'w1_subject_race': 1.0, 'w1_q6b': 2.0, 'w1_ppeduc': 9, 'w1_q10': 9.0, 'w1_partyid7': 3.0, 'w1_q12': 6.0}, 110

In [45]:
import json

partner_info_dict_no_nan = partner_info_dict

for record in partner_info_dict_no_nan:
    if np.isnan((partner_info_dict_no_nan[record])[ "w1_subject_race"]):
        (partner_info_dict_no_nan[record])["w1_subject_race"] = -2


def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(item) for item in obj]
    elif isinstance(obj, set):
        return list(obj)
    elif isinstance(obj, tuple):
        return tuple(make_json_serializable(item) for item in obj)
    elif isinstance(obj, (str, int, float, bool, type(None))):
        return obj
    elif hasattr(obj, '__dict__'):
        return make_json_serializable(obj.__dict__)
    else:
        return str(obj)  # Or handle non-serializable types as needed

# Convert dictionary to JSON-serializable format
partner_info_dict_json = make_json_serializable(partner_info_dict_no_nan)

with open("partner_info_dict_json.json", "w") as json_file:
    json.dump(partner_info_dict_json, json_file, indent=4)


In [46]:
#Generalizing what attributes take precedence 
# so when we match individual person to another person, if there is a same score for 2 different attributes, we consider this generic precedence algo 
#also checked from discord stat: 408+323+231

#if you are politically distant from your partner by 0-2, then add 1 to the counter (You are very similar)
political_ideology=0

for record in dict_same_rel:
    each_record = dict_same_rel[record]
    political_steps = each_record['political_steps']
    if (int(political_steps)>=0 and int(political_steps)<=2):
        political_ideology+=1

print(political_ideology)

962


In [47]:
#prefer same ethnicity 

same_ethnicity_number=0

for record in dict_same_rel:
    each_record = dict_same_rel[record]
    resp_eth=each_record['w1_subject_race']
    partner_eth=each_record['w1_q6b']
    if (resp_eth==partner_eth):
        same_ethnicity_number+=1

print(same_ethnicity_number)
        

947


In [48]:
#Age - 791 couples had an avg difference of 5

In [49]:
#doing the same for partner's education being greater than the person

partner_ed_greater_than_or_equal=0

for record in dict_same_rel:
    each_record = dict_same_rel[record]
    person_ed = each_record['w1_ppeduc']
    partner_ed = each_record['w1_q10']
    if (partner_ed>=person_ed):
        partner_ed_greater_than_or_equal+=1

print(partner_ed_greater_than_or_equal)

736


In [50]:
#########################################################################
###############  RANKING ALGO ##########################

In [51]:
#962 couples who lasted through all the waves have an avg age gap of 5 
#since it's a tie-breaker with political_ideology, we will look at relationship quality of couples who rated their partners excellent and good and see 
#what proportion of them prefer politics over age and vice-versa 
# depending on that, we will rank them 

#MOST IMPORTANT CELL - RANKED ALL ATTRIBUTES

married_adults1 = df['w1_partnership_status']==1
partnered_adults1 = df['w1_partnership_status']==2

married_adults_samep2 = df['w2_section']==1
partnered_adults_samep2 = df['w2_section']==3

married_adults_samep3 = df['w3_section']==1
partnered_adults_samep3 = df['w3_section']==3

relationship_quality_excellent = df['w1_q34']==1
relationship_quality_good = df['w1_q34']==2

caseids = ['caseid_new', 'w1_partyid7','w1_q12', 'w1_ppage', 'w1_q9', 'w1_subject_race','w1_q6b', 'w1_ppeduc', 'w1_q10' ]

cond = (married_adults1 | partnered_adults1) & (married_adults_samep2 | partnered_adults_samep2) & (married_adults_samep3 | partnered_adults_samep3) & (relationship_quality_excellent | relationship_quality_good)

relationship_quality_list = df.loc[cond,caseids]

relationship_quality_dict = relationship_quality_list.to_dict(orient='records')

political_ideology=0
for record in relationship_quality_dict:
    if (record['w1_partyid7']>=1 and record['w1_q12']>=1):
        political_steps = np.abs(record['w1_partyid7']-record['w1_q12'])
        if (int(political_steps)>=0 and int(political_steps)<=2):
            political_ideology+=1

age_diff_couples=0
for record in relationship_quality_dict:
    if (record['w1_ppage']>=1 and record['w1_q9']>=1):
        age_diff = np.abs(record['w1_ppage']-record['w1_q9'])
        if (age_diff>=0 and age_diff<=5):
            age_diff_couples+=1

same_ethnicity_number=0
for record in relationship_quality_dict:
    resp_eth=record['w1_subject_race']
    partner_eth=record['w1_q6b']
    if (resp_eth>=1 and partner_eth>=1):
        if (resp_eth==partner_eth):
            same_ethnicity_number+=1

partner_ed_greater_than_or_equal=0
for record in relationship_quality_dict:
    person_ed = record['w1_ppeduc']
    partner_ed = record['w1_q10']
    if (person_ed>=1 and partner_ed>=1):
        if (partner_ed>=person_ed):
            partner_ed_greater_than_or_equal+=1


print("Considering Relationship quality")
print("Number of people who have similar political ideology:",political_ideology)
print("Number of people whose ethnicity is similar:",same_ethnicity_number)
print("Number of people who have avg age difference of 5:",age_diff_couples)
print("Number of people whose partner's education is more than them:", partner_ed_greater_than_or_equal)



Considering Relationship quality
Number of people who have similar political ideology: 906
Number of people whose ethnicity is similar: 896
Number of people who have avg age difference of 5: 791
Number of people whose partner's education is more than them: 699


In [52]:
########################################################################################
##########  SCORING ALGORITHM ######################################################

In [53]:
#functions to calculate scores for individual attributes 

def cal_pol_steps(a,b):
    if (a>=1 and b>=1):
        return np.abs(np.abs(int(a)-int(b))-6)
    else:
        return 0

def cal_age_diff(r_age,p_age):
    score_age=0
    if (r_age>=1 and p_age>=1):
        age_diff = np.abs(r_age-p_age)
        if (age_diff>=2 and age_diff<=5):
            score_age=1
            return score_age
    return score_age
            

def check_eth_same(a,b):
    same_eth=0
    if (a>=1 and b>=1):
        if (a==b):
            same_eth=1
            return same_eth
    return same_eth
    
def check_p_ed_greater(r_edu,p_edu):
    if (r_edu>=1 and p_edu>=1):    
        score_edu = np.abs(np.abs(r_edu-p_edu)-13)
        return score_edu
    else:
        return 0

In [54]:
#scoring existing resp-partner relationship
#individual scores for each attribute 

score_resp_partner={}

for record in partner_info:
    p_info=partner_info[record]

    #political steps difference -6 (6 is the max diff that exists in politics)
    score_political_steps=cal_pol_steps( int(p_info['r_politic']),int(p_info['p_politic']) )

    #if races are the same, assign a binary value of 1(true) or 0(false)
    score_race=check_eth_same(int(p_info['r_race']) , int(p_info['p_race']))

    #education difference - 13 (13 is the max difference that exists in the educational level)
    score_edu=check_p_ed_greater(int(p_info['r_edu']) , int(p_info['p_edu']))

    #age difference (if age gap within 5, binary value of 1. If not, 0)
    score_age=cal_age_diff(int(p_info['r_age']) , int(p_info['p_age']))

    total_score = score_political_steps+score_race+score_edu+score_age
    
    score_resp_partner[p_info['r_caseid']]={
        "p_id":record,
        "score_political_steps":score_political_steps,
        "score_race":score_race,
        "score_age":score_age,
        "score_edu":score_edu,
        "total_score":total_score

    }
print(score_resp_partner)

#check if any value exceeds the max range set
counter=0
for record in score_resp_partner.values():
    if (record['score_edu']>13):
        counter+=1
#print(counter)

    


{71609: {'p_id': 1096, 'score_political_steps': 6, 'score_race': 1, 'score_age': 1, 'score_edu': 13, 'total_score': 21}, 106983: {'p_id': 1097, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 19}, 164061: {'p_id': 1098, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 11, 'total_score': 18}, 212249: {'p_id': 1099, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 19}, 214227: {'p_id': 1100, 'score_political_steps': 3, 'score_race': 0, 'score_age': 0, 'score_edu': 13, 'total_score': 16}, 218351: {'p_id': 1101, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 19}, 220655: {'p_id': 1102, 'score_political_steps': 4, 'score_race': 0, 'score_age': 0, 'score_edu': 11, 'total_score': 15}, 291177: {'p_id': 1103, 'score_political_steps': 2, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 15}, 369975: {'p_id': 1104, 'score_po

In [55]:
#total score for each resp-partner relationship (ORIGINAL GRAPH)
#score= political_steps+race+education

total_score_rp = {}

for record in score_resp_partner:
    p_info=score_resp_partner[record]
    total_score = p_info['score_political_steps']+p_info['score_race']+p_info['score_edu']+p_info['score_age']

    total_score_rp[record]={
        "p_id":p_info['p_id'],
        "total_score":total_score
    }

print(total_score_rp)
counter=0
for key, values in total_score_rp.items():
    total_score=total_score_rp[key]['total_score']
    if (total_score>=21):
        counter+=1
print(counter)

{71609: {'p_id': 1096, 'total_score': 21}, 106983: {'p_id': 1097, 'total_score': 19}, 164061: {'p_id': 1098, 'total_score': 18}, 212249: {'p_id': 1099, 'total_score': 19}, 214227: {'p_id': 1100, 'total_score': 16}, 218351: {'p_id': 1101, 'total_score': 19}, 220655: {'p_id': 1102, 'total_score': 15}, 291177: {'p_id': 1103, 'total_score': 15}, 369975: {'p_id': 1104, 'total_score': 19}, 428211: {'p_id': 1105, 'total_score': 17}, 497203: {'p_id': 1106, 'total_score': 14}, 516823: {'p_id': 1107, 'total_score': 20}, 582849: {'p_id': 1108, 'total_score': 18}, 587125: {'p_id': 1109, 'total_score': 17}, 589881: {'p_id': 1110, 'total_score': 19}, 608697: {'p_id': 1111, 'total_score': 20}, 621641: {'p_id': 1112, 'total_score': 17}, 632253: {'p_id': 1113, 'total_score': 17}, 634833: {'p_id': 1114, 'total_score': 18}, 637531: {'p_id': 1115, 'total_score': 17}, 643423: {'p_id': 1116, 'total_score': 19}, 643505: {'p_id': 1117, 'total_score': 18}, 646023: {'p_id': 1118, 'total_score': 19}, 646157: {'p

In [56]:
score_existing_partners ={}  #where existing partner is the key #original graph

for record in score_resp_partner:
    partner_info=score_resp_partner[record]
    score_existing_partners[partner_info['p_id']]={
        'r_id':record,
        'score_political_steps':partner_info['score_political_steps'],
        'score_race':partner_info['score_race'],
        'score_age':partner_info['score_age'],
        'score_edu':partner_info['score_edu'],
        'total_score':partner_info['total_score']
    }

print(score_existing_partners)
#print(len(score_existing_partners))

{1096: {'r_id': 71609, 'score_political_steps': 6, 'score_race': 1, 'score_age': 1, 'score_edu': 13, 'total_score': 21}, 1097: {'r_id': 106983, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 19}, 1098: {'r_id': 164061, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 11, 'total_score': 18}, 1099: {'r_id': 212249, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 19}, 1100: {'r_id': 214227, 'score_political_steps': 3, 'score_race': 0, 'score_age': 0, 'score_edu': 13, 'total_score': 16}, 1101: {'r_id': 218351, 'score_political_steps': 6, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 19}, 1102: {'r_id': 220655, 'score_political_steps': 4, 'score_race': 0, 'score_age': 0, 'score_edu': 11, 'total_score': 15}, 1103: {'r_id': 291177, 'score_political_steps': 2, 'score_race': 1, 'score_age': 0, 'score_edu': 12, 'total_score': 15}, 1104: {'r_id': 369975, 'score_po

In [57]:
# Convert dictionary to JSON-serializable format
score_resp_partner_json = make_json_serializable(score_resp_partner)

with open("score_resp_partner_json.json", "w") as json_file:
    json.dump(score_resp_partner_json, json_file, indent=4)


In [58]:
# Convert dictionary to JSON-serializable format
total_score_rp_json = make_json_serializable(total_score_rp)

with open("total_score_rp_json.json", "w") as json_file:
    json.dump(total_score_rp_json, json_file, indent=4)
