In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Location of the survey export, resolved relative to the notebook's working directory.
file_path = 'datingdataset.csv'

# Load the CSV and force every column to a numeric dtype in one pass.
# errors='coerce' replaces any cell that cannot be parsed as a number with NaN.
df = pd.read_csv(file_path).apply(pd.to_numeric, errors='coerce')

# Identifier plus the per-wave section / partner-source columns that the
# filtering cells below rely on.
columns_to_print_section = ['caseid_new', 'w1_section', 'w2_section', 'w3_section', 'w3_partner_source']

# Quick sanity check: (number of rows, number of selected columns).
print(df.loc[:, columns_to_print_section].shape)

(3510, 5)


In [21]:
# Respondents who stayed in one and the same relationship from wave 1 through wave 3.
# Anyone who reports a new partner in wave 2 or wave 3 is deliberately left out.

# Wave 1: partnered in 2017.
con_w1_partnered = df['w1_section'].eq(1)

# Wave 2: still married to the same partner (1), or still together with the
# same partner without being married (3).
con_w2_married_same = df['w2_section'].eq(1)
con_w2_part_same = df['w2_section'].eq(3)

# Wave 3: same two codes, now read from the wave-3 section column.
con_w3_married_same = df['w3_section'].eq(1)
con_w3_partnered_same = df['w3_section'].eq(3)

# Wave 3: the current partner is the one carried over from wave 1.
con_w3_same_part_fromw1 = df['w3_partner_source'].eq(1)

# All three waves must agree for a row to be kept.
still_together_w2 = con_w2_married_same | con_w2_part_same
still_together_w3 = con_w3_married_same | con_w3_partnered_same
com_con_same_rel_all_waves = (
    con_w1_partnered
    & still_together_w2
    & (still_together_w3 & con_w3_same_part_fromw1)
)

same_rel_all_waves_df = df[com_con_same_rel_all_waves]
print(same_rel_all_waves_df.shape[0])

1096


In [5]:
# Dict 3: one entry per couple that kept the same relationship from wave 1 to wave 3,
# keyed by a running integer starting at 0.

# Columns copied verbatim from each row, in the order they appear in every entry.
same_rel_fields = [
    'caseid_new',          # respondent case id
    'w1_ppgender',         # respondent gender
    'w1_same_sex_couple',  # same-sex couple? 0 = no, 1 = yes
    'w1_ppage',            # respondent age
    'w1_q9',               # partner's age in 2017
    'w1_subject_race',     # respondent race
    'w1_q6b',              # partner's race
    'w1_ppeduc',           # respondent education level
    'w1_q10',              # partner education level
    'w1_partyid7',         # respondent political affiliation
    'w1_q12',              # partner political affiliation
]

df_dict_same_rel = same_rel_all_waves_df.to_dict(orient='records')

dict_same_rel = {}
for rel_counter, record in enumerate(df_dict_same_rel):
    entry = {field: record[field] for field in same_rel_fields}
    # Absolute distance between the two partners on the party-id scale.
    entry['political_steps'] = np.abs(record['w1_partyid7'] - record['w1_q12'])
    entry["rel_duration"] = record['w3_relationship_duration_yrs']
    dict_same_rel[rel_counter] = entry

# Leave the counter at "number of entries written", as a running counter would.
rel_counter = len(dict_same_rel)

print(dict_same_rel)



{0: {'caseid_new': 71609, 'w1_ppgender': 2, 'w1_same_sex_couple': 0.0, 'w1_ppage': 68, 'w1_q9': 71.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 10.0, 'w1_partyid7': 3.0, 'w1_q12': 3.0, 'political_steps': 0.0, 'rel_duration': 57.4166679382324}, 1: {'caseid_new': 106983, 'w1_ppgender': 1, 'w1_same_sex_couple': 0.0, 'w1_ppage': 39, 'w1_q9': 49.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 11, 'w1_q10': 10.0, 'w1_partyid7': 7.0, 'w1_q12': 7.0, 'political_steps': 0.0, 'rel_duration': 22.3333339691162}, 2: {'caseid_new': 164061, 'w1_ppgender': 1, 'w1_same_sex_couple': 0.0, 'w1_ppage': 59, 'w1_q9': 52.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 12.0, 'w1_partyid7': 2.0, 'w1_q12': 2.0, 'political_steps': 0.0, 'rel_duration': 28.25}, 3: {'caseid_new': 212249, 'w1_ppgender': 2, 'w1_same_sex_couple': 0.0, 'w1_ppage': 55, 'w1_q9': 55.0, 'w1_subject_race': 2.0, 'w1_q6b': 2.0, 'w1_ppeduc': 9, 'w1_q10': 10.0, 'w1_partyid7': 1.0, 'w1_q12': 1.0,

In [6]:
# Partner dict: one entry per couple with respondent (r_*) and partner (p_*)
# attributes side by side.
partner_info = {}

# Keys start right after the respondent records (one per couple) so the two id
# ranges never overlap. Deriving the offset from the data instead of hard-coding
# 1096 keeps the numbering contiguous if the upstream filter ever changes.
rec_counter = len(df_dict_same_rel)

for record in df_dict_same_rel:
    partner_info[rec_counter] = {
        "r_caseid": record["caseid_new"],
        "r_gender": record["w1_ppgender"],  # resp gender
        "p_gender": record["w1_q4"],  # partner's gender

        # NOTE(review): respondent race is read from w1_ppethm here, while the
        # sibling partner_info_dict cell reads w1_subject_race - confirm which
        # column is intended.
        "r_race": record["w1_ppethm"],  # resp race
        "p_race": record["w1_q6b"],  # partner's race

        "r_age": record["w1_ppage"],  # resp age
        "p_age": record["w1_q9"],  # partner age in 2017

        "r_edu": record["w1_ppeduc"],  # resp education
        "p_edu": record["w1_q10"],  # partner education

        "r_politic": record["w1_partyid7"],  # resp political affiliation
        "p_politic": record["w1_q12"],  # partner political affiliation
    }
    rec_counter += 1

print(partner_info)

{1096: {'r_caseid': 71609, 'r_gender': 2, 'p_gender': 1.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 68, 'p_age': 71.0, 'r_edu': 10, 'p_edu': 10.0, 'r_politic': 3.0, 'p_politic': 3.0}, 1097: {'r_caseid': 106983, 'r_gender': 1, 'p_gender': 2.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 39, 'p_age': 49.0, 'r_edu': 11, 'p_edu': 10.0, 'r_politic': 7.0, 'p_politic': 7.0}, 1098: {'r_caseid': 164061, 'r_gender': 1, 'p_gender': 2.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 59, 'p_age': 52.0, 'r_edu': 10, 'p_edu': 12.0, 'r_politic': 2.0, 'p_politic': 2.0}, 1099: {'r_caseid': 212249, 'r_gender': 2, 'p_gender': 1.0, 'r_race': 2, 'p_race': 2.0, 'r_age': 55, 'p_age': 55.0, 'r_edu': 9, 'p_edu': 10.0, 'r_politic': 1.0, 'p_politic': 1.0}, 1100: {'r_caseid': 214227, 'r_gender': 2, 'p_gender': 1.0, 'r_race': 1, 'p_race': 2.0, 'r_age': 73, 'p_age': 79.0, 'r_edu': 9, 'p_edu': 9.0, 'r_politic': 3.0, 'p_politic': 6.0}, 1101: {'r_caseid': 218351, 'r_gender': 1, 'p_gender': 2.0, 'r_race': 1, 'p_race': 1.0, 'r_age': 46, 'p_

In [7]:
# Partner dict keyed like partner_info, but the entries keep the original survey
# column names. Note that respondent race comes from w1_subject_race here
# (partner_info reads w1_ppethm), so the two dicts differ in more than key names.
partner_info_dict = {}

# Keys start right after the respondent records (one per couple). The offset is
# derived from the data rather than hard-coded as 1096 so it stays correct if
# the upstream filter changes.
rec_counter = len(df_dict_same_rel)

for record in df_dict_same_rel:
    partner_info_dict[rec_counter] = {
        "caseid_new": record['caseid_new'],
        "w1_ppgender": record["w1_ppgender"],  # resp gender
        "w1_q4": record["w1_q4"],  # partner's gender

        "w1_ppage": record["w1_ppage"],  # resp age
        "w1_q9": record["w1_q9"],  # partner age in 2017

        'w1_subject_race': record['w1_subject_race'],  # resp race
        "w1_q6b": record["w1_q6b"],  # partner's race

        "w1_ppeduc": record["w1_ppeduc"],  # resp education
        "w1_q10": record["w1_q10"],  # partner education

        "w1_partyid7": record["w1_partyid7"],  # resp political affiliation
        "w1_q12": record["w1_q12"],  # partner political affiliation
    }
    rec_counter += 1

print(partner_info_dict)

{1096: {'caseid_new': 71609, 'w1_ppgender': 2, 'w1_q4': 1.0, 'w1_ppage': 68, 'w1_q9': 71.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 10.0, 'w1_partyid7': 3.0, 'w1_q12': 3.0}, 1097: {'caseid_new': 106983, 'w1_ppgender': 1, 'w1_q4': 2.0, 'w1_ppage': 39, 'w1_q9': 49.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 11, 'w1_q10': 10.0, 'w1_partyid7': 7.0, 'w1_q12': 7.0}, 1098: {'caseid_new': 164061, 'w1_ppgender': 1, 'w1_q4': 2.0, 'w1_ppage': 59, 'w1_q9': 52.0, 'w1_subject_race': 1.0, 'w1_q6b': 1.0, 'w1_ppeduc': 10, 'w1_q10': 12.0, 'w1_partyid7': 2.0, 'w1_q12': 2.0}, 1099: {'caseid_new': 212249, 'w1_ppgender': 2, 'w1_q4': 1.0, 'w1_ppage': 55, 'w1_q9': 55.0, 'w1_subject_race': 2.0, 'w1_q6b': 2.0, 'w1_ppeduc': 9, 'w1_q10': 10.0, 'w1_partyid7': 1.0, 'w1_q12': 1.0}, 1100: {'caseid_new': 214227, 'w1_ppgender': 2, 'w1_q4': 1.0, 'w1_ppage': 73, 'w1_q9': 79.0, 'w1_subject_race': 1.0, 'w1_q6b': 2.0, 'w1_ppeduc': 9, 'w1_q10': 9.0, 'w1_partyid7': 3.0, 'w1_q12': 6.0}, 110

In [8]:
# Generalizing which attributes take precedence.
# When matching one person to another, two attributes may end up with the same
# score; this generic precedence ranking is used to break such ties.
# Also checked against the discord stat: 408+323+231

# A couple counts as politically similar when the partners are 0-2 steps apart
# on the party-id scale.
political_ideology = 0

for each_record in dict_same_rel.values():
    resp_party = each_record['w1_partyid7']
    partner_party = each_record['w1_q12']

    # Only compare couples where both party codes are >= 1, the same guard the
    # relationship-quality cell applies before measuring the gap. Without it a
    # code below 1 can land within two steps of a low real code and be counted
    # as "similar", and a NaN gap would make int() raise. The result can
    # therefore be lower than the previously recorded unguarded count.
    if not (resp_party >= 1 and partner_party >= 1):
        continue

    if 0 <= int(each_record['political_steps']) <= 2:
        political_ideology += 1

print(political_ideology)

962


In [9]:
# Prefer same ethnicity: count couples whose respondent race code equals the
# partner race code.
# NOTE(review): unlike the relationship-quality cell, this count does not first
# require both codes to be >= 1 - confirm whether non-answer codes can match here.
same_ethnicity_number = sum(
    1
    for couple in dict_same_rel.values()
    if couple['w1_subject_race'] == couple['w1_q6b']
)

print(same_ethnicity_number)
        

947


In [None]:
# Age - 791 couples had an age difference of at most 5 years

In [182]:
# Same idea for education: count couples where the partner's education level
# is greater than or equal to the respondent's.
partner_ed_greater_than_or_equal = 0

for couple in dict_same_rel.values():
    # A NaN on either side makes the comparison False, so such couples are skipped.
    if couple['w1_q10'] >= couple['w1_ppeduc']:
        partner_ed_greater_than_or_equal += 1

print(partner_ed_greater_than_or_equal)

736


In [10]:
# MOST IMPORTANT CELL - RANKED ALL ATTRIBUTES
#
# Political similarity and age gap need a tie-breaker, so this cell repeats the
# four similarity counts on couples who stayed together through all waves AND
# rated their relationship excellent or good. The attributes are then ranked by
# how many of those couples satisfy each one.
#
# NOTE(review): wave-1 couples are selected via w1_partnership_status here and
# w3_partner_source is not checked, unlike the earlier same-relationship filter
# - confirm the two selections are meant to differ.

married_adults1 = df['w1_partnership_status'] == 1
partnered_adults1 = df['w1_partnership_status'] == 2

married_adults_samep2 = df['w2_section'] == 1
partnered_adults_samep2 = df['w2_section'] == 3

married_adults_samep3 = df['w3_section'] == 1
partnered_adults_samep3 = df['w3_section'] == 3

relationship_quality_excellent = df['w1_q34'] == 1
relationship_quality_good = df['w1_q34'] == 2

caseids = ['caseid_new', 'w1_partyid7', 'w1_q12', 'w1_ppage', 'w1_q9', 'w1_subject_race', 'w1_q6b', 'w1_ppeduc', 'w1_q10']

cond = (
    (married_adults1 | partnered_adults1)
    & (married_adults_samep2 | partnered_adults_samep2)
    & (married_adults_samep3 | partnered_adults_samep3)
    & (relationship_quality_excellent | relationship_quality_good)
)

relationship_quality_list = df.loc[cond, caseids]
relationship_quality_dict = relationship_quality_list.to_dict(orient='records')


def _both_answered(first, second):
    """Return True when both codes are >= 1; NaN fails the comparison."""
    return first >= 1 and second >= 1


political_ideology = 0
age_diff_couples = 0
same_ethnicity_number = 0
partner_ed_greater_than_or_equal = 0

# One pass over the selected couples; each attribute is only evaluated when
# both partners' codes for that attribute pass the >= 1 guard.
for record in relationship_quality_dict:
    # Politics: at most 2 steps apart on the party-id scale.
    resp_party, partner_party = record['w1_partyid7'], record['w1_q12']
    if _both_answered(resp_party, partner_party):
        if int(np.abs(resp_party - partner_party)) <= 2:
            political_ideology += 1

    # Age: gap of at most 5 years.
    resp_age, partner_age = record['w1_ppage'], record['w1_q9']
    if _both_answered(resp_age, partner_age):
        if np.abs(resp_age - partner_age) <= 5:
            age_diff_couples += 1

    # Ethnicity: identical race codes.
    resp_eth, partner_eth = record['w1_subject_race'], record['w1_q6b']
    if _both_answered(resp_eth, partner_eth):
        if resp_eth == partner_eth:
            same_ethnicity_number += 1

    # Education: levels at most 1 apart in either direction. The variable name
    # is historical; this is a similarity count, not "partner >= respondent".
    person_ed, partner_ed = record['w1_ppeduc'], record['w1_q10']
    if _both_answered(person_ed, partner_ed):
        if np.abs(partner_ed - person_ed) <= 1:
            partner_ed_greater_than_or_equal += 1


# The age and education labels describe what is actually counted above.
print("Considering Relationship quality")
print("Number of people who have similar political ideology:", political_ideology)
print("Number of people whose ethnicity is similar:", same_ethnicity_number)
print("Number of people whose age difference is at most 5 years:", age_diff_couples)
print("Number of people whose education is within 1 level of their partner's:", partner_ed_greater_than_or_equal)



Considering Relationship quality
Number of people who have similar political ideology: 906
Number of people whose ethnicity is similar: 896
Number of people who have avg age difference of 5: 791
Number of people whose partner's education is more than them: 701
