In [6]:
import pandas as pd

file = "datingdataset.csv"

# Read the whole CSV in one pass (low_memory=False) so each column gets a single inferred dtype.
df = pd.read_csv(file, low_memory=False)

# Rebuild the frame with every column coerced to a numeric dtype;
# any value that cannot be parsed as a number becomes NaN (errors='coerce').
df = pd.DataFrame(
    {name: pd.to_numeric(df[name], errors='coerce') for name in df.columns},
    index=df.index,
)



In [7]:
# Wave 1: boolean masks for partnership-status codes 1 and 2.
# NOTE(review): the variable names suggest 1 = married and 2 = partnered — confirm against the codebook.
w1_status = df['w1_partnership_status']
married_adults1 = w1_status.eq(1)
partnered_adults1 = w1_status.eq(2)

# Wave 2: section codes 1 and 3 (named here as married / partnered with the same partner).
w2_section = df['w2_section']
married_adults_samep2 = w2_section.eq(1)
partnered_adults_samep2 = w2_section.eq(3)

# Wave 3: same section codes as wave 2.
w3_section = df['w3_section']
married_adults_samep3 = w3_section.eq(1)
partnered_adults_samep3 = w3_section.eq(3)

# Columns to display for the selected rows.
caseids = ['caseid_new', 'w3_ppage', 'w1_q9']

# A row qualifies only if it matches one of the two codes in every wave.
in_couple_w1 = married_adults1 | partnered_adults1
same_partner_w2 = married_adults_samep2 | partnered_adults_samep2
same_partner_w3 = married_adults_samep3 | partnered_adults_samep3
cond = in_couple_w1 & same_partner_w2 & same_partner_w3

records = df.loc[cond, caseids]
print(records)

print("Number of married/partnered adults:", len(records))


      caseid_new  w3_ppage  w1_q9
1          71609      72.0   71.0
2         106983      43.0   49.0
5         164061      64.0   52.0
7         212249      60.0   55.0
8         214227      78.0   79.0
...          ...       ...    ...
3438     2958163      54.0   49.0
3448     2959699      70.0   52.0
3459     2961175      54.0   58.0
3460     2961203      72.0   65.0
3486     2964987      29.0   28.0

[1096 rows x 3 columns]
Number of married/partnered adults: 1096


In [8]:
# Rows of df selected by the all-waves boolean mask `cond`; the functions below read this frame.
stayed_together_df = df.loc[cond]

def count_couples_by_age_range(age_start, age_end):
    """Count rows of ``stayed_together_df`` whose ``w3_ppage`` lies in [age_start, age_end].

    Both bounds are inclusive. Rows with a missing ``w3_ppage`` never match.
    Reads the module-level ``stayed_together_df``.
    """
    in_band = stayed_together_df['w3_ppage'].between(age_start, age_end)
    # Each True in the mask is one matching row.
    return int(in_band.sum())

def calculate_average_age_difference(age_start, age_end, years_between_waves=5):
    """Mean absolute age gap for rows whose ``w3_ppage`` lies in [age_start, age_end].

    The partner's wave-3 age is estimated as ``w1_q9 + years_between_waves``
    and compared with ``w3_ppage``. Reads the module-level ``stayed_together_df``.

    Parameters
    ----------
    age_start, age_end : number
        Inclusive bounds applied to ``w3_ppage``.
    years_between_waves : number, default 5
        Offset added to ``w1_q9`` to estimate the partner's age in wave 3.
        NOTE(review): presumably the gap between wave 1 and wave 3 — confirm.

    Returns
    -------
    float or None
        The mean absolute difference rounded to one decimal, or ``None`` when
        the band has no rows with a computable difference (no rows at all, or
        every row is missing one of the two ages).
    """
    respondent_age = stayed_together_df['w3_ppage']
    in_band = stayed_together_df[(respondent_age >= age_start) & (respondent_age <= age_end)]

    # Absolute value so the sign of the gap does not cancel out in the mean.
    partner_age_w3 = in_band['w1_q9'] + years_between_waves
    age_difference = (in_band['w3_ppage'] - partner_age_w3).abs().dropna()

    # Without this guard an all-NaN band would yield NaN, which callers testing
    # `is not None` would report as a real average.
    if age_difference.empty:
        return None

    return round(age_difference.mean(), 1)

# Headline figure: size of the all-waves subset
print(f"Total number of couples that stayed together through all waves: {len(stayed_together_df)}")

# For each age band (inclusive bounds on w3_ppage) report the couple count, then the mean age gap
age_ranges = [(18, 29), (30, 39), (40, 49), (50, 59), (60, 69), (70, 79), (80, 100)]
for age_start, age_end in age_ranges:
    count_age_filtered = count_couples_by_age_range(age_start, age_end)
    print(f'Number of couples that stayed together through all waves between {age_start}-{age_end} years old: {count_age_filtered}')

    average_age_difference = calculate_average_age_difference(age_start, age_end)
    if average_age_difference is None:
        # The helper signals "nothing to average" with None
        print(f'Not enough data to calculate average age difference for age range {age_start}-{age_end}')
        continue
    print(f'Average age difference (in years) for couples that stayed together all waves between {age_start}-{age_end} years old: {average_age_difference}')


Total number of couples that stayed together through all waves: 1096
Number of couples that stayed together through all waves between 18-29 years old: 22
Average age difference (in years) for couples that stayed together all waves between 18-29 years old: 4.0
Number of couples that stayed together through all waves between 30-39 years old: 134
Average age difference (in years) for couples that stayed together all waves between 30-39 years old: 2.7
Number of couples that stayed together through all waves between 40-49 years old: 171
Average age difference (in years) for couples that stayed together all waves between 40-49 years old: 4.7
Number of couples that stayed together through all waves between 50-59 years old: 224
Average age difference (in years) for couples that stayed together all waves between 50-59 years old: 4.6
Number of couples that stayed together through all waves between 60-69 years old: 308
Average age difference (in years) for couples that stayed together all waves b

In [9]:
# Estimate each partner's wave-3 age as w1_q9 + 5, then take the absolute gap to w3_ppage.
# NOTE(review): the +5 is presumably the number of years between wave 1 and wave 3 — confirm.
partner_age_w3 = df['w1_q9'] + 5
age_difference = (df['w3_ppage'] - partner_age_w3).abs()

# assign() replaces same-named columns instead of appending new ones, so re-running
# this cell does not leave duplicate 'partner_age_w3' / 'age_difference' columns in df.
df = df.assign(partner_age_w3=partner_age_w3, age_difference=age_difference)

# Columns to display for the rows selected by the all-waves mask `cond`.
caseids = ['caseid_new', 'w3_ppage', 'partner_age_w3', 'age_difference']

records = df.loc[cond, caseids]
print(records)

      caseid_new  w3_ppage  partner_age_w3  age_difference
1          71609      72.0            76.0             4.0
2         106983      43.0            54.0            11.0
5         164061      64.0            57.0             7.0
7         212249      60.0            60.0             0.0
8         214227      78.0            84.0             6.0
...          ...       ...             ...             ...
3438     2958163      54.0            54.0             0.0
3448     2959699      70.0            57.0            13.0
3459     2961175      54.0            63.0             9.0
3460     2961203      72.0            70.0             2.0
3486     2964987      29.0            33.0             4.0

[1096 rows x 4 columns]
