In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing

# DataFrame
df = pd.read_csv('cleaned_cupid.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,offspring,pets,religion,smokes,speaks,preference
0,0,22,single,m,straight,curvy,anything,socially,never,college/university,mixed,75.0,unspecified,Transportation & Military,"Does Not Have a Kid, Does Not Want Any More Kids",Likes Both,agnosticism,sometimes,english,adventurous
1,1,35,single,m,straight,average,other,often,sometimes,space camp,white,70.0,avg,Service & Hospitality,"Does Not Have a Kid, Does Not Want Any More Kids",Likes Both,agnosticism,no,spanish,adventurous
2,2,38,available,m,straight,thin,anything,socially,,masters program,,68.0,unspecified,,,Likes Only Cats,,no,others,adventurous
3,3,23,single,m,straight,thin,vegetarian,socially,,college/university,white,71.0,below_25,Student,"Does Not Have a Kid, Does Not Want Any More Kids",Likes Only Cats,,no,german,academic
4,4,29,single,m,straight,athletic,,socially,never,college/university,mixed,66.0,unspecified,Creative & Media,,Likes Both,,no,english,adventurous


# Statistical test Ethnicity (white or other) vs. offspring

In [2]:
from scipy.stats import chi2_contingency

# Step 1: Simplify ethnicity into 'white' and 'other'
df['ethnicity_simplified'] = df['ethnicity'].apply(lambda x: 'other' if x != 'white' else 'white')

# Step 2: Filter offspring categories
offspring_categories = ["Does Not Have a Kid, Does Not Want Any More Kids", "Does Not Have a Kid, Want Kids"]
filtered_df = df[df['offspring'].isin(offspring_categories)]

# Prepare the contingency table for the chi-square test
contingency_table = pd.crosstab(filtered_df['ethnicity_simplified'], filtered_df['offspring'])

# Step 3: Perform the chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

chi2, p, dof, expected, contingency_table


(48.70859093767093,
 2.9696276539729606e-12,
 1,
 array([[3407.23517554, 1705.76482446],
        [4526.76482446, 2266.23517554]]),
 offspring             Does Not Have a Kid, Does Not Want Any More Kids  \
 ethnicity_simplified                                                     
 other                                                             3229   
 white                                                             4705   
 
 offspring             Does Not Have a Kid, Want Kids  
 ethnicity_simplified                                  
 other                                           1884  
 white                                           2088  )

The chi-square test of independence between ethnicity (simplified into "white" and "other") and offspring category (specifically, "Does Not Have a Kid, Does Not Want Any More Kids" and "Does Not Have a Kid, Want Kids") resulted in a chi-square statistic of approximately 48.71, with a p-value of approximately 2.97×10−12 and 1 degree of freedom.

Given the very small p-value, we reject the null hypothesis of independence. This suggests that there is a significant association between the simplified ethnicity categories and the specified offspring categories. The expected frequencies, compared to the observed frequencies, indicate where the differences lie.


# Statistical test Men and Female (aged 27-35) vs. offspring

In [3]:
# Filter the data for age 27-35 and sex (male or female)
age_filtered_df = df[(df['age'] >= 27) & (df['age'] <= 35) & (df['sex'].isin(['m', 'f']))]

# Filter offspring categories as before
offspring_filtered_df = age_filtered_df[age_filtered_df['offspring'].isin(offspring_categories)]

# Prepare the contingency table for the chi-square test
contingency_table_age_sex = pd.crosstab(offspring_filtered_df['sex'], offspring_filtered_df['offspring'])

# Perform the chi-square test
chi2_age_sex, p_age_sex, dof_age_sex, expected_age_sex = chi2_contingency(contingency_table_age_sex)

chi2_age_sex, p_age_sex, dof_age_sex, expected_age_sex, contingency_table_age_sex


(153.60066183231143,
 2.831397035176791e-35,
 1,
 array([[1376.04679708,  980.95320292],
        [1667.95320292, 1189.04679708]]),
 offspring  Does Not Have a Kid, Does Not Want Any More Kids  \
 sex                                                           
 f                                                      1156   
 m                                                      1888   
 
 offspring  Does Not Have a Kid, Want Kids  
 sex                                        
 f                                    1201  
 m                                     969  )

The chi-square test of independence between gender (males vs. females within the age range of 27-35) and offspring category (specifically, "Does Not Have a Kid, Does Not Want Any More Kids" and "Does Not Have a Kid, Want Kids") resulted in a chi-square statistic of approximately 153.60, with a p-value of approximately 2.83×10−35 and 1 degree of freedom.

With this very small p-value, we reject the null hypothesis of independence, indicating a significant association between gender within the specified age range and the specified offspring categories. The expected frequencies, compared to the observed frequencies, highlight the differences in preferences or situations regarding offspring.


# Statistical test Religion vs. offspring

In [4]:
# Filter offspring categories as before (no need to filter again, using offspring_filtered_df)
# Prepare the contingency table for the chi-square test between religion and offspring categories
contingency_table_religion = pd.crosstab(offspring_filtered_df['religion'], offspring_filtered_df['offspring'])

# Perform the chi-square test
chi2_religion, p_religion, dof_religion, expected_religion = chi2_contingency(contingency_table_religion)

chi2_religion, p_religion, dof_religion, expected_religion, contingency_table_religion.head()


(370.7478629358021,
 3.358662501565251e-75,
 8,
 array([[570.71613723, 381.28386277],
        [498.17763659, 332.82236341],
        [ 97.11766201,  64.88233799],
        [285.95756036, 191.04243964],
        [331.51893266, 221.48106734],
        [ 23.38017789,  15.61982211],
        [  7.19390089,   4.80609911],
        [149.8729352 , 100.1270648 ],
        [395.06505718, 263.93494282]]),
 offspring     Does Not Have a Kid, Does Not Want Any More Kids  \
 religion                                                         
 agnosticism                                                628   
 atheism                                                    651   
 buddhism                                                    96   
 catholicism                                                185   
 christianity                                               222   
 
 offspring     Does Not Have a Kid, Want Kids  
 religion                                      
 agnosticism                             

The chi-square test of independence between religion and offspring category (specifically, "Does Not Have a Kid, Does Not Want Any More Kids" and "Does Not Have a Kid, Want Kids") yielded a chi-square statistic of approximately 370.75, with a p-value of approximately
3.36×10 −75 and 8 degrees of freedom.

Given this extremely small p-value, we reject the null hypothesis of independence, indicating a significant association between religious affiliation and the specified offspring categories. The expected frequencies, compared to the observed frequencies, suggest differences in offspring preferences or situations across different religions.

# Statistical test Education level vs. offspring

In [5]:
# Define high education levels
high_education_levels = ['masters program', 'law school', 'med school', 'ph.d program']

# Filter the dataset for high education levels
offspring_filtered_df['high_educated'] = offspring_filtered_df['education'].isin(high_education_levels).map({True: 'Highly Educated', False: 'Not Highly Educated'})

# Prepare the contingency table for the chi-square test between high education levels and offspring categories
contingency_table_education = pd.crosstab(offspring_filtered_df['high_educated'], offspring_filtered_df['offspring'])

# Perform the chi-square test
chi2_education, p_education, dof_education, expected_education = chi2_contingency(contingency_table_education)

chi2_education, p_education, dof_education, expected_education, contingency_table_education


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offspring_filtered_df['high_educated'] = offspring_filtered_df['education'].isin(high_education_levels).map({True: 'Highly Educated', False: 'Not Highly Educated'})


(50.224453690427254,
 1.371282982810214e-12,
 1,
 array([[ 922.42424242,  657.57575758],
        [2121.57575758, 1512.42424242]]),
 offspring            Does Not Have a Kid, Does Not Want Any More Kids  \
 high_educated                                                           
 Highly Educated                                                   806   
 Not Highly Educated                                              2238   
 
 offspring            Does Not Have a Kid, Want Kids  
 high_educated                                        
 Highly Educated                                 774  
 Not Highly Educated                            1396  )

The chi-square test of independence between high education level (categorized as "Highly Educated" for those with masters, law, med, or Ph.D. programs and "Not Highly Educated" for all others) and offspring category (specifically, "Does Not Have a Kid, Does Not Want Any More Kids" and "Does Not Have a Kid, Want Kids") resulted in a chi-square statistic of approximately 50.22, with a p-value of approximately 1.37×10−12 and 1 degree of freedom.

With this significantly small p-value, we reject the null hypothesis of independence, suggesting a significant association between the level of education and the specified offspring categories. The expected frequencies, when compared to the observed frequencies, indicate the differences in preferences or situations regarding offspring among different education levels.
