# Speed Dating

## Contents

1) Initial exploration of the dataset
2) Principal 
3) Testing underlying assumptions

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
pd.set_option("max_seq_items", None)
pd.set_option('display.max_rows', None, 'display.max_columns', None)


In [2]:
df = pd.read_csv("data\speed_dating_data.csv", encoding = "ISO-8859-1")


## General exploration of the dataset

In [3]:
df["age"].describe()

count    8283.000000
mean       26.358928
std         3.566763
min        18.000000
25%        24.000000
50%        26.000000
75%        28.000000
max        55.000000
Name: age, dtype: float64

As there are a lot of columns in the dataset, I printed out the names of columns to get an overview of the features of the dataset.

In [4]:
print(df.columns)

Index(['iid', 'id', 'gender', 'idg', 'condtn', 'wave', 'round', 'position',
       'positin1', 'order', 'partner', 'pid', 'match', 'int_corr', 'samerace',
       'age_o', 'race_o', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun',
       'pf_o_amb', 'pf_o_sha', 'dec_o', 'attr_o', 'sinc_o', 'intel_o', 'fun_o',
       'amb_o', 'shar_o', 'like_o', 'prob_o', 'met_o', 'age', 'field',
       'field_cd', 'undergra', 'mn_sat', 'tuition', 'race', 'imprace',
       'imprelig', 'from', 'zipcode', 'income', 'goal', 'date', 'go_out',
       'career', 'career_c', 'sports', 'tvsports', 'exercise', 'dining',
       'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv',
       'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga',
       'exphappy', 'expnum', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1',
       'amb1_1', 'shar1_1', 'attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1',
       'amb4_1', 'shar4_1', 'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1',
       'amb2_1', 'shar2_1', 'attr3_1', 'si

Let's check whether there are missing values in the dataset if any.

In [5]:

df.isnull().sum()

iid            0
id             1
gender         0
idg            0
condtn         0
wave           0
round          0
position       0
positin1    1846
order          0
partner        0
pid           10
match          0
int_corr     158
samerace       0
age_o        104
race_o        73
pf_o_att      89
pf_o_sin      89
pf_o_int      89
pf_o_fun      98
pf_o_amb     107
pf_o_sha     129
dec_o          0
attr_o       212
sinc_o       287
intel_o      306
fun_o        360
amb_o        722
shar_o      1076
like_o       250
prob_o       318
met_o        385
age           95
field         63
field_cd      82
undergra    3464
mn_sat      5245
tuition     4795
race          63
imprace       79
imprelig      79
from          79
zipcode     1064
income      4099
goal          79
date          97
go_out        79
career        89
career_c     138
sports        79
tvsports      79
exercise      79
dining        79
museums       79
art           79
hiking        79
gaming        79
clubbing      

## General characteristics of the dataset demographic.

Each participant in the study had a unique number ("iid"). The original dataset contains several rows for each iid, each row corresponding to a different partner that the person with the given iid met during speed dating. 

I grouped the dataset by the iid in order to count the number of unique participants in the study.

In [6]:
df_iid = df.groupby("iid").mean(numeric_only=True).reset_index(drop=False)

print(f"Total number of participants in this study: {len(df_iid)}")

Total number of participants in this study: 551


In the dataset, the column "gender" contains information whether the participant was male (1) or female (0). To avoid confusion, I chose to create a column "gender_name" where the gender of each participant is indicated explicitly as "Male" or "Female".

In [7]:
df_iid["gender_name"] = df_iid['gender'].apply(lambda x: "Male" if x==1 else "Female")
df_iid[['iid', 'gender', 'age', 'gender_name']].head()


Unnamed: 0,iid,gender,age,gender_name
0,1,0.0,21.0,Female
1,2,0.0,24.0,Female
2,3,0.0,25.0,Female
3,4,0.0,23.0,Female
4,5,0.0,21.0,Female


Each iid corresponds to a unique participant. I used the dataset grouped by iid to count the number of unique values in column "iid" to determine the total number of participants. I also used the dataset to count the number of men and women who participated in the study.

In [8]:
df_iid[df_iid["gender_name"] == "Male"].head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,partner,pid,match,int_corr,samerace,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field_cd,race,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1,amb5_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,attr1_s,sinc1_s,intel1_s,fun1_s,amb1_s,shar1_s,attr3_s,sinc3_s,intel3_s,fun3_s,amb3_s,satis_2,length,numdat_2,attr7_2,sinc7_2,intel7_2,fun7_2,amb7_2,shar7_2,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2,attr4_2,sinc4_2,intel4_2,fun4_2,amb4_2,shar4_2,attr2_2,sinc2_2,intel2_2,fun2_2,amb2_2,shar2_2,attr3_2,sinc3_2,intel3_2,fun3_2,amb3_2,attr5_2,sinc5_2,intel5_2,fun5_2,amb5_2,you_call,them_cal,date_3,numdat_3,num_in_3,attr1_3,sinc1_3,intel1_3,fun1_3,amb1_3,shar1_3,attr7_3,sinc7_3,intel7_3,fun7_3,amb7_3,shar7_3,attr4_3,sinc4_3,intel4_3,fun4_3,amb4_3,shar4_3,attr2_3,sinc2_3,intel2_3,fun2_3,amb2_3,shar2_3,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3,gender_name
10,11,1.0,1.0,2.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.0,0.056,0.6,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.4,5.6,8.8,8.2,5.4,6.4,5.222222,6.2,5.2,1.9,27.0,8.0,2.0,7.0,3.0,1.0,5.0,4.0,2.0,8.0,7.0,2.0,6.0,7.0,5.0,5.0,5.0,4.0,9.0,2.0,4.0,8.0,7.0,8.0,5.0,1.0,7.0,3.0,35.0,20.0,20.0,20.0,0.0,5.0,,,,,,,25.0,5.0,20.0,20.0,25.0,5.0,8.0,9.0,7.0,8.0,5.0,,,,,,0.0,6.8,7.8,7.2,7.6,7.1,5.1,6.4,3.5,1.9,0.0,,,,,,,,,,,,7.0,2.0,,,,,,,,19.51,17.07,17.07,17.07,12.2,17.07,,,,,,,,,,,,,8.0,8.0,8.0,6.0,5.0,,,,,,0.0,0.0,0.0,,,35.0,25.0,15.0,15.0,0.0,10.0,,,,,,,,,,,,,,,,,,,7.0,8.0,7.0,7.0,5.0,,,,,,Male
11,12,2.0,1.0,4.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.2,0.155,0.6,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.4,7.1,6.6,7.1,6.7,6.9,6.111111,6.9,4.2,1.5,22.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,7.0,9.0,8.0,7.0,6.0,3.0,3.0,5.0,6.0,6.0,4.0,7.0,7.0,9.0,5.0,5.0,7.0,20.0,60.0,0.0,0.0,40.0,0.0,0.0,,,,,,,25.0,15.0,25.0,20.0,15.0,0.0,9.0,9.0,9.0,10.0,9.0,,,,,,0.3,6.2,6.7,9.1,5.9,5.7,4.7,6.9,3.5,1.7,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Male
12,13,3.0,1.0,6.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.4,0.119,0.3,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.4,4.8,7.9,8.8,7.0,5.9,5.888889,6.4,5.111111,1.6,22.0,1.0,4.0,3.0,5.0,2.0,7.0,1.0,1.0,7.0,8.0,2.0,9.0,5.0,6.0,4.0,7.0,7.0,6.0,8.0,10.0,8.0,9.0,9.0,8.0,1.0,3.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,,,,,,,20.0,16.0,16.0,18.0,16.0,14.0,4.0,7.0,8.0,8.0,3.0,,,,,,1.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.625,,,,,,,,,,,,,10.0,3.0,1.0,,,,,,,20.0,20.0,20.0,20.0,7.5,12.5,,,,,,,,,,,,,3.0,7.0,7.0,8.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Male
13,14,4.0,1.0,8.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.8,-0.057,0.6,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.8,8.2,7.6,7.4,7.6,6.6,7.444444,7.7,5.5,1.6,23.0,1.0,2.0,1.0,1.0,2.0,4.0,1.0,1.0,10.0,6.0,8.0,8.0,3.0,3.0,10.0,8.0,8.0,6.0,7.0,3.0,10.0,6.0,8.0,6.0,1.0,8.0,15.0,30.0,5.0,15.0,40.0,5.0,5.0,,,,,,,20.0,15.0,20.0,20.0,5.0,20.0,9.0,9.0,9.0,9.0,9.0,,,,,,1.0,7.2,8.1,7.8,8.2,8.7,8.3,7.5,7.3,1.6,,,,,,,,,,,,,6.0,3.0,,,,,,,,20.0,14.29,17.14,22.86,14.29,11.43,,,,,,,,,,,,,9.0,9.0,9.0,9.0,9.0,,,,,,1.0,1.0,0.0,,,30.0,10.0,20.0,30.0,5.0,5.0,,,,,,,,,,,,,,,,,,,9.0,9.0,9.0,9.0,9.0,,,,,,Male
14,15,5.0,1.0,10.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.3,0.052,0.0,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.3,5.7,7.6,7.7,7.2,6.5,6.666667,6.3,4.777778,1.7,24.0,1.0,3.0,3.0,1.0,1.0,4.0,1.0,1.0,9.0,7.0,9.0,7.0,4.0,3.0,6.0,7.0,9.0,8.0,6.0,9.0,9.0,6.0,7.0,2.0,1.0,5.0,9.0,30.0,10.0,20.0,10.0,10.0,20.0,,,,,,,20.0,20.0,20.0,20.0,10.0,10.0,7.0,7.0,7.0,9.0,9.0,,,,,,0.9,7.6,8.5,8.8,7.5,8.25,6.75,7.7,4.9,1.8,,,,,,,,,,,,,8.0,2.0,1.0,,,,,,,21.21,18.18,12.12,21.21,6.06,21.21,,,,,,,,,,,,,7.0,7.0,9.0,7.0,9.0,,,,,,3.0,0.0,0.0,,,30.0,10.0,20.0,20.0,5.0,15.0,,,,,,,,,,,,,,,,,,,7.0,7.0,9.0,7.0,9.0,,,,,,Male


In [9]:
nb_women = len(df_iid[df_iid["gender_name"] == "Female"])
nb_men = len(df_iid[df_iid["gender_name"] == "Male"])
print(f"There were {nb_men} male participants and {nb_women} female participants in the study.")


There were 277 male participants and 274 female participants in the study.


In [10]:
fig = px.box(df_iid, x="gender_name", y="age", color="gender_name", color_discrete_sequence=["#F78FE4", "#0460C9"])
fig.show()

In [11]:
type(df_iid[df_iid["gender_name"] == "Female"])

pandas.core.frame.DataFrame

In [12]:
import plotly.figure_factory as ff
import numpy as np

x1= df_iid[df_iid["gender_name"] == "Female"]["age"].dropna().copy()
x2= df_iid[df_iid["gender_name"] == "Male"]["age"].dropna().copy()
x1 = x1.to_numpy()
x2 = x2.to_numpy()

hist_data = [x1, x2]

group_labels = ['Female', 'Male']
colors = ["#F78FE4", "#0460C9"]

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                         bin_size=1, show_rug=False)

# Add title
fig.update_layout(title_text='Hist and Curve Plot')
fig.show()

In [13]:
import plotly.figure_factory as ff
import numpy as np

x1 = np.random.randn(200) - 2
x2 = np.random.randn(200)
x3 = np.random.randn(200) + 2

hist_data = [x1, x2, x3]

group_labels = ['Group 1', 'Group 2', 'Group 3']
colors = ['#A56CC1', '#A6ACEC', '#63F5EF']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                         bin_size=.2, show_rug=False)

# Add title
fig.update_layout(title_text='Hist and Curve Plot')
fig.show()

In [14]:

fig = px.histogram(df_iid, x="age",
             color='gender_name', barmode='group', color_discrete_sequence=["#F78FE4", "#1179f0"])
fig.show()

Education field:

In [15]:
len(df.columns)

195

In [16]:
len(df_iid.columns)

188

The initial dataset contained columns "field" and "field_cd". After groupby, only the numerical columns were left in the new dataframe, so I decided to recreate a column containing the name of the field of study.

In [17]:
df_iid['field_name'] = df_iid["field_cd"].apply(lambda x: "Law" if x == 1 
                                                    else "Math" if x == 2 
                                                    else "Social Science / Psychology" if x == 3 
                                                    else "Medical Science / Pharma / Biotech" if x == 4 
                                                    else "Engineering" if x==5
                                                    else "English / Creative Writing / Journalism" if x == 6 
                                                    else "History / Religion / Philosopy" if x == 7 
                                                    else "Business / Economy / Finance" if x == 8 
                                                    else "Education, Academia" if x==9
                                                    else "Biology / Chemistry / Physics" if x==10
                                                    else "Social Work" if x == 11
                                                    else "Undergrad / Undecided" if x == 12 
                                                    else "Political Science and International Affairs" if x == 13 
                                                    else "Film" if x==14
                                                    else "Fine Arts / Arts Administration" if x == 15 
                                                    else "Languages" if x == 16
                                                    else "Architecture" if x==17 
                                                    else "Other")

In [18]:
import plotly.express as px
fig = px.histogram(df_iid, y="field_name", opacity = 0.75)
fig.show()

In [19]:
import plotly.express as px
fig = px.histogram(df_iid, y="field_name", opacity = 0.75).update_yaxes(categoryorder = "total ascending")
fig.show()

In [20]:
import plotly.express as px
fig = px.histogram(df_iid, y="field_name", color = "field_cd", opacity = 0.75).update_yaxes(categoryorder = "total ascending")
fig.show()

In [21]:
fig = px.histogram(df_iid, y="field_name",
             color='field_name', opacity = 0.75)
fig.show()

To have a more general view of the participants in the study, I wanted to: 
1) Regroup some fields of study chosen by the authors of the experiment into more general categories. I chose to group them according to the system used by the French ONISEP ("Office national d'information sur les enseignements et les professions") so that the categores were more understandable for a reader from France.
2) For each of the fields that were obtained in this manner, percentage of male and female students was calculated.

In [22]:
df_iid['field_name'] = df_iid["field_cd"].apply(lambda x: "Law" if x == 1 
                                                    else "Math" if x == 2 
                                                    else "Social Science / Psychology" if x == 3 
                                                    else "Medical Science / Pharma / Biotech" if x == 4 
                                                    else "Engineering" if x==5
                                                    else "English / Creative Writing / Journalism" if x == 6 
                                                    else "History / Religion / Philosopy" if x == 7 
                                                    else "Business / Economy / Finance" if x == 8 
                                                    else "Education, Academia" if x==9
                                                    else "Biology / Chemistry / Physics" if x==10
                                                    else "Social Work" if x == 11
                                                    else "Undergrad / Undecided" if x == 12 
                                                    else "Political Science and International Affairs" if x==13 
                                                    else "Film" if x==14
                                                    else "Fine Arts / Arts Administration" if x==15 
                                                    else "Languages" if x == 16
                                                    else "Architecture" if x==17 
                                                    else "Other")

In [23]:
df_iid['domain_of_study'] = df_iid["field_cd"].apply(lambda x: "Arts" if x==14 or x==15
                                                    else "Economy and Management" if x==8
                                                    else "Sciences" if x==2 or x==5 or x ==10
                                                    else "Medicine, Pharma, Biotech" if x==4
                                                    else "Education, Academia" if x==9
                                                    else "Law and Political Science" if x==1 or x==13
                                                    else "Literature and Languages" if x==6 or x ==16
                                                    else "Social Sciences, Humanities, Psychology" if x==3 or x==7
                                                    else "Education, Academia" if x==9
                                                    else "Social Work" if x==11
                                                    else "Architecture" if x==17
                                                    else "Undergrad / Undecided" if x==12
                                                    else "Other")

In [24]:
fig = px.histogram(df_iid, y="domain_of_study", opacity = 0.75).update_yaxes(categoryorder = "total ascending")
fig.show()

In [52]:
fig = px.histogram(df_iid, y="domain_of_study", opacity = 0.75, histnorm="percent").update_yaxes(categoryorder = "total ascending")
fig.show()

In [80]:
df_iid.head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,partner,pid,match,int_corr,samerace,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field_cd,race,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1,amb5_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,attr1_s,sinc1_s,intel1_s,fun1_s,amb1_s,shar1_s,attr3_s,sinc3_s,intel3_s,fun3_s,amb3_s,satis_2,length,numdat_2,attr7_2,sinc7_2,intel7_2,fun7_2,amb7_2,shar7_2,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2,attr4_2,sinc4_2,intel4_2,fun4_2,amb4_2,shar4_2,attr2_2,sinc2_2,intel2_2,fun2_2,amb2_2,shar2_2,attr3_2,sinc3_2,intel3_2,fun3_2,amb3_2,attr5_2,sinc5_2,intel5_2,fun5_2,amb5_2,you_call,them_cal,date_3,numdat_3,num_in_3,attr1_3,sinc1_3,intel1_3,fun1_3,amb1_3,shar1_3,attr7_3,sinc7_3,intel7_3,fun7_3,amb7_3,shar7_3,attr4_3,sinc4_3,intel4_3,fun4_3,amb4_3,shar4_3,attr2_3,sinc2_3,intel2_3,fun2_3,amb2_3,shar2_3,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3,gender_name,field_name,domain_of_study,participant_goal
0,1,1.0,0.0,1.0,1.0,1.0,10.0,7.0,,5.5,5.5,15.5,0.4,0.267,0.1,25.2,2.3,44.233,7.911,16.511,16.911,4.511,9.922,0.5,6.7,7.4,8.0,7.2,8.0,7.1,6.85,5.7,1.9,21.0,1.0,4.0,2.0,4.0,2.0,7.0,1.0,,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,2.0,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,35.0,20.0,15.0,20.0,5.0,5.0,6.0,8.0,8.0,8.0,7.0,,,,,,0.8,5.7,7.3,7.3,6.8,6.3,6.5,6.5,5.888889,1.777778,4.0,,,,,,,,,,,,6.0,2.0,1.0,,,,,,,19.44,16.67,13.89,22.22,11.11,16.67,,,,,,,,,,,,,6.0,7.0,8.0,7.0,6.0,,,,,,1.0,1.0,0.0,,,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,,,,,,,,,,,,,5.0,7.0,7.0,7.0,7.0,,,,,,Female,Law,Law and Political Science,To meet new people
1,2,2.0,0.0,3.0,1.0,1.0,10.0,3.0,,5.5,5.5,15.5,0.2,0.258,0.8,25.2,2.3,44.233,7.911,16.511,16.911,4.511,9.922,0.6,7.7,7.1,7.9,7.5,7.5,6.5,7.6,5.5,1.9,24.0,1.0,2.0,2.0,5.0,1.0,5.0,1.0,,3.0,2.0,7.0,10.0,8.0,6.0,3.0,5.0,8.0,10.0,1.0,9.0,8.0,7.0,8.0,3.0,1.0,4.0,5.0,45.0,5.0,25.0,20.0,0.0,5.0,,,,,,,65.0,0.0,10.0,25.0,0.0,0.0,7.0,5.0,10.0,8.0,3.0,,,,,,0.4,6.4,7.0,7.7,6.1,6.5,5.7,6.6,5.9,1.9,3.0,,,,,,,,,,,,5.0,2.0,,,,,,,,18.92,18.92,21.62,27.03,5.41,8.11,,,,,,,,,,,,,7.0,6.0,8.0,9.0,4.0,,,,,,0.0,0.0,0.0,,,30.0,5.0,40.0,15.0,0.0,10.0,,,,,,,,,,,,,,,,,,,7.0,6.0,9.0,9.0,4.0,,,,,,Female,Law,Law and Political Science,Seemed like a fun night out
2,3,3.0,0.0,5.0,1.0,1.0,10.0,9.0,,5.5,5.5,15.5,0.0,-0.136,0.8,25.2,2.3,44.233,7.911,16.511,16.911,4.511,9.922,0.5,6.5,7.1,7.3,6.2,7.111111,6.0,6.0,4.5,1.9,25.0,2.0,2.0,8.0,4.0,6.0,3.0,1.0,,3.0,8.0,7.0,8.0,5.0,5.0,8.0,4.0,5.0,7.0,8.0,7.0,7.0,7.0,5.0,8.0,7.0,4.0,2.0,35.0,10.0,35.0,10.0,10.0,0.0,,,,,,,50.0,0.0,20.0,30.0,0.0,0.0,8.0,9.0,8.0,9.0,8.0,,,,,,0.0,8.1,8.6,9.4,7.7,8.8,8.1,8.2,7.0,1.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Female,Math,Sciences,Other
3,4,4.0,0.0,7.0,1.0,1.0,10.0,6.0,,5.5,5.5,15.5,0.2,-0.007,0.8,25.2,2.3,44.233,7.911,16.511,16.911,4.511,9.922,0.6,7.0,7.1,7.7,7.5,7.7,7.2,7.3,6.5,1.777778,23.0,1.0,2.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,6.0,7.0,6.0,7.0,7.0,5.0,7.0,7.0,7.0,9.0,7.0,8.0,7.0,1.0,8.0,1.0,2.0,20.0,20.0,20.0,20.0,10.0,10.0,,,,,,,30.0,10.0,15.0,30.0,5.0,10.0,7.0,8.0,9.0,7.0,8.0,,,,,,0.3,6.4,8.9,8.6,7.8,7.8,7.1,6.6,5.2,1.8,2.0,,,,,,,,,,,,4.0,3.0,2.0,,,,,,,24.14,13.79,20.69,27.59,10.34,3.45,,,,,,,,,,,,,6.0,8.0,7.0,8.0,6.0,,,,,,0.0,0.0,0.0,,,20.0,20.0,20.0,20.0,0.0,20.0,,,,,,,,,,,,,,,,,,,6.0,5.0,6.0,8.0,5.0,,,,,,Female,Law,Law and Political Science,Seemed like a fun night out
4,5,5.0,0.0,9.0,1.0,1.0,10.0,4.0,,5.5,5.5,15.5,0.2,-0.079,0.8,25.2,2.3,44.233,7.911,16.511,16.911,4.511,9.922,0.3,5.3,7.7,7.6,7.2,7.8,6.2,6.1,6.5,1.7,21.0,1.0,2.0,8.0,1.0,2.0,4.0,1.0,1.0,7.0,4.0,7.0,7.0,6.0,8.0,6.0,6.0,8.0,6.0,8.0,6.0,6.0,3.0,7.0,8.0,3.0,7.0,10.0,20.0,5.0,25.0,25.0,10.0,15.0,,,,,,,50.0,10.0,10.0,20.0,5.0,5.0,6.0,3.0,6.0,10.0,8.0,,,,,,0.6,6.3,6.0,7.0,6.0,5.6,6.2,7.2,3.7,1.7,,,,,,,,,,,,,7.0,2.0,2.0,,,,,,,15.79,13.16,18.42,15.79,15.79,21.05,,,,,,,,,,,,,6.0,6.0,9.0,9.0,9.0,,,,,,0.0,0.0,0.0,,,30.0,10.0,20.0,20.0,10.0,10.0,,,,,,,,,,,,,,,,,,,4.0,5.0,10.0,6.0,10.0,,,,,,Female,Law,Law and Political Science,To meet new people


In [82]:
new_df = df_iid[["domain_of_study", "gender_name"]]
new_df.head()

Unnamed: 0,domain_of_study,gender_name
0,Law and Political Science,Female
1,Law and Political Science,Female
2,Sciences,Female
3,Law and Political Science,Female
4,Law and Political Science,Female


In [88]:
domain_by_gender_series = new_df.groupby(["domain_of_study", "gender_name"]).size()

In [90]:
type(domain_by_gender_series)

pandas.core.series.Series

In [91]:
domain_by_gender_series

domain_of_study                          gender_name
Architecture                             Male             1
Arts                                     Female          12
                                         Male             6
Economy and Management                   Female          30
                                         Male           100
Education, Academia                      Female          35
                                         Male             5
Law and Political Science                Female          45
                                         Male            49
Literature and Languages                 Female          18
                                         Male             6
Medicine, Pharma, Biotech                Female           7
                                         Male             2
Other                                    Female           5
                                         Male             5
Sciences                                 Female

In [95]:
new_df = domain_by_gender_series.to_frame(name = 'size').reset_index()

In [96]:
new_df

Unnamed: 0,domain_of_study,gender_name,size
0,Architecture,Male,1
1,Arts,Female,12
2,Arts,Male,6
3,Economy and Management,Female,30
4,Economy and Management,Male,100
5,"Education, Academia",Female,35
6,"Education, Academia",Male,5
7,Law and Political Science,Female,45
8,Law and Political Science,Male,49
9,Literature and Languages,Female,18


In [104]:
group_weights = new_df.groupby('domain_of_study').aggregate(sum)
new_df['gender_percent_by_domain'] = new_df.apply(lambda row: row['size']/group_weights.loc[row['domain_of_study']][0],axis=1)


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [105]:
new_df

Unnamed: 0,domain_of_study,gender_name,size,gender_percent_by_domain
0,Architecture,Male,1,1.0
1,Arts,Female,12,0.666667
2,Arts,Male,6,0.333333
3,Economy and Management,Female,30,0.230769
4,Economy and Management,Male,100,0.769231
5,"Education, Academia",Female,35,0.875
6,"Education, Academia",Male,5,0.125
7,Law and Political Science,Female,45,0.478723
8,Law and Political Science,Male,49,0.521277
9,Literature and Languages,Female,18,0.75


In [156]:
new_df["percentage"] = new_df["gender_percent_by_domain"].apply(lambda x: str(round(x*100, 1)) + " %")

In [157]:
new_df

Unnamed: 0,domain_of_study,gender_name,size,gender_percent_by_domain,percentage
20,Social Work,Male,1,0.033333,3.3 %
6,"Education, Academia",Male,5,0.125,12.5 %
12,"Medicine, Pharma, Biotech",Male,2,0.222222,22.2 %
3,Economy and Management,Female,30,0.230769,23.1 %
10,Literature and Languages,Male,6,0.25,25.0 %
18,"Social Sciences, Humanities, Psychology",Male,19,0.301587,30.2 %
2,Arts,Male,6,0.333333,33.3 %
15,Sciences,Female,49,0.374046,37.4 %
7,Law and Political Science,Female,45,0.478723,47.9 %
14,Other,Male,5,0.5,50.0 %


In [158]:
fig=px.bar(new_df, x='gender_percent_by_domain', y='domain_of_study',
            color=new_df['gender_name'], barmode ='stack', color_discrete_sequence=["#1179f0", "#F78FE4"], opacity = 0.85,
            text = new_df['percentage']).update_yaxes(categoryorder = "total ascending")
fig.update_traces(textfont_color='#ffffff')
fig.show()

In [159]:
new_df.sort_values(by=['gender_percent_by_domain'], inplace=True)

In [160]:
new_df

Unnamed: 0,domain_of_study,gender_name,size,gender_percent_by_domain,percentage
20,Social Work,Male,1,0.033333,3.3 %
6,"Education, Academia",Male,5,0.125,12.5 %
12,"Medicine, Pharma, Biotech",Male,2,0.222222,22.2 %
3,Economy and Management,Female,30,0.230769,23.1 %
10,Literature and Languages,Male,6,0.25,25.0 %
18,"Social Sciences, Humanities, Psychology",Male,19,0.301587,30.2 %
2,Arts,Male,6,0.333333,33.3 %
15,Sciences,Female,49,0.374046,37.4 %
7,Law and Political Science,Female,45,0.478723,47.9 %
13,Other,Female,5,0.5,50.0 %


In [161]:
fig=px.bar(new_df, x='gender_percent_by_domain', y='domain_of_study',
            color=new_df['gender_name'], barmode ='stack', color_discrete_sequence=["#1179f0", "#F78FE4"], opacity = 0.85,
            text = new_df['percentage']).update_yaxes(categoryorder = "total ascending")
fig.update_traces(textfont_color='#ffffff')
fig.show()

In [None]:
fig=px.bar(new_df, x='gender_percent_by_domain', y='domain_of_study',
            color=new_df['gender_name'], barmode ='stack', color_discrete_sequence=["#1179f0", "#F78FE4"], opacity = 0.85,
            text = new_df['percentage']).update_yaxes(categoryorder = "total ascending")
fig.update_traces(textfont_color='#ffffff')
fig.show()

In [175]:
import plotly.graph_objects as go

fig = go.Figure(go.Bar(
            x=[20, 14, 23],
            y=['giraffes', 'orangutans', 'monkeys'],
            orientation='h',
            # define the annotations
            text=['giraffes', 'orangutans', 'monkeys'],
            # position, "auto", "inside" or "outside"
            textposition="auto",
            # anchor could be "start" or "end"
            insidetextanchor="middle",
            insidetextfont=dict(family='Times', size=13, color='white'),
            outsidetextfont=dict(family='Times', size=13, color='white')))
fig.update_layout(
    yaxis=dict(
        showticklabels=False,
    ))
fig.show()

In [176]:
import plotly.graph_objects as go

fig = go.Figure(go.Bar(
            x=[20, 14, 23],
            y=['giraffes', 'orangutans', 'monkeys'],
            orientation='h',
            # define the annotations
            text=['giraffes', 'orangutans', 'monkeys'],
            # position, "auto", "inside" or "outside"
            textposition="auto",
            # anchor could be "start" or "end"
            insidetextanchor="middle"))
fig.update_layout(
    yaxis=dict(
        showticklabels=False,
    ))
fig.show()

In [173]:

fig=px.bar(new_df, x='gender_percent_by_domain', y='domain_of_study',
            color=new_df['gender_name'], barmode ='stack', color_discrete_sequence=["#1179f0", "#F78FE4"], opacity = 0.85,
            text = new_df['percentage']).update_yaxes(categoryorder = "total ascending")
fig.update_traces(textfont_color='#ffffff')
fig.show()


TypeError: bar() got an unexpected keyword argument 'insidetextanchor'

In [165]:
import plotly.graph_objects as go

top_labels = ['Strongly<br>agree', 'Agree', 'Neutral', 'Disagree',
              'Strongly<br>disagree']

colors = ['rgba(38, 24, 74, 0.8)', 'rgba(71, 58, 131, 0.8)',
          'rgba(122, 120, 168, 0.8)', 'rgba(164, 163, 204, 0.85)',
          'rgba(190, 192, 213, 1)']

x_data = [[21, 30, 21, 16, 12],
          [24, 31, 19, 15, 11],
          [27, 26, 23, 11, 13],
          [29, 24, 15, 18, 14]]

y_data = ['The course was effectively<br>organized',
          'The course developed my<br>abilities and skills ' +
          'for<br>the subject', 'The course developed ' +
          'my<br>ability to think critically about<br>the subject',
          'I would recommend this<br>course to a friend']

fig = go.Figure()

for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        fig.add_trace(go.Bar(
            x=[xd[i]], y=[yd],
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(color='rgb(248, 248, 249)', width=1)
            )
        ))

fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

annotations = []

for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis)
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]) + '%',
                            font=dict(family='Arial', size=14,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first Likert scale (on the top)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=14,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    space = xd[0]
    for i in range(1, len(xd)):
            # labeling the rest of percentages for each bar (x_axis)
            annotations.append(dict(xref='x', yref='y',
                                    x=space + (xd[i]/2), y=yd,
                                    text=str(xd[i]) + '%',
                                    font=dict(family='Arial', size=14,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))
            # labeling the Likert scale
            if yd == y_data[-1]:
                annotations.append(dict(xref='x', yref='paper',
                                        x=space + (xd[i]/2), y=1.1,
                                        text=top_labels[i],
                                        font=dict(family='Arial', size=14,
                                                  color='rgb(67, 67, 67)'),
                                        showarrow=False))
            space += xd[i]

fig.update_layout(annotations=annotations)

fig.show()

In [168]:
top_labels = new_df["gender_name"].unique()
top_labels

array(['Male', 'Female'], dtype=object)

In [171]:
y_data = new_df["domain_of_study"].unique()
y_data

array(['Social Work', 'Education, Academia', 'Medicine, Pharma, Biotech',
       'Economy and Management', 'Literature and Languages',
       'Social Sciences, Humanities, Psychology', 'Arts', 'Sciences',
       'Law and Political Science', 'Other', 'Architecture',
       'Undergrad / Undecided'], dtype=object)

In [169]:
import plotly.graph_objects as go

top_labels = new_df["gender_name"].unique()

colors = ["#1179f0", "#F78FE4"]

x_data = [[21, 30, 21, 16, 12],
          [24, 31, 19, 15, 11],
          [27, 26, 23, 11, 13],
          [29, 24, 15, 18, 14]]

y_data = new_df["domain_of_study"].unique()

fig = go.Figure()

for i in range(0, len(x_data[0])):
    for xd, yd in zip(x_data, y_data):
        fig.add_trace(go.Bar(
            x=[xd[i]], y=[yd],
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(color='rgb(248, 248, 249)', width=1)
            )
        ))

fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

annotations = []

for yd, xd in zip(y_data, x_data):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis)
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(xd[0]) + '%',
                            font=dict(family='Arial', size=14,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first Likert scale (on the top)
    if yd == y_data[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=top_labels[0],
                                font=dict(family='Arial', size=14,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    space = xd[0]
    for i in range(1, len(xd)):
            # labeling the rest of percentages for each bar (x_axis)
            annotations.append(dict(xref='x', yref='y',
                                    x=space + (xd[i]/2), y=yd,
                                    text=str(xd[i]) + '%',
                                    font=dict(family='Arial', size=14,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))
            # labeling the Likert scale
            if yd == y_data[-1]:
                annotations.append(dict(xref='x', yref='paper',
                                        x=space + (xd[i]/2), y=1.1,
                                        text=top_labels[i],
                                        font=dict(family='Arial', size=14,
                                                  color='rgb(67, 67, 67)'),
                                        showarrow=False))
            space += xd[i]

fig.update_layout(annotations=annotations)

fig.show()

IndexError: list index out of range

In [None]:
fig=px.bar(new_df, x='gender_percent_by_domain', y='domain_of_study',
            color=new_df['gender_name'], barmode ='stack', color_discrete_sequence=["#1179f0", "#F78FE4"], opacity = 0.85,
            text = new_df['percentage']).update_yaxes(categoryorder = "total ascending")
fig.update_traces(textfont_color='#ffffff')
fig.show()

In [149]:
fig=px.bar(new_df, x='gender_percent_by_domain', y='domain_of_study',
            color=new_df['gender_name'], barmode ='stack', color_discrete_sequence=["#1179f0", "#F78FE4"], opacity = 0.75,
            text = new_df['percentage']).update_yaxes(categoryorder = " descending")

fig.show()

ValueError: 
    Invalid value of type 'builtins.str' received for the 'categoryorder' property of layout.yaxis
        Received value: ' descending'

    The 'categoryorder' property is an enumeration that may be specified as:
      - One of the following enumeration values:
            ['trace', 'category ascending', 'category descending',
            'array', 'total ascending', 'total descending', 'min
            ascending', 'min descending', 'max ascending', 'max
            descending', 'sum ascending', 'sum descending', 'mean
            ascending', 'mean descending', 'median ascending', 'median
            descending']

In [51]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import plotly.express as px

df = pd.DataFrame({
    'date': ['2022-01-07','2022-02-07','2022-03-07','2022-04-07','2022-05-07','2022-06-07','2022-07-07','2022-08-07'],
    'var1': [5,7,2,4,6,8,10,9],
    'var2': [6,7,8,5,2,6,3,1],
    'var3':[8,5,6,2,8,3,5,4],
    'var4':[7,9,7,5,3,4,2,1]})

print(df.head())

df_melt = df.melt(id_vars=['date'],var_name='var',value_name='Amount',value_vars=df.columns[1:],ignore_index=True)
print(df_melt.head())

df_melt['%'] = 100 * df_melt['Amount'] / df_melt.groupby('date')['Amount'].transform('sum')
print(df_melt.head())

fig = px.bar(df_melt, x="date", y="%",color='var',
        title="Bar Plot", 
        template="plotly_white")   
fig.update_layout(barmode="relative")
fig.update_layout(plot_bgcolor='white')
fig.update_yaxes(showline=False,showgrid=False)
fig.update_xaxes(showline=False,showgrid=False)
fig.show()

         date  var1  var2  var3  var4
0  2022-01-07     5     6     8     7
1  2022-02-07     7     7     5     9
2  2022-03-07     2     8     6     7
3  2022-04-07     4     5     2     5
4  2022-05-07     6     2     8     3
         date   var  Amount
0  2022-01-07  var1       5
1  2022-02-07  var1       7
2  2022-03-07  var1       2
3  2022-04-07  var1       4
4  2022-05-07  var1       6
         date   var  Amount          %
0  2022-01-07  var1       5  19.230769
1  2022-02-07  var1       7  25.000000
2  2022-03-07  var1       2   8.695652
3  2022-04-07  var1       4  25.000000
4  2022-05-07  var1       6  31.578947


In [25]:
fig = px.histogram(df_iid, y="domain_of_study", color_discrete_sequence=["#F78FE4", "#1179f0"], opacity = 0.75, color = "gender_name").update_yaxes(categoryorder = "total ascending")
fig.show()

In [48]:
fig = px.histogram(df_iid, y="domain_of_study", color_discrete_sequence=["#F78FE4", "#1179f0"], opacity = 0.75, color = "gender_name", histnorm="percent").update_yaxes(categoryorder = "total ascending")
fig.show()

Income:

The authors of the experiment did not ask for income directly but inferred it from the zipcode of the area where the participant grew up. Therefore the income provided in the dataset for each person is in fact the median household income in the area where the person grew up (according to the US Census Bureau data at the time of the experiment).


Motivation:

The authors asked each participant to indicate their goal for participating in the speed dating experiment.

In [26]:
df_iid['participant_goal'] = df_iid["goal"].apply(lambda x: "Seemed like a fun night out" if x == 1 
                                                    else "To meet new people" if x == 2 
                                                    else "To get a date" if x == 3 
                                                    else "Looking for a serious relationship" if x == 4 
                                                    else "To say I did it" if x==5 
                                                    else "Other")

In [27]:
import plotly.express as px
fig = px.histogram(df_iid, y="participant_goal",
             color='gender_name', barmode='group', color_discrete_sequence=["#F78FE4", "#1179f0"], opacity=0.60, histnorm = 'percent')
fig.show()

In [28]:
fig = px.histogram(df_iid, y="participant_goal",
             color='gender_name', barmode='group', color_discrete_sequence=["#F78FE4", "#1179f0"], opacity=0.60, histnorm = 'percent')
fig.show()

Percent successful matches:

In [29]:
df_match=df[["iid", "pid", "match"]]

In [30]:
np.sort(df_match.iloc[:, :3], axis = 1)

array([[  0.,   1.,  11.],
       [  0.,   1.,  12.],
       [  1.,   1.,  13.],
       ...,
       [  0., 528., 552.],
       [  0., 529., 552.],
       [  0., 530., 552.]])

In [31]:
len(np.sort(df_match.iloc[:, :3]))

8378

In [32]:
#np.sort(df_match.iloc[:, :2])

In [33]:
len(df_match)

8378

In [34]:
pd.DataFrame(np.sort(df_match.iloc[:, :3])).tail()

Unnamed: 0,0,1,2
8373,0.0,526.0,552.0
8374,0.0,527.0,552.0
8375,0.0,528.0,552.0
8376,0.0,529.0,552.0
8377,0.0,530.0,552.0


In [35]:
df_match = df_match.loc[~pd.DataFrame(np.sort(df_match.iloc[:, :3])).duplicated()]
#df_match.reset_index(inplace=True)

In [36]:
len(df_match)

4194

In [37]:
df_match.reset_index(drop=True, inplace=True)

In [38]:
len(df_match)

4194

In [39]:
df_match.head()

Unnamed: 0,iid,pid,match
0,1,11.0,0
1,1,12.0,0
2,1,13.0,1
3,1,14.0,1
4,1,15.0,1


In [40]:
df_match.tail()

Unnamed: 0,iid,pid,match
4189,530,548.0,0
4190,530,549.0,0
4191,530,550.0,0
4192,530,551.0,0
4193,530,552.0,0


In [41]:
df_match["match"].unique()

array([0, 1], dtype=int64)

In [42]:
df_match["match"] = df_match["match"].apply(lambda x: "Yes" if x ==1 else "No")

In [43]:
df_match.head(10)

Unnamed: 0,iid,pid,match
0,1,11.0,No
1,1,12.0,No
2,1,13.0,Yes
3,1,14.0,Yes
4,1,15.0,Yes
5,1,16.0,No
6,1,17.0,No
7,1,18.0,No
8,1,19.0,Yes
9,1,20.0,No


In [44]:
df_match.tail()

Unnamed: 0,iid,pid,match
4189,530,548.0,No
4190,530,549.0,No
4191,530,550.0,No
4192,530,551.0,No
4193,530,552.0,No


In [45]:
"""# pie plot
import plotly.express as px
# This dataframe has 244 lines, but 4 distinct values for `day`
df = px.data.tips()
fig = px.pie(df, values='tip', names='day')
fig.show()"""

"# pie plot\nimport plotly.express as px\n# This dataframe has 244 lines, but 4 distinct values for `day`\ndf = px.data.tips()\nfig = px.pie(df, values='tip', names='day')\nfig.show()"

In [46]:
import plotly.express as px
# This dataframe has 244 lines, but 4 distinct values for `day`
fig = px.pie(df_match, values="match")
fig.show()

Let's explore motivation for dating for men and women who participated in the study.
As the goals in the dataset are coded with numbers, let's create an additional column that will describe the goal explicitly according to the dataset key.

In [47]:
dating_per_person['goal_name'] = dating_per_person["goal"].apply(lambda x: "Seemed like a fun night out" if x == 1 else "To meet new people" if x == 2 else "To get a date" if x == 3 else "Looking for a serious relationship" if x == 4 else "To say I did it" if x==5 else "Other" if x==6 else None)
#dating_per_person.head(15)

NameError: name 'dating_per_person' is not defined

Let's represent different goals by a separate pie chart for both genders.

In [None]:
dating_females = dating_per_person.loc[dating_per_person['gender_name'] == 'Female']
#display(dating_females.head())
females_by_goal = dating_females["goal_name"].value_counts(dropna=False)
#display(females_by_goal)

females_by_goal_array = females_by_goal.to_numpy()


labels = 'Seemed like a fun night out', 'To meet new people', 'To get a date', 'Other', 'To say I did it', 'Looking for a serious relationship', 'N/A'
sizes = females_by_goal_array

plt.pie(sizes, labels=labels, autopct='%1.1f%%')
#plt.legend(bbox_to_anchor=(1, 1))

plt.show()


In [None]:
dating_males = dating_per_person.loc[dating_per_person['gender_name'] == 'Male']
#display(dating_males.head())
males_by_goal = dating_males["goal_name"].value_counts(dropna=False)
#display(males_by_goal)

males_by_goal_array = males_by_goal.to_numpy()

labels = 'Seemed like a fun night out', 'To meet new people', 'To get a date', 'Other', 'To say I did it', 'Looking for a serious relationship', 'N/A'
sizes = males_by_goal_array

plt.pie(sizes, labels=labels, autopct='%1.1f%%')
#plt.legend(bbox_to_anchor=(1, 1))

plt.show()


In [None]:
import numpy as np
females_by_goal_array = females_by_goal.to_numpy()

labels = 'Seemed like a fun night out', 'To meet new people', 'To get a date', 'Other', 'To say I did it', 'Looking for a serious relationship', 'N/A'
sizes = females_by_goal_array

plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.legend(bbox_to_anchor=(1, 1))


plt.show()

In [None]:
labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]

plt.pie(sizes, labels=labels, autopct='%1.1f%%')

plt.show()

In [None]:
#dating_per_person['note_method'] = dating_per_person["wave"].apply(lambda x: '10_scale' if x == 6.0 or 7.0 or 8.0 or 9.0 else "100_point_alloc")
dating_per_person['note_method'] = dating_per_person["wave"].apply(lambda x: '10_scale' if x == 6 or x == 7 or x == 8 or x == 9 else "100_point_alloc")

In [None]:
dating_per_person.head()

In [None]:
dating_per_person_10_scale_mask = dating_per_person["note_method"]=="10_scale"
dating_per_person_10_scale = dating_per_person[dating_per_person_10_scale_mask].reset_index(drop=False)
dating_per_person_10_scale.head()

In [None]:
dating_per_person_100_point_mask = dating_per_person["note_method"]=="100_point_alloc"
dating_per_person_100_point = dating_per_person[dating_per_person_100_point_mask].reset_index(drop=False)
dating_per_person_100_point.head()

In [None]:
# creating a dataframe that contaings estimates of criteria that are important in opposite sex:
dating_per_person_100_point = dating_per_person_100_point.groupby("gender_name").mean().reset_index(drop=False)
dating_per_person_100_point_by_gender = dating_per_person_100_point[["gender_name",'iid', 'match', 'attr1_1', 'sinc1_1',
       'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'attr1_s', 'sinc1_s',
       'intel1_s', 'fun1_s', 'amb1_s', 'shar1_s', 'attr1_2', 'sinc1_2', 'intel1_2', 'fun1_2', 'amb1_2', 'shar1_2',
       'attr1_3', 'sinc1_3', 'intel1_3', 'fun1_3', 'amb1_3', 'shar1_3']]
dating_per_person_100_point_by_gender

In [None]:
a = 23.737295 + 17.284344 + 20.490410 + 17.266721 + 10.174208 + 11.235574
print(a)