# Speed Dating

## Contents

1) Initial exploration of the dataset
2) Principal 
3) Testing underlying assumptions

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
pd.set_option("max_seq_items", None)
pd.set_option('display.max_rows', None, 'display.max_columns', None)


In [2]:
df = pd.read_csv("data\speed_dating_data.csv", encoding = "ISO-8859-1")


## General exploration of the dataset

In [3]:
df["age"].describe()

count    8283.000000
mean       26.358928
std         3.566763
min        18.000000
25%        24.000000
50%        26.000000
75%        28.000000
max        55.000000
Name: age, dtype: float64

As there are a lot of columns in the dataset, I printed out the names of columns to get an overview of the features of the dataset.

In [4]:
print(df.columns)

Index(['iid', 'id', 'gender', 'idg', 'condtn', 'wave', 'round', 'position',
       'positin1', 'order', 'partner', 'pid', 'match', 'int_corr', 'samerace',
       'age_o', 'race_o', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun',
       'pf_o_amb', 'pf_o_sha', 'dec_o', 'attr_o', 'sinc_o', 'intel_o', 'fun_o',
       'amb_o', 'shar_o', 'like_o', 'prob_o', 'met_o', 'age', 'field',
       'field_cd', 'undergra', 'mn_sat', 'tuition', 'race', 'imprace',
       'imprelig', 'from', 'zipcode', 'income', 'goal', 'date', 'go_out',
       'career', 'career_c', 'sports', 'tvsports', 'exercise', 'dining',
       'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv',
       'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga',
       'exphappy', 'expnum', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1',
       'amb1_1', 'shar1_1', 'attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1',
       'amb4_1', 'shar4_1', 'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1',
       'amb2_1', 'shar2_1', 'attr3_1', 'si

Let's check whether there are missing values in the dataset if any.

In [5]:

df.isnull().sum()

iid            0
id             1
gender         0
idg            0
condtn         0
wave           0
round          0
position       0
positin1    1846
order          0
partner        0
pid           10
match          0
int_corr     158
samerace       0
age_o        104
race_o        73
pf_o_att      89
pf_o_sin      89
pf_o_int      89
pf_o_fun      98
pf_o_amb     107
pf_o_sha     129
dec_o          0
attr_o       212
sinc_o       287
intel_o      306
fun_o        360
amb_o        722
shar_o      1076
like_o       250
prob_o       318
met_o        385
age           95
field         63
field_cd      82
undergra    3464
mn_sat      5245
tuition     4795
race          63
imprace       79
imprelig      79
from          79
zipcode     1064
income      4099
goal          79
date          97
go_out        79
career        89
career_c     138
sports        79
tvsports      79
exercise      79
dining        79
museums       79
art           79
hiking        79
gaming        79
clubbing      

## General characteristics of the dataset demographic.

Each participant in the study had a unique number ("iid"). The original dataset contains several rows for each iid, each row corresponding to a different partner that the person with the given iid met during speed dating. 

I grouped the dataset by the iid in order to count the number of unique participants in the study.

In [6]:
df_iid = df.groupby("iid").mean(numeric_only=True).reset_index(drop=False)

print(f"Total number of participants in this study: {len(df_iid)}")

Total number of participants in this study: 551


In the dataset, the column "gender" contains information whether the participant was male (1) or female (0). To avoid confusion, I chose to create a column "gender_name" where the gender of each participant is indicated explicitly as "Male" or "Female".

In [7]:
df_iid["gender_name"] = df_iid['gender'].apply(lambda x: "Male" if x==1 else "Female")
df_iid[['iid', 'gender', 'age', 'gender_name']].head()


Unnamed: 0,iid,gender,age,gender_name
0,1,0.0,21.0,Female
1,2,0.0,24.0,Female
2,3,0.0,25.0,Female
3,4,0.0,23.0,Female
4,5,0.0,21.0,Female


Each iid corresponds to a unique participant. I used the dataset grouped by iid to count the number of unique values in column "iid" to determine the total number of participants. I also used the dataset to count the number of men and women who participated in the study.

In [8]:
df_iid[df_iid["gender_name"] == "Male"].head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,partner,pid,match,int_corr,samerace,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field_cd,race,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1,amb5_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,attr1_s,sinc1_s,intel1_s,fun1_s,amb1_s,shar1_s,attr3_s,sinc3_s,intel3_s,fun3_s,amb3_s,satis_2,length,numdat_2,attr7_2,sinc7_2,intel7_2,fun7_2,amb7_2,shar7_2,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2,attr4_2,sinc4_2,intel4_2,fun4_2,amb4_2,shar4_2,attr2_2,sinc2_2,intel2_2,fun2_2,amb2_2,shar2_2,attr3_2,sinc3_2,intel3_2,fun3_2,amb3_2,attr5_2,sinc5_2,intel5_2,fun5_2,amb5_2,you_call,them_cal,date_3,numdat_3,num_in_3,attr1_3,sinc1_3,intel1_3,fun1_3,amb1_3,shar1_3,attr7_3,sinc7_3,intel7_3,fun7_3,amb7_3,shar7_3,attr4_3,sinc4_3,intel4_3,fun4_3,amb4_3,shar4_3,attr2_3,sinc2_3,intel2_3,fun2_3,amb2_3,shar2_3,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3,gender_name
10,11,1.0,1.0,2.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.0,0.056,0.6,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.4,5.6,8.8,8.2,5.4,6.4,5.222222,6.2,5.2,1.9,27.0,8.0,2.0,7.0,3.0,1.0,5.0,4.0,2.0,8.0,7.0,2.0,6.0,7.0,5.0,5.0,5.0,4.0,9.0,2.0,4.0,8.0,7.0,8.0,5.0,1.0,7.0,3.0,35.0,20.0,20.0,20.0,0.0,5.0,,,,,,,25.0,5.0,20.0,20.0,25.0,5.0,8.0,9.0,7.0,8.0,5.0,,,,,,0.0,6.8,7.8,7.2,7.6,7.1,5.1,6.4,3.5,1.9,0.0,,,,,,,,,,,,7.0,2.0,,,,,,,,19.51,17.07,17.07,17.07,12.2,17.07,,,,,,,,,,,,,8.0,8.0,8.0,6.0,5.0,,,,,,0.0,0.0,0.0,,,35.0,25.0,15.0,15.0,0.0,10.0,,,,,,,,,,,,,,,,,,,7.0,8.0,7.0,7.0,5.0,,,,,,Male
11,12,2.0,1.0,4.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.2,0.155,0.6,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.4,7.1,6.6,7.1,6.7,6.9,6.111111,6.9,4.2,1.5,22.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,7.0,9.0,8.0,7.0,6.0,3.0,3.0,5.0,6.0,6.0,4.0,7.0,7.0,9.0,5.0,5.0,7.0,20.0,60.0,0.0,0.0,40.0,0.0,0.0,,,,,,,25.0,15.0,25.0,20.0,15.0,0.0,9.0,9.0,9.0,10.0,9.0,,,,,,0.3,6.2,6.7,9.1,5.9,5.7,4.7,6.9,3.5,1.7,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Male
12,13,3.0,1.0,6.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.4,0.119,0.3,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.4,4.8,7.9,8.8,7.0,5.9,5.888889,6.4,5.111111,1.6,22.0,1.0,4.0,3.0,5.0,2.0,7.0,1.0,1.0,7.0,8.0,2.0,9.0,5.0,6.0,4.0,7.0,7.0,6.0,8.0,10.0,8.0,9.0,9.0,8.0,1.0,3.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,,,,,,,20.0,16.0,16.0,18.0,16.0,14.0,4.0,7.0,8.0,8.0,3.0,,,,,,1.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.625,,,,,,,,,,,,,10.0,3.0,1.0,,,,,,,20.0,20.0,20.0,20.0,7.5,12.5,,,,,,,,,,,,,3.0,7.0,7.0,8.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Male
13,14,4.0,1.0,8.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.8,-0.057,0.6,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.8,8.2,7.6,7.4,7.6,6.6,7.444444,7.7,5.5,1.6,23.0,1.0,2.0,1.0,1.0,2.0,4.0,1.0,1.0,10.0,6.0,8.0,8.0,3.0,3.0,10.0,8.0,8.0,6.0,7.0,3.0,10.0,6.0,8.0,6.0,1.0,8.0,15.0,30.0,5.0,15.0,40.0,5.0,5.0,,,,,,,20.0,15.0,20.0,20.0,5.0,20.0,9.0,9.0,9.0,9.0,9.0,,,,,,1.0,7.2,8.1,7.8,8.2,8.7,8.3,7.5,7.3,1.6,,,,,,,,,,,,,6.0,3.0,,,,,,,,20.0,14.29,17.14,22.86,14.29,11.43,,,,,,,,,,,,,9.0,9.0,9.0,9.0,9.0,,,,,,1.0,1.0,0.0,,,30.0,10.0,20.0,30.0,5.0,5.0,,,,,,,,,,,,,,,,,,,9.0,9.0,9.0,9.0,9.0,,,,,,Male
14,15,5.0,1.0,10.0,1.0,1.0,10.0,5.5,,5.5,5.5,5.5,0.3,0.052,0.0,23.6,3.0,20.409,14.318,23.227,22.318,10.318,9.409,0.3,5.7,7.6,7.7,7.2,6.5,6.666667,6.3,4.777778,1.7,24.0,1.0,3.0,3.0,1.0,1.0,4.0,1.0,1.0,9.0,7.0,9.0,7.0,4.0,3.0,6.0,7.0,9.0,8.0,6.0,9.0,9.0,6.0,7.0,2.0,1.0,5.0,9.0,30.0,10.0,20.0,10.0,10.0,20.0,,,,,,,20.0,20.0,20.0,20.0,10.0,10.0,7.0,7.0,7.0,9.0,9.0,,,,,,0.9,7.6,8.5,8.8,7.5,8.25,6.75,7.7,4.9,1.8,,,,,,,,,,,,,8.0,2.0,1.0,,,,,,,21.21,18.18,12.12,21.21,6.06,21.21,,,,,,,,,,,,,7.0,7.0,9.0,7.0,9.0,,,,,,3.0,0.0,0.0,,,30.0,10.0,20.0,20.0,5.0,15.0,,,,,,,,,,,,,,,,,,,7.0,7.0,9.0,7.0,9.0,,,,,,Male


In [9]:
nb_women = len(df_iid[df_iid["gender_name"] == "Female"])
nb_men = len(df_iid[df_iid["gender_name"] == "Male"])
print(f"There were {nb_men} male participants and {nb_women} female participants in the study.")


There were 277 male participants and 274 female participants in the study.


In [10]:
fig = px.box(df_iid, x="gender_name", y="age", color="gender_name", color_discrete_sequence=["#F78FE4", "#0460C9"])
fig.show()

In [11]:
type(df_iid[df_iid["gender_name"] == "Female"])

pandas.core.frame.DataFrame

In [12]:
import plotly.figure_factory as ff
import numpy as np

x1= df_iid[df_iid["gender_name"] == "Female"]["age"].dropna().copy()
x2= df_iid[df_iid["gender_name"] == "Male"]["age"].dropna().copy()
x1 = x1.to_numpy()
x2 = x2.to_numpy()

hist_data = [x1, x2]

group_labels = ['Female', 'Male']
colors = ["#F78FE4", "#0460C9"]

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                         bin_size=1, show_rug=False)

# Add title
fig.update_layout(title_text='Hist and Curve Plot')
fig.show()

In [13]:
import plotly.figure_factory as ff
import numpy as np

x1 = np.random.randn(200) - 2
x2 = np.random.randn(200)
x3 = np.random.randn(200) + 2

hist_data = [x1, x2, x3]

group_labels = ['Group 1', 'Group 2', 'Group 3']
colors = ['#A56CC1', '#A6ACEC', '#63F5EF']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                         bin_size=.2, show_rug=False)

# Add title
fig.update_layout(title_text='Hist and Curve Plot')
fig.show()

In [14]:

fig = px.histogram(df_iid, x="age",
             color='gender_name', barmode='group', color_discrete_sequence=["#F78FE4", "#1179f0"])
fig.show()

Education field:

In [15]:
len(df.columns)

195

In [16]:
len(df_iid.columns)

188

The initial dataset contained columns "field" and "field_cd". After groupby, only the numerical columns were left in the new dataframe, so I decided to recreate a column containing the name of the field of study.

In [17]:
df_iid['field_name'] = df_iid["field_cd"].apply(lambda x: "Law" if x == 1 
                                                    else "Math" if x == 2 
                                                    else "Social Science / Psychology" if x == 3 
                                                    else "Medical Science / Pharma / Biotech" if x == 4 
                                                    else "Engineering" if x==5
                                                    else "English / Creative Writing / Journalism" if x == 6 
                                                    else "History / Religion / Philosopy" if x == 7 
                                                    else "Business / Economy / Finance" if x == 8 
                                                    else "Education, Academia" if x==9
                                                    else "Biology / Chemistry / Physics" if x==10
                                                    else "Social Work" if x == 11
                                                    else "Undergrad / Undecided" if x == 12 
                                                    else "Political Science and International Affairs" if x == 13 
                                                    else "Film" if x==14
                                                    else "Fine Arts / Arts Administration" if x == 15 
                                                    else "Languages" if x == 16
                                                    else "Architecture" if x==17 
                                                    else "Other")

In [18]:
import plotly.express as px
fig = px.histogram(df_iid, y="field_name", opacity = 0.75)
fig.show()

In [19]:
import plotly.express as px
fig = px.histogram(df_iid, y="field_name", opacity = 0.75).update_yaxes(categoryorder = "total ascending")
fig.show()

In [20]:
import plotly.express as px
fig = px.histogram(df_iid, y="field_name", color = "field_cd", opacity = 0.75).update_yaxes(categoryorder = "total ascending")
fig.show()

In [21]:
fig = px.histogram(df_iid, y="field_name",
             color='field_name', opacity = 0.75)
fig.show()

To have a more general view of the participants in the study, I wanted to: 
1) Regroup some fields of study chosen by the authors of the experiment into more general categories. I chose to group them according to the system used by the French ONISEP ("Office national d'information sur les enseignements et les professions") so that the categores were more understandable for a reader from France.
2) For each of the fields that were obtained in this manner, percentage of male and female students was calculated.

In [22]:
df_iid['field_name'] = df_iid["field_cd"].apply(lambda x: "Law" if x == 1 
                                                    else "Math" if x == 2 
                                                    else "Social Science / Psychology" if x == 3 
                                                    else "Medical Science / Pharma / Biotech" if x == 4 
                                                    else "Engineering" if x==5
                                                    else "English / Creative Writing / Journalism" if x == 6 
                                                    else "History / Religion / Philosopy" if x == 7 
                                                    else "Business / Economy / Finance" if x == 8 
                                                    else "Education, Academia" if x==9
                                                    else "Biology / Chemistry / Physics" if x==10
                                                    else "Social Work" if x == 11
                                                    else "Undergrad / Undecided" if x == 12 
                                                    else "Political Science and International Affairs" if x==13 
                                                    else "Film" if x==14
                                                    else "Fine Arts / Arts Administration" if x==15 
                                                    else "Languages" if x == 16
                                                    else "Architecture" if x==17 
                                                    else "Other")

In [23]:
df_iid['domain_of_study'] = df_iid["field_cd"].apply(lambda x: "Arts" if x==14 or x==15
                                                    else "Economy and Management" if x==8
                                                    else "Sciences" if x==2 or x==5 or x ==10
                                                    else "Medicine, Pharma, Biotech" if x==4
                                                    else "Education, Academia" if x==9
                                                    else "Law and Political Science" if x==1 or x==13
                                                    else "Literature and Languages" if x==6 or x ==16
                                                    else "Social Sciences, Humanities, Psychology" if x==3 or x==7
                                                    else "Education, Academia" if x==9
                                                    else "Social Work" if x==11
                                                    else "Architecture" if x==17
                                                    else "Undergrad / Undecided" if x==12
                                                    else "Other")

In [25]:
fig = px.histogram(df_iid, y="domain_of_study", opacity = 0.75).update_yaxes(categoryorder = "total ascending")
fig.show()

In [27]:
fig = px.histogram(df_iid, y="domain_of_study", color_discrete_sequence=["#F78FE4", "#1179f0"], opacity = 0.75, color = "gender_name").update_yaxes(categoryorder = "total ascending")
fig.show()

Income:

The authors of the experiment did not ask for income directly but inferred it from the zipcode of the area where the participant grew up. Therefore the income provided in the dataset for each person is in fact the median household income in the area where the person grew up (according to the US Census Bureau data at the time of the experiment).


Motivation:

The authors asked each participant to indicate their goal for participating in the speed dating experiment.

In [None]:
df_iid['participant_goal'] = df_iid["goal"].apply(lambda x: "Seemed like a fun night out" if x == 1 
                                                    else "To meet new people" if x == 2 
                                                    else "To get a date" if x == 3 
                                                    else "Looking for a serious relationship" if x == 4 
                                                    else "To say I did it" if x==5 
                                                    else "Other")

In [None]:
import plotly.express as px
fig = px.histogram(df_iid, y="participant_goal",
             color='gender_name', barmode='group', color_discrete_sequence=["#F78FE4", "#1179f0"], opacity=0.60, histnorm = 'percent')
fig.show()

In [None]:
fig = px.histogram(df_iid, y="participant_goal",
             color='gender_name', barmode='group', color_discrete_sequence=["#F78FE4", "#1179f0"], opacity=0.60, histnorm = 'percent')
fig.show()

Percent successful matches:

In [None]:
df_match=df[["iid", "pid", "match"]]

In [None]:
np.sort(df_match.iloc[:, :3], axis = 1)

array([[  0.,   1.,  11.],
       [  0.,   1.,  12.],
       [  1.,   1.,  13.],
       ...,
       [  0., 528., 552.],
       [  0., 529., 552.],
       [  0., 530., 552.]])

In [None]:
len(np.sort(df_match.iloc[:, :3]))

8378

In [None]:
#np.sort(df_match.iloc[:, :2])

In [None]:
len(df_match)

8378

In [None]:
pd.DataFrame(np.sort(df_match.iloc[:, :3])).tail()

Unnamed: 0,0,1,2
8373,0.0,526.0,552.0
8374,0.0,527.0,552.0
8375,0.0,528.0,552.0
8376,0.0,529.0,552.0
8377,0.0,530.0,552.0


In [None]:
df_match = df_match.loc[~pd.DataFrame(np.sort(df_match.iloc[:, :3])).duplicated()]
#df_match.reset_index(inplace=True)

In [None]:
len(df_match)

4194

In [None]:
df_match.reset_index(drop=True, inplace=True)

In [None]:
len(df_match)

4194

In [None]:
df_match.head()

Unnamed: 0,iid,pid,match
0,1,11.0,0
1,1,12.0,0
2,1,13.0,1
3,1,14.0,1
4,1,15.0,1


In [None]:
df_match.tail()

Unnamed: 0,iid,pid,match
4189,530,548.0,0
4190,530,549.0,0
4191,530,550.0,0
4192,530,551.0,0
4193,530,552.0,0


In [None]:
df_match["match"].unique()

array([0, 1], dtype=int64)

In [None]:
df_match["match"] = df_match["match"].apply(lambda x: "Yes" if x ==1 else "No")

In [None]:
df_match.head(10)

Unnamed: 0,iid,pid,match
0,1,11.0,No
1,1,12.0,No
2,1,13.0,Yes
3,1,14.0,Yes
4,1,15.0,Yes
5,1,16.0,No
6,1,17.0,No
7,1,18.0,No
8,1,19.0,Yes
9,1,20.0,No


In [None]:
df_match.tail()

Unnamed: 0,iid,pid,match
4189,530,548.0,No
4190,530,549.0,No
4191,530,550.0,No
4192,530,551.0,No
4193,530,552.0,No


In [None]:
"""# pie plot
import plotly.express as px
# This dataframe has 244 lines, but 4 distinct values for `day`
df = px.data.tips()
fig = px.pie(df, values='tip', names='day')
fig.show()"""

In [None]:
import plotly.express as px
# This dataframe has 244 lines, but 4 distinct values for `day`
fig = px.pie(df_match, values="match")
fig.show()

Let's explore motivation for dating for men and women who participated in the study.
As the goals in the dataset are coded with numbers, let's create an additional column that will describe the goal explicitly according to the dataset key.

In [None]:
dating_per_person['goal_name'] = dating_per_person["goal"].apply(lambda x: "Seemed like a fun night out" if x == 1 else "To meet new people" if x == 2 else "To get a date" if x == 3 else "Looking for a serious relationship" if x == 4 else "To say I did it" if x==5 else "Other" if x==6 else None)
#dating_per_person.head(15)

NameError: name 'dating_per_person' is not defined

Let's represent different goals by a separate pie chart for both genders.

In [None]:
dating_females = dating_per_person.loc[dating_per_person['gender_name'] == 'Female']
#display(dating_females.head())
females_by_goal = dating_females["goal_name"].value_counts(dropna=False)
#display(females_by_goal)

females_by_goal_array = females_by_goal.to_numpy()


labels = 'Seemed like a fun night out', 'To meet new people', 'To get a date', 'Other', 'To say I did it', 'Looking for a serious relationship', 'N/A'
sizes = females_by_goal_array

plt.pie(sizes, labels=labels, autopct='%1.1f%%')
#plt.legend(bbox_to_anchor=(1, 1))

plt.show()


In [None]:
dating_males = dating_per_person.loc[dating_per_person['gender_name'] == 'Male']
#display(dating_males.head())
males_by_goal = dating_males["goal_name"].value_counts(dropna=False)
#display(males_by_goal)

males_by_goal_array = males_by_goal.to_numpy()

labels = 'Seemed like a fun night out', 'To meet new people', 'To get a date', 'Other', 'To say I did it', 'Looking for a serious relationship', 'N/A'
sizes = males_by_goal_array

plt.pie(sizes, labels=labels, autopct='%1.1f%%')
#plt.legend(bbox_to_anchor=(1, 1))

plt.show()


In [None]:
import numpy as np
females_by_goal_array = females_by_goal.to_numpy()

labels = 'Seemed like a fun night out', 'To meet new people', 'To get a date', 'Other', 'To say I did it', 'Looking for a serious relationship', 'N/A'
sizes = females_by_goal_array

plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.legend(bbox_to_anchor=(1, 1))


plt.show()

In [None]:
labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]

plt.pie(sizes, labels=labels, autopct='%1.1f%%')

plt.show()

In [None]:
#dating_per_person['note_method'] = dating_per_person["wave"].apply(lambda x: '10_scale' if x == 6.0 or 7.0 or 8.0 or 9.0 else "100_point_alloc")
dating_per_person['note_method'] = dating_per_person["wave"].apply(lambda x: '10_scale' if x == 6 or x == 7 or x == 8 or x == 9 else "100_point_alloc")

In [None]:
dating_per_person.head()

In [None]:
dating_per_person_10_scale_mask = dating_per_person["note_method"]=="10_scale"
dating_per_person_10_scale = dating_per_person[dating_per_person_10_scale_mask].reset_index(drop=False)
dating_per_person_10_scale.head()

In [None]:
dating_per_person_100_point_mask = dating_per_person["note_method"]=="100_point_alloc"
dating_per_person_100_point = dating_per_person[dating_per_person_100_point_mask].reset_index(drop=False)
dating_per_person_100_point.head()

In [None]:
# creating a dataframe that contaings estimates of criteria that are important in opposite sex:
dating_per_person_100_point = dating_per_person_100_point.groupby("gender_name").mean().reset_index(drop=False)
dating_per_person_100_point_by_gender = dating_per_person_100_point[["gender_name",'iid', 'match', 'attr1_1', 'sinc1_1',
       'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'attr1_s', 'sinc1_s',
       'intel1_s', 'fun1_s', 'amb1_s', 'shar1_s', 'attr1_2', 'sinc1_2', 'intel1_2', 'fun1_2', 'amb1_2', 'shar1_2',
       'attr1_3', 'sinc1_3', 'intel1_3', 'fun1_3', 'amb1_3', 'shar1_3']]
dating_per_person_100_point_by_gender

In [None]:
a = 23.737295 + 17.284344 + 20.490410 + 17.266721 + 10.174208 + 11.235574
print(a)