# Preparing the Speed Dating Dataset

As an entrepreneur, you are planning to launch a new dating app. The key feature that will differentiate your app from its competitors is a high-performing user-matching algorithm. Before building this model, you partnered with a speed dating company to collect data from real events. You have just received the dataset from your partner company, but you realize it is not as clean as you expected: there are missing and incorrect values. Your task is to fix the main data quality issues in this dataset.

In [99]:
import pandas as pd
# Load the raw speed dating data into a DataFrame.
# Forward slashes are used because backslashes in a plain string literal form
# invalid escape sequences ("\d", "\S") and only resolve as a path on Windows.
df = pd.read_csv('../dataset/Speed_Dating_Data.csv')
# Display the DataFrame as the cell output
df

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,22.0,1,44,2,21,22,14,10.0,5,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8374,552,22.0,1,44,2,21,22,13,10.0,4,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8375,552,22.0,1,44,2,21,22,19,10.0,10,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8376,552,22.0,1,44,2,21,22,3,10.0,16,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0


In [100]:
# Show the (rows, columns) dimensions of the DataFrame
df.shape

(8378, 195)

In [101]:
# Check for duplicate rows: count the rows that repeat an earlier row across all columns
df.duplicated().sum()

0

In [102]:
# Check for duplicate rows using only the identifier columns (iid, id, partner, and pid)
df.duplicated(subset=['iid', 'id', 'partner', 'pid']).sum()

0

In [103]:
# Print the observed minimum and maximum of each of these numerical variables
# so that unexpected (out-of-range) values stand out
cols_to_check = [
    'imprace', 'imprelig', 'sports', 'tvsports', 'exercise', 'dining',
    'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv',
    'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga',
]
for name in cols_to_check:
    values = df[name]
    print(name)
    print(f"min: {values.min()}, max: {values.max()}")

imprace
min: 0.0, max: 10.0
imprelig
min: 1.0, max: 10.0
sports
min: 1.0, max: 10.0
tvsports
min: 1.0, max: 10.0
exercise
min: 1.0, max: 10.0
dining
min: 1.0, max: 10.0
museums
min: 0.0, max: 10.0
art
min: 0.0, max: 10.0
hiking
min: 0.0, max: 10.0
gaming
min: 0.0, max: 14.0
clubbing
min: 0.0, max: 10.0
reading
min: 1.0, max: 13.0
tv
min: 1.0, max: 10.0
theater
min: 0.0, max: 10.0
movies
min: 0.0, max: 10.0
concerts
min: 0.0, max: 10.0
music
min: 1.0, max: 10.0
shopping
min: 1.0, max: 10.0
yoga
min: 0.0, max: 10.0


In [104]:
# Replace the identified incorrect values.
# The valid range is 1-10: values above 10 are capped at 10 and values below 1
# are raised to 1. Values already inside the range and missing values are
# left unchanged.
for col in cols_to_check:
    df[col] = df[col].clip(lower=1, upper=10)

In [105]:
# Print the minimum and maximum of each checked column again
for checked in cols_to_check:
    lowest, highest = df[checked].min(), df[checked].max()
    print(checked)
    print(f"min: {lowest}, max: {highest}")

imprace
min: 1.0, max: 10.0
imprelig
min: 1.0, max: 10.0
sports
min: 1.0, max: 10.0
tvsports
min: 1.0, max: 10.0
exercise
min: 1.0, max: 10.0
dining
min: 1.0, max: 10.0
museums
min: 1.0, max: 10.0
art
min: 1.0, max: 10.0
hiking
min: 1.0, max: 10.0
gaming
min: 1.0, max: 10.0
clubbing
min: 1.0, max: 10.0
reading
min: 1.0, max: 10.0
tv
min: 1.0, max: 10.0
theater
min: 1.0, max: 10.0
movies
min: 1.0, max: 10.0
concerts
min: 1.0, max: 10.0
music
min: 1.0, max: 10.0
shopping
min: 1.0, max: 10.0
yoga
min: 1.0, max: 10.0


In [106]:
# Check the data type of each column
df.dtypes

iid           int64
id          float64
gender        int64
idg           int64
condtn        int64
             ...   
attr5_3     float64
sinc5_3     float64
intel5_3    float64
fun5_3      float64
amb5_3      float64
Length: 195, dtype: object

In [107]:
# Print one "column: dtype" line per column of the DataFrame
for name, dtype in df.dtypes.items():
    print(f"{name}: {dtype}")

iid: int64
id: float64
gender: int64
idg: int64
condtn: int64
wave: int64
round: int64
position: int64
positin1: float64
order: int64
partner: int64
pid: float64
match: int64
int_corr: float64
samerace: int64
age_o: float64
race_o: float64
pf_o_att: float64
pf_o_sin: float64
pf_o_int: float64
pf_o_fun: float64
pf_o_amb: float64
pf_o_sha: float64
dec_o: int64
attr_o: float64
sinc_o: float64
intel_o: float64
fun_o: float64
amb_o: float64
shar_o: float64
like_o: float64
prob_o: float64
met_o: float64
age: float64
field: object
field_cd: float64
undergra: object
mn_sat: float64
tuition: float64
race: float64
imprace: float64
imprelig: float64
from: object
zipcode: float64
income: float64
goal: float64
date: float64
go_out: float64
career: object
career_c: float64
sports: float64
tvsports: float64
exercise: float64
dining: float64
museums: float64
art: float64
hiking: float64
gaming: float64
clubbing: float64
reading: float64
tv: float64
theater: float64
movies: float64
concerts: float64
mu

In [108]:
# Every column that is not listed as numerical is treated as categorical;
# collect those column names so their data type can be changed
num_cols = [
    'round', 'order', 'int_corr', 'age',
    'mn_sat', 'income', 'expnum',
]  # numerical columns
cat_cols = df.drop(columns=num_cols).columns
cat_cols

Index(['iid', 'id', 'gender', 'idg', 'condtn', 'wave', 'position', 'positin1',
       'partner', 'pid',
       ...
       'attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3', 'attr5_3',
       'sinc5_3', 'intel5_3', 'fun5_3', 'amb5_3'],
      dtype='object', length=188)

In [109]:
# Convert each non-numerical column to the pandas 'category' dtype
for name in cat_cols:
    as_category = df[name].astype('category')
    df[name] = as_category

In [110]:
# Print the dtype of every column again to verify the conversion
for name in df.columns:
    dtype = df[name].dtype
    print(f"{name}: {dtype}")

iid: category
id: category
gender: category
idg: category
condtn: category
wave: category
round: int64
position: category
positin1: category
order: int64
partner: category
pid: category
match: category
int_corr: float64
samerace: category
age_o: category
race_o: category
pf_o_att: category
pf_o_sin: category
pf_o_int: category
pf_o_fun: category
pf_o_amb: category
pf_o_sha: category
dec_o: category
attr_o: category
sinc_o: category
intel_o: category
fun_o: category
amb_o: category
shar_o: category
like_o: category
prob_o: category
met_o: category
age: float64
field: category
field_cd: category
undergra: category
mn_sat: float64
tuition: category
race: category
imprace: category
imprelig: category
from: category
zipcode: category
income: float64
goal: category
date: category
go_out: category
career: category
career_c: category
sports: category
tvsports: category
exercise: category
dining: category
museums: category
art: category
hiking: category
gaming: category
clubbing: category
readi

In [111]:
# Check for any missing values: count the NaN entries in each numerical variable
df[num_cols].isna().sum()

round          0
order          0
int_corr     158
age           95
mn_sat      5245
income      4099
expnum      6578
dtype: int64

In [112]:
# Inspect the distinct values of each numerical variable before deciding
# whether its missing values should be replaced with the mean or the median
for name in num_cols:
    distinct = df[name].unique()
    print(name)
    print(distinct)

round
[10 16 19 18  5 20  9 21 14  8  6 11 15  7 22]
order
[ 4  3 10  5  7  6  1  2  8  9 11 15 12 16 13 14 18 17 19 20 21 22]
int_corr
[ 0.14  0.54  0.16  0.61  0.21  0.25  0.34  0.5   0.28 -0.36  0.29  0.18
  0.1  -0.21  0.32  0.73  0.6   0.07  0.11  0.39 -0.24 -0.14  0.09 -0.04
 -0.3  -0.26 -0.15 -0.47 -0.18  0.05  0.37  0.35  0.15 -0.19 -0.43  0.
 -0.17  0.08 -0.16  0.06 -0.05 -0.13 -0.06  0.33 -0.51  0.12  0.19  0.47
  0.03  0.46  0.43  0.52 -0.46 -0.27  0.59  0.31 -0.34 -0.03 -0.11  0.42
 -0.4  -0.23  0.17  0.68 -0.01 -0.35  0.3   0.65  0.24  0.41  0.49  0.01
  0.22 -0.08  0.27  0.44  0.62 -0.2  -0.02 -0.33 -0.52 -0.1   0.58 -0.57
 -0.31 -0.07 -0.32  0.04 -0.12  0.48 -0.22 -0.29  0.38  0.53 -0.38  0.02
 -0.28  0.13  0.2    nan -0.41 -0.44  0.51 -0.48  0.4   0.26  0.77 -0.49
 -0.25 -0.09  0.45 -0.39  0.83  0.57 -0.61  0.72 -0.37  0.23 -0.58  0.8
 -0.56  0.63 -0.63  0.71  0.36  0.56  0.55  0.76  0.69  0.79  0.9   0.67
  0.66  0.81  0.64  0.74  0.75  0.85 -0.42 -0.5  -0.59  0.7   0.

In [113]:
# The values of the int_corr column range between -1 and 1. It seems like they have been normalized.
# Since there are no extreme values or outliers, you can impute the missing values with the mean of this variable.
m = df['int_corr'].mean()
# Assign the filled Series back to the column: calling fillna(inplace=True) on
# df['int_corr'] is chained assignment, which pandas deprecates and which does
# not update df under copy-on-write.
df['int_corr'] = df['int_corr'].fillna(m)

In [114]:
# Other columns haven't been normalized and some of them have outliers.
# This time, you are going to need to use their medians to fill in the missing values.
missing_num_cols = ['age', 'mn_sat', 'income', 'expnum']
for col in missing_num_cols:
    m = df[col].median()
    # Assign the filled Series back to the column: calling fillna(inplace=True)
    # on df[col] is chained assignment, which pandas deprecates and which does
    # not update df under copy-on-write.
    df[col] = df[col].fillna(m)

In [115]:
# Count the missing values remaining in each numerical variable after imputation
df[num_cols].isna().sum()

round       0
order       0
int_corr    0
age         0
mn_sat      0
income      0
expnum      0
dtype: int64