In [None]:
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm

from IPython.display import display, HTML

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

# Let wide DataFrames render up to 500 columns instead of being truncated.
pd.set_option('display.max_columns', 500)

In [None]:
# Raw data is read from a CSV that sits next to the notebook (relative path).
filepath = "./dating.csv"
# The file is decoded as ISO-8859-1; df_full is the unmodified raw frame that
# later cells copy from.
df_full = pd.read_csv(filepath, encoding = "ISO-8859-1")

In [None]:
def write_features_to_file(features, file_name="current_features.txt"):
    """Dump feature names to a text file, one name per line.

    Args:
        features: Iterable of strings (for example a list of column names).
        file_name: Destination path. The file is opened in 'w' mode, so any
            previous content is replaced.
    """
    with open(file_name, 'w') as out:
        out.writelines(feat + '\n' for feat in features)

In [None]:
def check_missing_col_value(col_name):
    """Print which participants have `col_name` both missing and filled in.

    Reads the module-level `df_clean` frame. An `iid` is reported when it
    appears on at least one row where `col_name` is null and on at least one
    row where it is not null.

    Args:
        col_name: Name of the column of `df_clean` to inspect.
    """
    print('-' * 75)
    is_null = df_clean[col_name].isnull()
    ids_with_gap = set(df_clean.loc[is_null, "iid"])
    ids_with_value = set(df_clean.loc[~is_null, "iid"])

    # keep only the ids that show up on both sides
    ids = [iid for iid in ids_with_gap if iid in ids_with_value]

    if ids:
        print(f"Following IDs have a value for column {col_name}: {ids}")
    else:
        print(f"No values for {col_name}")
    

In [None]:
def add_missing_data(df):
    """Fill known gaps and cap out-of-range values with hand-picked constants.

    The frame passed in is modified in place and a copy of the patched frame
    is returned.

    Every write goes through a single ``df.loc[mask, column] = value`` call
    rather than chained ``df[column].loc[mask] = value`` indexing, so the
    assignment always lands on ``df`` itself.

    Args:
        df: DataFrame containing the columns id, pid, iid, career, career_c,
            field, field_cd, met, met_o, gaming, reading, fun1_1, amb1_1,
            shar1_1, amb2_1 and shar2_1.

    Returns:
        A copy of the patched DataFrame.
    """
    # NOTE(review): global side effect kept for backward compatibility (it
    # silences pandas' chained-assignment warning for the whole session);
    # this function itself no longer needs it.
    pd.options.mode.chained_assignment = None  # default='warn'

    # missing ID of a single person
    df.loc[df['id'].isnull(), 'id'] = 22

    # pid is missing for the same person in wave 5
    df.loc[df['pid'].isnull(), 'pid'] = 118

    # adding missing code for lawyer and law
    # (only rows WITHOUT a code are touched; existing codes are left alone)
    df.loc[df['career_c'].isnull() & df['career'].isin(['lawyer', 'law']), 'career_c'] = 1

    # adding those careers to 'other' because only 1 person have them
    df.loc[
        df['career_c'].isnull() & df['career'].isin(['tech professional', 'Economist']),
        'career_c',
    ] = 15

    # add field value based on other participants with same field
    df.loc[df['field_cd'].isnull() & (df['field'] == "Operations Research"), 'field_cd'] = 5

    # as suggested, making met and met_o a three-valued feature and fixing
    # weird values: missing -> 0 first, then anything above 2 is capped at 2
    for col in ('met', 'met_o'):
        df.loc[df[col].isnull(), col] = 0
    for col in ('met', 'met_o'):
        df.loc[df[col] > 2, col] = 2

    # reading and gaming have values above 10, so we make them 10
    for col in ('gaming', 'reading'):
        df.loc[df[col] > 10, col] = 10

    # these should be 0 because no points were left for the allocation of 100pt.
    df.loc[df['iid'] == 130, ['fun1_1', 'amb1_1', 'shar1_1']] = 0

    # these should be 0 because no points were left for the allocation of 100pt.
    for attr in ('amb2_1', 'shar2_1', 'shar1_1'):
        df.loc[df[attr].isnull(), attr] = 0

    return df.copy()

In [None]:
def remove_rows(df_clean):
    """Drop, in place, every row that involves a participant with unusable data.

    A participant is flagged when their own `age`/`race` is missing (looked up
    through `iid`) or when the partner-side `age_o`/`race_o` is missing
    (looked up through `pid`). Three hard-coded ids are flagged as well. All
    rows where a flagged id appears as `iid` or as `pid` are removed from
    `df_clean`. Progress is printed; nothing is returned.

    Args:
        df_clean: DataFrame with columns iid, pid, age, age_o, race, race_o.
            It is mutated in place.
    """
    ### Alternative is to use mean age / mean race?
    flagged = set()
    for col in ('age', 'race'):
        # own value missing -> id comes from iid; partner value missing -> from pid
        flagged.update(df_clean.loc[df_clean[col].isna(), 'iid'].values)
        flagged.update(df_clean.loc[df_clean[col + '_o'].isna(), 'pid'].values)
    mis_id = list(flagged)

    print('Missing ages', len(mis_id))
    print(f'Original number of participants: {df_clean.shape[0]}')

    # 28: another respondent with a lot of missing values
    # 414, 416: missing 3_1 values
    mis_id.extend([28, 414, 416])

    # remove participants with missing data, whichever side of the date they are on
    involved = df_clean.iid.isin(mis_id) | df_clean.pid.isin(mis_id)
    df_clean.drop(df_clean[involved].index, inplace=True)

    print(f"Dropping {len(mis_id)} participants")
    print(f"New number of participants: {df_clean.shape[0]}")
    print(f"Dropped shape: {df_clean.shape}")
    print('-' * 50)

In [None]:
def replace_nan_by_mean(df, col):
    """Fill the missing entries of `col` with the column's most frequent value.

    Despite the name, the fill value is the mode, not the mean. When several
    values tie for most frequent, one of them is picked with `random.choice`
    (seed the `random` module for a reproducible pick). A column with no
    non-null values is left untouched because there is nothing to impute from.

    The frame passed in is modified in place and a copy is returned.

    Args:
        df: DataFrame to patch.
        col: Name of the column whose nulls are filled.

    Returns:
        A copy of the patched DataFrame.
    """
    ## maybe take mean from wave, gender and not overall??
    modes = df[col].mode()
    if modes.empty:
        # all values are null: no mode exists
        return df.copy()

    # mode() returns every tied value; resolve a tie to one scalar
    if len(modes) > 1:
        fill_value = random.choice(modes.tolist())
    else:
        fill_value = modes.iloc[0]

    df.loc[df[col].isnull(), col] = fill_value

    return df.copy()


In [None]:
def add_missing_attrs_by_mean(df, cols, filter_by_match=False):
    """Fill missing values of each column in `cols` from the partner's `<col>_o` values.

    For every `pid` that appears on a row where `col` is null, the fill value
    is the mean of `<col>_o` over the rows whose `iid` equals that `pid`,
    rounded to the nearest integer with `np.rint`. The value is written to the
    rows with that `pid` whose `col` is still null. If no such `<col>_o`
    values exist the mean is NaN and the gap remains.

    All masks are built from `df` itself, so the function works on any frame
    passed in. The frame is modified in place and a copy is returned.

    Args:
        df: DataFrame with columns iid, pid and, for each entry of `cols`,
            both `<col>` and `<col>_o`.
        cols: Column names to fill.
        filter_by_match: When True the fill value is the constant 0; the
            match-based lookup is not implemented yet.

    Returns:
        A copy of the patched DataFrame.
    """
    for col in cols:
        missing_pids = set(df.loc[df[col].isnull(), 'pid'])

        for pid in missing_pids:
            if filter_by_match:
                mean_val = 0
                ## check if a match is available if not go to else block?
            else:
                mean_val = np.rint(df.loc[df['iid'] == pid, col + '_o'].mean())
            df.loc[(df['pid'] == pid) & df[col].isnull(), col] = mean_val

    return df.copy()


In [None]:
print(f"Original shape: {df_full.shape}")

# Work on a copy of the raw frame and patch the known gaps first.
df_clean = add_missing_data(df_full.copy())

# Column groups referenced by the cleaning and modelling cells.
interests = [
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking',
    'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
    'music', 'shopping', 'yoga',
]
attr_list = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']
attr_list_ex = ['like', 'prob', 'met']

attr1_list = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
attr2_list = ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1']
attr3_list = ['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']

# Must run before the `_o` columns are dropped below: the fill values are
# computed from `<col>_o`.
df_clean = add_missing_attrs_by_mean(df_clean, attr_list + attr_list_ex)

unwished_cols = [
    'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha',
    'attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o', 'prob_o', 'match', 'dec', 'like_o',
    # post date variables + unrelated date variables
    'length', 'you_call', 'them_cal', 'match',
    # variables which have missing values that can not be replaced
    'positin1', 'undergra', 'mn_sat', 'tuition', 'zipcode', 'income', 'expnum', 'match_es',
    # not needed since they are represented by number values + missing values were added
    'career', 'field',
]

# Columns selected by substring of their name, one group per reason:
#   '1_s' / '3_s' : asked mid session, not clear when so we drop it
#   '_2', '_3'    : asked after the session
#   '4_1'         : missed in wave 6-21 which makes 21%
#   '5_1'         : missed in wave 1-9 which makes 40%
for marker_group in (('1_s', '3_s'), ('_2',), ('_3',), ('4_1',), ('5_1',)):
    unwished_cols += [
        col for col in df_full.columns
        if any(marker in col for marker in marker_group)
    ]

print(f"Dropping {len(unwished_cols)} columns")

df_clean.drop(columns=unwished_cols, inplace=True)

print(f"Dropped shape: {df_clean.shape}")
print('-' * 50)

# Mutates df_clean in place (no return value).
remove_rows(df_clean)


# Columns whose remaining gaps are filled with the column's most frequent value.
cols = ['career_c', 'date']
for col in cols:
    df_clean = replace_nan_by_mean(df_clean, col)

# Report every column that still has gaps; the percentage is relative to the
# row count of the raw frame (df_full), not of df_clean.
null_counts = df_clean.isnull().sum()
col_with_miss_values = []
for col in null_counts.index:
    mis = null_counts[col]
    if mis > 0:
        col_with_miss_values.append(col)
        print("{}: {} missing, {}%".format(col, mis, round(mis/df_full.shape[0] * 100, 3)))




# Diagnostic on the raw frame: for every row where one of the rating columns
# is null, look for the mirrored row (iid and pid swapped) that has a non-null
# `<col>_o` value, and print True once per hit.
for col in attr_list:
    ids = df_full[df_full[col].isnull()][['iid', 'pid']]
    for pid, iid in zip(ids['pid'], ids['iid']):
        w = df_full[(df_full.iid==pid) & (df_full.pid==iid) & (df_full[col+'_o'].notnull())][['iid', 'pid', 'attr', 'attr_o']]
        if len(w) > 0:
            print(True)

# A non-empty list means some columns still contain nulls.
if col_with_miss_values:
    print("Following columns have missing values")
    print(col_with_miss_values)
else:
    print("There are no missing values")


unwished_cols = ['from', 'iid', 'id', 'idg', 'condtn', 'partner', 'pid', 'round'] # rest unnecessary columns

df_clean.drop(unwished_cols, axis=1, inplace=True)

In [None]:
# NOTE(review): the disabled debug view below selects 'iid' and 'pid', which
# the preceding cell drops from df_clean, so it would fail if re-enabled at
# this point. TODO: confirm and remove.
#display(HTML(df_clean[(df_clean.attr.isnull())][['iid', 'pid', 'wave']+attr_list_ex].to_html()))

# Persist the remaining column names, one per line, to the helper's default
# output file.
write_features_to_file(df_clean.columns.tolist())

In [None]:
# Author's notes per feature group (meaning and units are not stated;
# TODO confirm what these figures refer to):
#attr1 - 0,00X
#attr2 - 0,00X
#attr3 - 0,0X
#attr - 0,0X

rlist = ['int_corr', 'age', 'age_o', 'race', 'race_o', 'met', 'met_o']

X_ols = df_clean[rlist + attr_list]
y_ols = df_clean.dec_o

# sm.OLS fits exactly the design matrix it is given and does not add an
# intercept, so the constant column is added explicitly; X_ols itself is left
# unchanged. missing='drop' skips rows that still contain NaN instead of
# failing on them.
traits = sm.OLS(y_ols, sm.add_constant(X_ols), missing='drop')

results_traits = traits.fit()
results_traits.summary()

In [None]:
# preparing the data: predictors are the attr3_list columns plus rlist,
# the target is dec_o
y = df_clean.dec_o
X = df_clean[attr3_list + rlist]

# 80/20 split, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# logistic regression classification model
model = LogisticRegression(C=1, random_state=0)
lrc = model.fit(X_train, y_train)

predict_train_lrc = lrc.predict(X_train)
predict_test_lrc = lrc.predict(X_test)

# the second figure is computed on the held-out X_test / y_test split
for label, y_true, y_pred in (
    ('Training Accuracy:', y_train, predict_train_lrc),
    ('Validation Accuracy:', y_test, predict_test_lrc),
):
    print(label, metrics.accuracy_score(y_true, y_pred))

In [None]:
# List the columns that are left in df_clean at this point.
print(df_clean.columns.tolist())

## TODO
def pre_process_data(df):
    """Placeholder for a preprocessing step; does nothing and returns None.

    Open questions noted for the implementation:
      * scale everything to the 0-1 range?
      * convert to int, many columns are floats

    Args:
        df: DataFrame to preprocess (currently unused).
    """
    return None