In [176]:
import pandas as pd
import numpy as np

# Load the speed-dating CSV; its first column is used as the row index.
df = pd.read_csv(
    'data/speed_dating_transformed.csv',
    index_col=0,
    encoding='latin-1',
)
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,match,exphappy,samerace,hobby_diff_phys,hobby_diff_out,hobby_diff_in,same_goal,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,income_diff,age_diff,same_career,confidence,imprace,date_freq,out_freq
0,0,0.222222,0,0.291667,0.6,0.8,0,0.571036,0.597222,0.506579,0.504952,0.652184,0.634998,0.545455,0,0.00625,0.421053,1.0,0.0
1,0,0.222222,0,0.291667,0.533333,0.472727,0,0.958946,0.486111,0.348684,1.0,0.498729,1.0,0.090909,0,0.0275,0.105263,1.0,0.0
2,1,0.222222,1,0.375,0.5,0.581818,1,0.377081,0.597222,0.480263,0.414916,0.728912,0.411073,0.090909,0,0.0075,0.210526,1.0,0.0
3,1,0.222222,0,0.25,0.366667,0.509091,1,0.474059,0.347222,0.348684,1.0,0.30691,0.237391,0.181818,0,0.02125,0.105263,1.0,0.0
4,1,0.222222,0,0.083333,0.833333,0.654545,0,0.506384,0.375,0.480263,0.204832,0.345274,0.801737,0.272727,0,0.01375,0.210526,1.0,0.0


In [177]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8346 entries, 0 to 8345
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   match            8346 non-null   int64  
 1   exphappy         8346 non-null   float64
 2   samerace         8346 non-null   int64  
 3   hobby_diff_phys  8346 non-null   float64
 4   hobby_diff_out   8346 non-null   float64
 5   hobby_diff_in    8346 non-null   float64
 6   same_goal        8346 non-null   int64  
 7   attr_diff        8346 non-null   float64
 8   sinc_diff        8346 non-null   float64
 9   intel_diff       8346 non-null   float64
 10  fun_diff         8346 non-null   float64
 11  amb_diff         8346 non-null   float64
 12  income_diff      8346 non-null   float64
 13  age_diff         8346 non-null   float64
 14  same_career      8346 non-null   int64  
 15  confidence       8346 non-null   float64
 16  imprace          8346 non-null   float64
 17  date_freq     

In [178]:
# calculating entropy (H)

def entropy(feature, dataset):
    """Return the Shannon entropy (in bits) of one column of a DataFrame.

    Parameters
    ----------
    feature : label
        Name of the column whose value distribution is measured.
    dataset : pandas.DataFrame
        Frame that contains ``feature``.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the distinct non-missing values of the
        column. Missing values (NaN) are ignored, so probabilities are
        relative to the non-missing rows. A column with no non-missing
        values, or with a single distinct value, yields 0.0.
    """
    # One pass over the column; value_counts drops NaN by default, which avoids
    # the 0 * log2(0) = NaN term that a zero-count NaN level would produce.
    probs = dataset[feature].value_counts(normalize=True).to_numpy()

    # Categorical columns can report unobserved categories with probability 0;
    # they contribute nothing to the entropy and log2(0) must be avoided.
    probs = probs[probs > 0]

    # Adding 0.0 turns a possible -0.0 (single-valued column) into +0.0.
    return -np.sum(probs * np.log2(probs)) + 0.0


In [179]:
# calculating gini index (Gini)

def gini(feature, dataset):
    """Return the Gini impurity of one column of a DataFrame.

    Parameters
    ----------
    feature : label
        Name of the column whose value distribution is measured.
    dataset : pandas.DataFrame
        Frame that contains ``feature``.

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the distinct non-missing values of the
        column. Missing values (NaN) are ignored, so the probabilities sum
        to 1 over the non-missing rows. A column with no non-missing values
        is treated as pure and yields 0.0.
    """
    # One pass over the column; value_counts drops NaN by default so NaN rows
    # no longer dilute the class probabilities.
    probs = dataset[feature].value_counts(normalize=True).to_numpy()

    if probs.size == 0:
        # Nothing to measure: report no impurity instead of the maximum.
        return 0.0

    return 1 - (probs ** 2).sum()


In [180]:
# finding impurity using entropy or gini 

def find_measure_impurity(feature, measure, dataset):
    """Compute the impurity of a column using the requested measure.

    Parameters
    ----------
    feature : label
        Column of ``dataset`` to measure.
    measure : str
        ``'entropy'`` to use ``entropy`` or ``'gini'`` to use ``gini``.
    dataset : pandas.DataFrame
        Frame that contains ``feature``.

    Returns
    -------
    The value returned by the selected impurity function.

    Raises
    ------
    ValueError
        If ``measure`` is neither ``'entropy'`` nor ``'gini'``.
    """
    if measure == 'entropy':
        return entropy(feature, dataset)
    if measure == 'gini':
        return gini(feature, dataset)

    # Fail with a clear message instead of an UnboundLocalError further down.
    raise ValueError(
        "Unknown impurity measure {!r}; expected 'entropy' or 'gini'".format(measure)
    )

In [181]:
# calculating information gain (IG)

def IG(feature, target, dataset, measure):
    """Return the information gain of ``feature`` with respect to ``target``.

    The gain is the impurity of ``target`` over the whole ``dataset`` minus the
    weighted impurity of ``target`` inside each partition obtained by splitting
    ``dataset`` on the distinct values of ``feature``. ``measure`` is forwarded
    to ``find_measure_impurity`` to choose the impurity function.
    """
    # impurity of the target before any split
    baseline = find_measure_impurity(target, measure, dataset)

    # one partition per distinct level of the feature, in order of appearance
    partitions = [
        dataset[dataset[feature] == level]
        for level in dataset[feature].unique()
    ]

    # impurity of the target inside each partition and the share of rows it holds
    impurities = np.array(
        [find_measure_impurity(target, measure, part) for part in partitions]
    )
    weights = np.array([len(part) / len(dataset) for part in partitions])

    # weighted impurity that remains after the split
    remainder = np.sum(impurities * weights)

    return baseline - remainder


In [182]:
# calculating gain ratio

def GR(feature, target, dataset, measure):
    """Return the gain ratio of ``feature`` with respect to ``target``.

    The gain ratio is the information gain (computed by ``IG`` with the given
    ``measure``) divided by the entropy of ``feature`` itself.

    Returns 0.0 when the entropy of ``feature`` is zero (for example when the
    feature has a single distinct value): such a feature cannot split the data,
    and dividing by zero would yield NaN/inf, which breaks any ranking built on
    the result.
    """
    entr = entropy(feature, dataset)
    info_gain = IG(feature, target, dataset, measure)

    if entr == 0:
        # No split information: define the ratio as 0 rather than 0/0.
        return 0.0

    return info_gain / entr


In [183]:
# finding the gain metric (information gain or gain ratio)
def find_gain_metric(feature, target, dataset, measure, gain):
    """Compute the requested gain metric of ``feature`` with respect to ``target``.

    Parameters
    ----------
    feature, target : label
        Columns of ``dataset``.
    dataset : pandas.DataFrame
        Frame that contains both columns.
    measure : str
        Impurity measure forwarded to the gain function.
    gain : str
        ``'IG'`` for information gain (``IG``) or ``'GR'`` for gain ratio (``GR``).

    Returns
    -------
    The value returned by the selected gain function.

    Raises
    ------
    ValueError
        If ``gain`` is neither ``'IG'`` nor ``'GR'``.
    """
    if gain == 'IG':
        return IG(feature, target, dataset, measure)
    if gain == 'GR':
        return GR(feature, target, dataset, measure)

    # Fail with a clear message instead of an UnboundLocalError further down.
    raise ValueError(
        "Unknown gain metric {!r}; expected 'IG' or 'GR'".format(gain)
    )


In [184]:
# Using IUFS (impurity-based univariate feature selection), which will select the most informative features with 
# a univariate filter feature selection schema

def IUFS(target, dataset, k, measure='entropy', gain='IG'):
    """Impurity-based univariate feature selection.

    Scores every column of ``dataset`` except ``target`` with
    ``find_gain_metric`` and returns the names of the ``k`` best-scoring ones.

    Parameters
    ----------
    target : label
        Target column; it is excluded from the candidates.
    dataset : pandas.DataFrame
        Frame holding the candidate features and the target.
    k : int
        Number of features to return. Values <= 0 give an empty list; values
        larger than the number of candidates return all of them.
    measure : str, default 'entropy'
        Impurity measure forwarded to ``find_gain_metric``.
    gain : str, default 'IG'
        Gain metric forwarded to ``find_gain_metric``.

    Returns
    -------
    list
        Feature names ordered from highest to lowest score. Ties keep the
        column order of ``dataset``; features whose score is NaN rank last.
    """
    # score each candidate feature
    feature_gains = {
        col: find_gain_metric(col, target, dataset, measure, gain)
        for col in dataset.drop(columns=target).columns
    }

    def _rank_key(item):
        score = item[1]
        # NaN compares False against everything, which would leave sorted()
        # with an undefined order; push such features to the end instead.
        return float('-inf') if np.isnan(score) else score

    # sorted() is stable, so equal scores keep their original column order
    ranked = sorted(feature_gains.items(), key=_rank_key, reverse=True)

    if k <= 0:
        return []
    return [name for name, _ in ranked[:k]]


In [185]:
# categorical features list ('match' is the target passed to IUFS below)
cat_features = ['samerace', 'same_goal', 'same_career', 'match']

# creating a dataframe using categorical features: select the columns in one
# step (same column order as cat_features); .copy() keeps cat_df independent of df
cat_df = df[cat_features].copy()
cat_df

Unnamed: 0,samerace,same_goal,same_career,match
0,0,0,0,0
1,0,0,0,0
2,1,1,0,1
3,0,1,0,1
4,0,0,0,1
...,...,...,...,...
8341,0,1,0,0
8342,0,1,0,0
8343,0,0,0,0
8344,0,0,0,0


In [186]:
# descriptive features list
desc_features = ['exphappy', 'samerace', 'hobby_diff_phys', 'hobby_diff_out',
       'hobby_diff_in', 'same_goal', 'attr_diff', 'sinc_diff', 'intel_diff',
       'fun_diff', 'amb_diff', 'income_diff', 'age_diff', 'same_career',
       'confidence', 'imprace', 'date_freq', 'out_freq']

# selecting features using IUFS function
select_feats = IUFS('match', cat_df, 2, measure='gini', gain='GR')
print("Selected features list: ", select_feats)

# categorical descriptive features list
cat_descr_fts = ['samerace', 'same_goal', 'same_career']

# features to be dropped; a comprehension keeps the list order deterministic,
# whereas list(set - set) depends on string hashing and can change between sessions
drop_feats = [ft for ft in cat_descr_fts if ft not in select_feats]
print("Features to be dropped: ", drop_feats)

# new df created after dropping features (drop returns a new frame, df is untouched)
ufs_df = df.drop(columns=drop_feats)

# remaining descriptive features, in the same order as desc_features
ufs_df_desc_fts = [ft for ft in desc_features if ft not in drop_feats]

ufs_df

Selected features list:  ['same_career', 'samerace']
Features to be dropped:  ['same_goal']


Unnamed: 0,match,exphappy,samerace,hobby_diff_phys,hobby_diff_out,hobby_diff_in,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,income_diff,age_diff,same_career,confidence,imprace,date_freq,out_freq
0,0,0.222222,0,0.291667,0.600000,0.800000,0.571036,0.597222,0.506579,0.504952,0.652184,0.634998,0.545455,0,0.006250,0.421053,1.000000,0.000000
1,0,0.222222,0,0.291667,0.533333,0.472727,0.958946,0.486111,0.348684,1.000000,0.498729,1.000000,0.090909,0,0.027500,0.105263,1.000000,0.000000
2,1,0.222222,1,0.375000,0.500000,0.581818,0.377081,0.597222,0.480263,0.414916,0.728912,0.411073,0.090909,0,0.007500,0.210526,1.000000,0.000000
3,1,0.222222,0,0.250000,0.366667,0.509091,0.474059,0.347222,0.348684,1.000000,0.306910,0.237391,0.181818,0,0.021250,0.105263,1.000000,0.000000
4,1,0.222222,0,0.083333,0.833333,0.654545,0.506384,0.375000,0.480263,0.204832,0.345274,0.801737,0.272727,0,0.013750,0.210526,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8341,0,0.222222,0,0.875000,0.600000,0.400000,0.635688,0.388889,0.769737,0.760054,0.575457,0.351007,0.090909,0,0.019341,0.315789,0.833333,0.333333
8342,0,0.222222,0,0.666667,0.133333,0.581818,1.000000,0.597222,0.190789,0.324880,0.537093,0.518025,0.272727,0,0.019341,0.315789,0.833333,0.333333
8343,0,0.222222,0,0.458333,0.766667,0.836364,1.000000,0.402778,0.756579,0.489946,0.537093,0.989016,0.181818,0,0.019341,0.473684,0.833333,0.333333
8344,0,0.222222,0,0.458333,0.433333,0.727273,0.587199,0.694444,0.651316,0.459934,0.460365,0.299082,0.454545,0,0.019341,0.263158,0.833333,0.333333


In [187]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# decision tree 
def model_DecisionTree(desc_features, target, dataframe, max_depth=2, random_state=0):
    """Train and score a decision tree on a 75/25 train/test split.

    Parameters
    ----------
    desc_features : list-like of labels
        Columns of ``dataframe`` used as predictors.
    target : label
        Column of ``dataframe`` to predict.
    dataframe : pandas.DataFrame
        Data holding both the predictors and the target.
    max_depth : int, default 2
        Maximum depth of the tree.
    random_state : int, default 0
        Seed used for the train/test split and for the tree, so repeated runs
        give the same result.

    Returns
    -------
    pandas.DataFrame
        Confusion matrix of the test-set predictions, as returned by
        ``confusion_matrix`` (no row/column labels are attached).

    Side effects
    ------------
    Prints the test accuracy rounded to three decimals.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        dataframe[desc_features], dataframe[target],
        test_size=0.25, random_state=random_state)

    # Seed the tree as well as the split so tie-breaking is reproducible.
    dt_clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    dt_clf.fit(X_train, Y_train)

    y_pred = dt_clf.predict(X_test)

    # The built-in round() works whether accuracy_score returns a NumPy scalar
    # or a plain Python float (plain floats have no .round method).
    accuracy = round(accuracy_score(Y_test, y_pred), 3)
    print("Accuracy score using DecisionTreeClassifier: {}".format(accuracy))

    cm = confusion_matrix(Y_test, y_pred)
    return pd.DataFrame(cm)

In [188]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# KNN classifier 
def model_kNN_clf(desc_features, target, dataframe,k):
    """Train and score a k-nearest-neighbours classifier on a 75/25 split.

    Parameters
    ----------
    desc_features : list-like of labels
        Columns of ``dataframe`` used as predictors.
    target : label
        Column of ``dataframe`` to predict.
    dataframe : pandas.DataFrame
        Data holding both the predictors and the target.
    k : int
        Number of neighbours.

    Returns nothing; prints the test accuracy with five decimals. The split
    uses ``test_size=0.25`` and ``random_state=0``, the same values as
    ``model_DecisionTree``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        dataframe[desc_features], dataframe[target],
        test_size=0.25, random_state=0)

    nn_clf = KNeighborsClassifier(n_neighbors=k)
    nn_clf.fit(X_train, Y_train)

    y_pred = nn_clf.predict(X_test)
    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but this order matches the API and model_DecisionTree.
    print('kNN accuracy score: {:.5f}'.format(accuracy_score(Y_test, y_pred)))

In [189]:
# Baseline: decision tree classifier on the original dataframe, using every
# column listed in desc_features to predict 'match'
model_DecisionTree(desc_features, 'match', df)


Accuracy score using DecisionTreeClassifier: 0.833


Unnamed: 0,0,1
0,1736,0
1,348,3


In [190]:
# Baseline: kNN classifier (k=5) on the original dataframe, using every
# column listed in desc_features to predict 'match'
model_kNN_clf(desc_features, 'match', df, 5)

kNN accuracy score: 0.81984


In [191]:
# Decision tree classifier on the dataframe left after univariate feature
# selection (IUFS), restricted to the descriptive features that were kept
model_DecisionTree(ufs_df_desc_fts, 'match', ufs_df)

Accuracy score using DecisionTreeClassifier: 0.833


Unnamed: 0,0,1
0,1736,0
1,348,3


In [192]:
# kNN classifier (k=5, same as the baseline) on the dataframe left after
# univariate feature selection (IUFS)
model_kNN_clf(ufs_df_desc_fts, 'match', ufs_df,5)

kNN accuracy score: 0.82702


In [193]:
# After univariate FS with IUFS the kNN accuracy rose from 0.81984 to 0.82702, while the
# decision-tree accuracy stayed at 0.833, so the reduced feature set is kept from here on.
# NOTE(review): this rebinds `df` to the same object as `ufs_df` (no copy is made). Earlier
# cells that index `df` with `desc_features` (which still lists the dropped feature) may
# fail if they are re-run after this cell.

df = ufs_df
df

Unnamed: 0,match,exphappy,samerace,hobby_diff_phys,hobby_diff_out,hobby_diff_in,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,income_diff,age_diff,same_career,confidence,imprace,date_freq,out_freq
0,0,0.222222,0,0.291667,0.600000,0.800000,0.571036,0.597222,0.506579,0.504952,0.652184,0.634998,0.545455,0,0.006250,0.421053,1.000000,0.000000
1,0,0.222222,0,0.291667,0.533333,0.472727,0.958946,0.486111,0.348684,1.000000,0.498729,1.000000,0.090909,0,0.027500,0.105263,1.000000,0.000000
2,1,0.222222,1,0.375000,0.500000,0.581818,0.377081,0.597222,0.480263,0.414916,0.728912,0.411073,0.090909,0,0.007500,0.210526,1.000000,0.000000
3,1,0.222222,0,0.250000,0.366667,0.509091,0.474059,0.347222,0.348684,1.000000,0.306910,0.237391,0.181818,0,0.021250,0.105263,1.000000,0.000000
4,1,0.222222,0,0.083333,0.833333,0.654545,0.506384,0.375000,0.480263,0.204832,0.345274,0.801737,0.272727,0,0.013750,0.210526,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8341,0,0.222222,0,0.875000,0.600000,0.400000,0.635688,0.388889,0.769737,0.760054,0.575457,0.351007,0.090909,0,0.019341,0.315789,0.833333,0.333333
8342,0,0.222222,0,0.666667,0.133333,0.581818,1.000000,0.597222,0.190789,0.324880,0.537093,0.518025,0.272727,0,0.019341,0.315789,0.833333,0.333333
8343,0,0.222222,0,0.458333,0.766667,0.836364,1.000000,0.402778,0.756579,0.489946,0.537093,0.989016,0.181818,0,0.019341,0.473684,0.833333,0.333333
8344,0,0.222222,0,0.458333,0.433333,0.727273,0.587199,0.694444,0.651316,0.459934,0.460365,0.299082,0.454545,0,0.019341,0.263158,0.833333,0.333333


In [194]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Descriptive features only: drop the target column by name rather than by
# position, so this keeps working if the column order changes.
# drop() returns a new frame, so ufs_df itself is left untouched.
df_cp = ufs_df.drop(columns='match')
df_cp

Unnamed: 0,exphappy,samerace,hobby_diff_phys,hobby_diff_out,hobby_diff_in,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,income_diff,age_diff,same_career,confidence,imprace,date_freq,out_freq
0,0.222222,0,0.291667,0.600000,0.800000,0.571036,0.597222,0.506579,0.504952,0.652184,0.634998,0.545455,0,0.006250,0.421053,1.000000,0.000000
1,0.222222,0,0.291667,0.533333,0.472727,0.958946,0.486111,0.348684,1.000000,0.498729,1.000000,0.090909,0,0.027500,0.105263,1.000000,0.000000
2,0.222222,1,0.375000,0.500000,0.581818,0.377081,0.597222,0.480263,0.414916,0.728912,0.411073,0.090909,0,0.007500,0.210526,1.000000,0.000000
3,0.222222,0,0.250000,0.366667,0.509091,0.474059,0.347222,0.348684,1.000000,0.306910,0.237391,0.181818,0,0.021250,0.105263,1.000000,0.000000
4,0.222222,0,0.083333,0.833333,0.654545,0.506384,0.375000,0.480263,0.204832,0.345274,0.801737,0.272727,0,0.013750,0.210526,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8341,0.222222,0,0.875000,0.600000,0.400000,0.635688,0.388889,0.769737,0.760054,0.575457,0.351007,0.090909,0,0.019341,0.315789,0.833333,0.333333
8342,0.222222,0,0.666667,0.133333,0.581818,1.000000,0.597222,0.190789,0.324880,0.537093,0.518025,0.272727,0,0.019341,0.315789,0.833333,0.333333
8343,0.222222,0,0.458333,0.766667,0.836364,1.000000,0.402778,0.756579,0.489946,0.537093,0.989016,0.181818,0,0.019341,0.473684,0.833333,0.333333
8344,0.222222,0,0.458333,0.433333,0.727273,0.587199,0.694444,0.651316,0.459934,0.460365,0.299082,0.454545,0,0.019341,0.263158,0.833333,0.333333


In [201]:
# feature selection using RFE and evaluating results
def FS_RFE(desc_cols, target_col, no_of_fts, knn_k=2):
    """Select features with recursive feature elimination and score the result.

    A ``LogisticRegression`` estimator drives ``RFE`` until ``no_of_fts``
    features remain. The selected features are then evaluated with
    ``model_DecisionTree`` and ``model_kNN_clf``.

    Parameters
    ----------
    desc_cols : pandas.DataFrame
        Descriptive (predictor) columns.
    target_col : pandas.Series or array-like
        Target values, one per row of ``desc_cols``.
    no_of_fts : int
        Number of features RFE should keep.
    knn_k : int, default 2
        Number of neighbours used when scoring with kNN.

    Returns
    -------
    numpy.ndarray
        Names of the selected features, as returned by
        ``RFE.get_feature_names_out``.

    Side effects
    ------------
    Prints the selection details and the accuracy of both classifiers.
    """
    # splitting the df into desc and target
    X = desc_cols
    print("No of desc features: ", len(X.columns))
    Y = target_col

    # using logistic regression
    model = LogisticRegression()

    # performing recursive feature elimination to select features
    rfe = RFE(model, n_features_to_select=no_of_fts)

    fit = rfe.fit(X, Y)

    print("Num of features being selected: %d"% fit.n_features_)
    # selected features list
    slct_fts = fit.get_feature_names_out(None)
    print("Feature names: %s"% slct_fts)
    print("Selected Features as bool: %s"% fit.support_)
    print("Feature ranking: %s"% fit.ranking_)

    # Evaluate on the data that was passed in, not on a notebook-global frame,
    # so the function gives the same answer regardless of later changes to `df`.
    target_name = getattr(Y, 'name', None)
    if target_name is None:
        # unnamed target (e.g. a plain array): fall back to the notebook's target name
        target_name = 'match'
    eval_df = X.copy()
    eval_df[target_name] = Y

    # accuracy and confusion matrix using decision tree classifier
    model_DecisionTree(slct_fts, target_name, eval_df)

    # accuracy using KNN classifier
    model_kNN_clf(slct_fts, target_name, eval_df, knn_k)

    return slct_fts


In [202]:
# Sweep the number of features RFE keeps from 10 up to 17 (range's upper bound is
# exclusive); each call prints the selection and the classifier scores for that size.
for i in range(10,18):
    FS_RFE(df_cp, df['match'], i)


No of desc features:  17
Num of features being selected: 10
Feature names: ['exphappy' 'hobby_diff_in' 'sinc_diff' 'intel_diff' 'fun_diff' 'age_diff'
 'confidence' 'imprace' 'date_freq' 'out_freq']
Selected Features as bool: [ True False False False  True False  True  True  True False False  True
 False  True  True  True  True]
Feature ranking: [1 5 2 6 1 4 1 1 1 8 7 1 3 1 1 1 1]
Accuracy score using DecisionTreeClassifier: 0.833
kNN accuracy score: 0.81984
No of desc features:  17
Num of features being selected: 11
Feature names: ['exphappy' 'hobby_diff_phys' 'hobby_diff_in' 'sinc_diff' 'intel_diff'
 'fun_diff' 'age_diff' 'confidence' 'imprace' 'date_freq' 'out_freq']
Selected Features as bool: [ True False  True False  True False  True  True  True False False  True
 False  True  True  True  True]
Feature ranking: [1 4 1 5 1 3 1 1 1 7 6 1 2 1 1 1 1]
Accuracy score using DecisionTreeClassifier: 0.833
kNN accuracy score: 0.81936
No of desc features:  17
Num of features being selected: 1

In [203]:
# The sweep above was judged to give the highest accuracy with 15 features,
# so we select 15 features, discard the rest, and continue with model selection and evaluation.
# NOTE(review): FS_RFE scores kNN with k=2, whereas the earlier kNN runs used k=5, so the
# kNN accuracies printed here are not directly comparable with those - confirm before
# attributing the gain to feature selection.
slct_fts = FS_RFE(df_cp, df['match'], 15)

No of desc features:  17
Num of features being selected: 15
Feature names: ['exphappy' 'samerace' 'hobby_diff_phys' 'hobby_diff_out' 'hobby_diff_in'
 'attr_diff' 'sinc_diff' 'intel_diff' 'fun_diff' 'age_diff' 'same_career'
 'confidence' 'imprace' 'date_freq' 'out_freq']
Selected Features as bool: [ True  True  True  True  True  True  True  True  True False False  True
  True  True  True  True  True]
Feature ranking: [1 1 1 1 1 1 1 1 1 3 2 1 1 1 1 1 1]
Accuracy score using DecisionTreeClassifier: 0.833
kNN accuracy score: 0.83565


In [204]:
# Keep the target plus the RFE-selected features in one selection; the boolean
# mask preserves the existing column order and returns a new frame.
df = df.loc[:, (df.columns == 'match') | df.columns.isin(slct_fts)]

df.columns

Index(['match', 'exphappy', 'samerace', 'hobby_diff_phys', 'hobby_diff_out',
       'hobby_diff_in', 'attr_diff', 'sinc_diff', 'intel_diff', 'fun_diff',
       'age_diff', 'same_career', 'confidence', 'imprace', 'date_freq',
       'out_freq'],
      dtype='object')

In [205]:
# Print the summary of the reduced frame (columns kept, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8346 entries, 0 to 8345
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   match            8346 non-null   int64  
 1   exphappy         8346 non-null   float64
 2   samerace         8346 non-null   int64  
 3   hobby_diff_phys  8346 non-null   float64
 4   hobby_diff_out   8346 non-null   float64
 5   hobby_diff_in    8346 non-null   float64
 6   attr_diff        8346 non-null   float64
 7   sinc_diff        8346 non-null   float64
 8   intel_diff       8346 non-null   float64
 9   fun_diff         8346 non-null   float64
 10  age_diff         8346 non-null   float64
 11  same_career      8346 non-null   int64  
 12  confidence       8346 non-null   float64
 13  imprace          8346 non-null   float64
 14  date_freq        8346 non-null   float64
 15  out_freq         8346 non-null   float64
dtypes: float64(13), int64(3)
memory usage: 1.1 MB
