# Libraries

In [1]:
import os
import pandas as pd
import swifter
import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
print(os.getcwd())

/Users/bhaskarn/Downloads/VA_Data/src


# Custom functions 

In [3]:
# get name of dataframe (which is required)
def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name

# get shape of dataframe
def print_shape(df):
    df_name = get_df_name(df)
    print(f'shape of the df({df_name}):', df.shape)
    
# Print classification report
from sklearn.metrics import classification_report
def classification_report_train_test(y_train, train_pred, y_valid, valid_pred):

    print('''
            =========================================
               CLASSIFICATION REPORT FOR TRAIN DATA
            =========================================
            ''')
    print(classification_report(y_train, train_pred))

    print('''
            =========================================
            CLASSIFICATION REPORT FOR VALIDATION DATA
            =========================================
            ''')
    print(classification_report(y_valid, valid_pred))


# Data sourcing

#### Read all the csv files

In [4]:
data_location = '~/Downloads/VA_Data/Datasets/'
train_raw = pd.read_csv(data_location + 'Train.csv')
camp_raw = pd.read_csv(data_location + 'Health_Camp_Detail.csv')
patient_raw = pd.read_csv(data_location + 'Patient_Profile.csv')
first_camp = pd.read_csv(data_location + 'First_Health_Camp_Attended.csv')
second_camp = pd.read_csv(data_location + 'Second_Health_Camp_Attended.csv')
third_camp = pd.read_csv(data_location + 'Third_Health_Camp_Attended.csv')
test_raw = pd.read_csv(data_location + 'test_l0Auv8Q.csv')

In [5]:
print_shape(train_raw)
print_shape(camp_raw)
print_shape(patient_raw)
print_shape(first_camp)
print_shape(second_camp)
print_shape(third_camp)
print_shape(test_raw)

shape of the df(train_raw): (75278, 8)
shape of the df(camp_raw): (65, 6)
shape of the df(patient_raw): (37633, 11)
shape of the df(first_camp): (6218, 5)
shape of the df(second_camp): (7819, 3)
shape of the df(third_camp): (6515, 4)
shape of the df(test_raw): (35249, 8)


# Understand the data

#### Understand Train.csv

In [6]:
train_raw.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,489652,6578,10-Sep-05,4,0,0,0,2
1,507246,6578,18-Aug-05,45,5,0,0,7
2,523729,6534,29-Apr-06,0,0,0,0,0
3,524931,6535,07-Feb-04,0,0,0,0,0
4,521364,6529,28-Feb-06,15,1,0,0,7


In [7]:
# check for missing values
train_raw.isna().sum()

Patient_ID             0
Health_Camp_ID         0
Registration_Date    334
Var1                   0
Var2                   0
Var3                   0
Var4                   0
Var5                   0
dtype: int64

In [8]:
# we have missing vales in Registration_Date
# let's check if the patient_ids are unique
train_raw.Patient_ID.nunique()

29828

In [9]:
# no of unique patient_ids are different form no. of rows in train_raw
# let's investigate little deeper
train_raw.Patient_ID.value_counts()

516956    32
490196    28
513633    28
509188    28
517006    25
          ..
507996     1
512094     1
514143     1
524388     1
524288     1
Name: Patient_ID, Length: 29828, dtype: int64

In [10]:
# we have duplicate patient_ids, now it's clear why no of unique patient_ids are less
# let's try to understand why we have duplicate patient_ids
patient_516956_data = train_raw.loc[train_raw['Patient_ID'] == 516956]
patient_516956_data

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
34,516956,6541,19-Dec-05,7,4,0,0,5
389,516956,6587,03-Aug-05,6,4,0,0,5
5002,516956,6540,31-Oct-04,0,0,0,0,0
7279,516956,6539,06-Aug-04,0,0,0,0,0
8557,516956,6535,03-Dec-03,0,0,0,0,0
9211,516956,6554,08-May-05,6,4,0,0,5
9260,516956,6581,02-Jan-04,0,0,0,0,0
12151,516956,6543,11-Nov-06,7,4,0,0,5
16488,516956,6578,20-Aug-05,6,4,0,0,5
20738,516956,6580,19-Oct-04,4,1,0,0,4


In [11]:
# it is possible few patients attend camp regularly
# let's see if all the instaces are unique (check by date)
print(patient_516956_data.Registration_Date.nunique()==patient_516956_data.shape[0])

False


In [12]:
# looks like we have duplicate registration dates as well
# lets check the shape of this particular patients data
patient_516956_data.shape

(32, 8)

In [13]:
# now lets check how many unique visits he has paid
patient_516956_data.Registration_Date.nunique()

30

In [14]:
# alright, registration dates are not matching shape of the patient data
# lets check for this patient's duplicate registration dates
pd.concat(g for _, g in patient_516956_data.groupby('Registration_Date') if len(g) > 1)

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
33173,516956,6532,24-Feb-05,4,1,0,0,4
42618,516956,6523,24-Feb-05,4,1,0,0,4
5002,516956,6540,31-Oct-04,0,0,0,0,0
40731,516956,6526,31-Oct-04,4,1,0,0,4


In [15]:
# therefore, there could be multiple registrations by same individual
# let's check the attendance in each camp

camp_dfs = [first_camp, second_camp, third_camp]
for df in camp_dfs:
    try:
        pd.concat(i for _, i in df.groupby(['Patient_ID', 'Health_Camp_ID']) if len(i) > 1)
    except ValueError as v:
        print(str(v), f'in {get_df_name(df)}')
    
# it is evident that there are no patients that have gone to camp twice
# which is what we wanted

No objects to concatenate in first_camp
No objects to concatenate in second_camp
No objects to concatenate in third_camp


# Data preparation

In [16]:
train_raw.describe()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5
count,75278.0,75278.0,75278.0,75278.0,75278.0,75278.0,75278.0
mean,507233.32566,6547.597319,0.796156,0.238556,0.000385,0.022424,0.248386
std,12404.734886,19.272983,7.626687,3.769594,0.019624,0.19957,1.213938
min,485679.0,6523.0,0.0,0.0,0.0,0.0,0.0
25%,496505.25,6534.0,0.0,0.0,0.0,0.0,0.0
50%,507276.5,6541.0,0.0,0.0,0.0,0.0,0.0
75%,517919.0,6562.0,0.0,0.0,0.0,0.0,0.0
max,528657.0,6587.0,288.0,156.0,1.0,4.0,31.0


In [17]:
camp_raw.describe(include='object')

Unnamed: 0,Camp_Start_Date,Camp_End_Date,Category1,Category2
count,65,65,65,65
unique,58,54,3,7
top,16-Aug-03,07-Nov-07,First,C
freq,2,5,44,16


In [18]:
# lets check what those unique values are
print(camp_raw.Category1.unique().tolist())
print(camp_raw.Category2.unique().tolist())

['First', 'Second', 'Third']
['B', 'C', 'F', 'E', 'D', 'G', 'A']


In [19]:
# merge train and health camp data on Health_Camp_ID
merged_train = train_raw.merge(camp_raw, on='Health_Camp_ID').drop(['Camp_Start_Date','Camp_End_Date','Registration_Date'], axis=1)
merged_train.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3
0,489652,6578,4,0,0,0,2,Third,G,2
1,507246,6578,45,5,0,0,7,Third,G,2
2,491371,6578,0,0,0,0,0,Third,G,2
3,485995,6578,0,0,0,0,0,Third,G,2
4,511519,6578,0,0,0,0,0,Third,G,2


In [20]:
# do the same for test
merged_test = test_raw.merge(camp_raw, on='Health_Camp_ID').drop(['Camp_Start_Date','Camp_End_Date','Registration_Date'], axis=1)
merged_test.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3
0,505701,6548,1,0,0,0,2,Third,G,2
1,494067,6548,0,0,0,0,0,Third,G,2
2,499854,6548,0,0,0,0,0,Third,G,2
3,509140,6548,0,0,0,0,0,Third,G,2
4,486481,6548,0,0,0,0,0,Third,G,2


In [21]:
print(first_camp.head())
print('\n')
print(second_camp.head())
third_camp.head()

   Patient_ID  Health_Camp_ID  Donation  Health_Score  Unnamed: 4
0      506181            6560        40      0.439024         NaN
1      494977            6560        20      0.097561         NaN
2      518680            6560        10      0.048780         NaN
3      509916            6560        30      0.634146         NaN
4      488006            6560        20      0.024390         NaN


   Patient_ID  Health_Camp_ID  Health Score
0      526631            6536      0.875136
1      509122            6536      0.755700
2      498864            6536      0.673181
3      515398            6536      0.722041
4      504624            6536      0.464712


Unnamed: 0,Patient_ID,Health_Camp_ID,Number_of_stall_visited,Last_Stall_Visited_Number
0,517875,6527,3,1
1,504692,6578,1,1
2,504692,6527,3,1
3,493167,6527,4,4
4,510954,6528,2,2


In [22]:
# merge merged_train with 3 camps info dfs on 'Patient_ID','Health_Camp_ID'
merged_train_1 = merged_train[merged_train.Category1=='First'].merge(first_camp.drop('Unnamed: 4', axis=1), 'left', on=['Patient_ID', 'Health_Camp_ID'])
merged_train_2 = merged_train[merged_train.Category1=='Second'].merge(second_camp, 'left', on=['Patient_ID', 'Health_Camp_ID'])
merged_train_3 = merged_train[merged_train.Category1=='Third'].merge(third_camp, 'left', on=['Patient_ID', 'Health_Camp_ID'])
print_shape(merged_train_1)
print_shape(merged_train_2)
print_shape(merged_train_3)

shape of the df(merged_train_1): (49892, 12)
shape of the df(merged_train_2): (15114, 11)
shape of the df(merged_train_3): (10272, 12)


In [23]:
merged_train_1.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,Donation,Health_Score
0,524931,6535,0,0,0,0,0,First,E,2,,
1,500733,6535,0,0,0,0,0,First,E,2,,
2,499882,6535,0,0,0,0,0,First,E,2,,
3,504568,6535,3,1,0,0,3,First,E,2,10.0,0.634921
4,518865,6535,0,0,0,0,0,First,E,2,,


# Feature extraction

In [24]:
merged_train_1['outcome_favourable'] = 1
merged_train_1.loc[merged_train_1.Health_Score.isna(),'outcome_favourable']=0
merged_train_1.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,Donation,Health_Score,outcome_favourable
0,524931,6535,0,0,0,0,0,First,E,2,,,0
1,500733,6535,0,0,0,0,0,First,E,2,,,0
2,499882,6535,0,0,0,0,0,First,E,2,,,0
3,504568,6535,3,1,0,0,3,First,E,2,10.0,0.634921,1
4,518865,6535,0,0,0,0,0,First,E,2,,,0


In [25]:
merged_train_2['outcome_favourable'] = 1
merged_train_2.loc[merged_train_2['Health Score'].isna(),'outcome_favourable']=0
merged_train_2.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,Health Score,outcome_favourable
0,523729,6534,0,0,0,0,0,Second,A,2,0.402054,1
1,506153,6534,0,0,0,0,0,Second,A,2,0.402054,1
2,508986,6534,0,0,0,0,0,Second,A,2,0.53278,1
3,504679,6534,0,0,0,0,0,Second,A,2,0.819905,1
4,486432,6534,0,0,0,0,0,Second,A,2,,0


In [26]:
merged_train_3['outcome_favourable'] = 1
merged_train_3.loc[merged_train_3['Number_of_stall_visited'] == 0, 'outcome_favourable'] = 0
merged_train_3.loc[merged_train_3['Number_of_stall_visited'].isna(),'outcome_favourable'] = 0
merged_train_3.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,Number_of_stall_visited,Last_Stall_Visited_Number,outcome_favourable
0,489652,6578,4,0,0,0,2,Third,G,2,2.0,1.0,1
1,507246,6578,45,5,0,0,7,Third,G,2,,,0
2,491371,6578,0,0,0,0,0,Third,G,2,,,0
3,485995,6578,0,0,0,0,0,Third,G,2,3.0,3.0,1
4,511519,6578,0,0,0,0,0,Third,G,2,1.0,1.0,1


In [27]:
train_dfs = [merged_train_1, merged_train_2, merged_train_3]
common_cols = list(set.intersection(*(set(df.columns) for df in train_dfs)))
merged_with_outcome = pd.concat([df[common_cols] for df in train_dfs], ignore_index=True)
ordered_cols = list(merged_train.columns)
ordered_cols.append('outcome_favourable')
merged_with_outcome = merged_with_outcome[ordered_cols]
merged_with_outcome.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,outcome_favourable
0,524931,6535,0,0,0,0,0,First,E,2,0
1,500733,6535,0,0,0,0,0,First,E,2,0
2,499882,6535,0,0,0,0,0,First,E,2,0
3,504568,6535,3,1,0,0,3,First,E,2,1
4,518865,6535,0,0,0,0,0,First,E,2,0


In [28]:
test_raw.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,505701,6548,21-May-06,1,0,0,0,2
1,500633,6584,02-Jun-06,0,0,0,0,0
2,506945,6582,10-Aug-06,0,0,0,0,0
3,497447,6551,27-Aug-06,0,0,0,0,0
4,496446,6533,19-Sep-06,0,0,0,0,0


In [29]:
patient_raw.head()

Unnamed: 0,Patient_ID,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category
0,516956,0,0,0,0,1,90.0,39,18-Jun-03,,Software Industry
1,507733,0,0,0,0,1,,40,20-Jul-03,H,Software Industry
2,508307,0,0,0,0,3,87.0,46,02-Nov-02,D,BFSI
3,512612,0,0,0,0,1,75.0,47,02-Nov-02,D,Education
4,521075,0,0,0,0,3,,80,24-Nov-02,H,Others


# Final datasets

In [30]:
train_data = merged_with_outcome.merge(patient_raw, on='Patient_ID')
print_shape(train_data)
train_data.head()

shape of the df(train_data): (75278, 21)


Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,...,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category
0,524931,6535,0,0,0,0,0,First,E,2,...,0,0,0,0,,,,07-Feb-04,I,
1,524931,6534,0,0,0,0,0,Second,A,2,...,0,0,0,0,,,,07-Feb-04,I,
2,524931,6529,0,0,0,0,0,Second,A,2,...,0,0,0,0,,,,07-Feb-04,I,
3,524931,6523,0,0,0,0,0,Second,D,2,...,0,0,0,0,,,,07-Feb-04,I,
4,524931,6528,0,0,0,0,0,Third,G,2,...,0,0,0,0,,,,07-Feb-04,I,


In [31]:
test_data = merged_test.merge(patient_raw, on='Patient_ID')
print_shape(test_data)
test_data.head()

shape of the df(test_data): (35249, 20)


Unnamed: 0,Patient_ID,Health_Camp_ID,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category
0,505701,6548,1,0,0,0,2,Third,G,2,0,0,0,0,0,,44,05-Feb-03,E,
1,505701,6582,1,0,0,0,2,First,F,2,0,0,0,0,0,,44,05-Feb-03,E,
2,505701,6567,1,0,0,0,2,Second,D,2,0,0,0,0,0,,44,05-Feb-03,E,
3,505701,6566,1,0,0,0,2,First,E,2,0,0,0,0,0,,44,05-Feb-03,E,
4,505701,6556,1,0,0,0,2,First,E,2,0,0,0,0,0,,44,05-Feb-03,E,


# Data pre-processing

In [32]:
# convert target to category
train_data['outcome_favourable'] = train_data['outcome_favourable'].astype('category')
train_data.dtypes

Patient_ID               int64
Health_Camp_ID           int64
Var1                     int64
Var2                     int64
Var3                     int64
Var4                     int64
Var5                     int64
Category1               object
Category2               object
Category3                int64
outcome_favourable    category
Online_Follower          int64
LinkedIn_Shared          int64
Twitter_Shared           int64
Facebook_Shared          int64
Income                  object
Education_Score         object
Age                     object
First_Interaction       object
City_Type               object
Employer_Category       object
dtype: object

In [33]:
train_data.isna().sum()

Patient_ID                0
Health_Camp_ID            0
Var1                      0
Var2                      0
Var3                      0
Var4                      0
Var5                      0
Category1                 0
Category2                 0
Category3                 0
outcome_favourable        0
Online_Follower           0
LinkedIn_Shared           0
Twitter_Shared            0
Facebook_Shared           0
Income                    0
Education_Score           0
Age                       0
First_Interaction         0
City_Type             33208
Employer_Category     60075
dtype: int64

In [34]:
# drop columns City_Type, Employer_Category since we na size is huge
# drop Patient_ID, Health_Camp_ID since they are nominal
# drop First_Interaction dtype datetime and is not important
train_data.drop(['City_Type', 'Employer_Category', 'Patient_ID', 'Health_Camp_ID', 'First_Interaction'], axis=True, inplace=True)
train_data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,outcome_favourable,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age
0,0,0,0,0,0,First,E,2,0,0,0,0,0,,,
1,0,0,0,0,0,Second,A,2,1,0,0,0,0,,,
2,0,0,0,0,0,Second,A,2,0,0,0,0,0,,,
3,0,0,0,0,0,Second,D,2,1,0,0,0,0,,,
4,0,0,0,0,0,Third,G,2,1,0,0,0,0,,,


In [35]:
# repeat the same for test data
test_data.drop(['City_Type', 'Employer_Category', 'Patient_ID', 'Health_Camp_ID', 'First_Interaction'], axis=True, inplace=True)
test_data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Category1,Category2,Category3,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age
0,1,0,0,0,2,Third,G,2,0,0,0,0,0,,44
1,1,0,0,0,2,First,F,2,0,0,0,0,0,,44
2,1,0,0,0,2,Second,D,2,0,0,0,0,0,,44
3,1,0,0,0,2,First,E,2,0,0,0,0,0,,44
4,1,0,0,0,2,First,E,2,0,0,0,0,0,,44


In [36]:
train_data.dtypes

Var1                     int64
Var2                     int64
Var3                     int64
Var4                     int64
Var5                     int64
Category1               object
Category2               object
Category3                int64
outcome_favourable    category
Online_Follower          int64
LinkedIn_Shared          int64
Twitter_Shared           int64
Facebook_Shared          int64
Income                  object
Education_Score         object
Age                     object
dtype: object

In [37]:
# type casting for train and test datasets
cols_to_cat = ['Category1', 'Category2', 'Category3', 'Online_Follower', 'LinkedIn_Shared', 'Twitter_Shared',
              'Facebook_Shared', 'Income', 'Age']
for col in cols_to_cat:
    train_data[col] = train_data[col].astype('category')
    test_data[col] = test_data[col].astype('category')

In [38]:
train_data.dtypes

Var1                     int64
Var2                     int64
Var3                     int64
Var4                     int64
Var5                     int64
Category1             category
Category2             category
Category3             category
outcome_favourable    category
Online_Follower       category
LinkedIn_Shared       category
Twitter_Shared        category
Facebook_Shared       category
Income                category
Education_Score         object
Age                   category
dtype: object

In [39]:
# check no of uniques in Education_Score
train_data.Education_Score.nunique()

228

In [40]:
# replace None with 0 since we assume none = no education
train_data.loc[train_data['Education_Score'] == 'None', 'Education_Score'] = 0
train_data.Education_Score.value_counts()

0              65345
82               468
79               423
75               413
86               390
               ...  
77.625             1
80.73333333        1
78.05              1
66.56666667        1
92.07666667        1
Name: Education_Score, Length: 228, dtype: int64

In [41]:
# we can now convert to Education_Score to float
train_data['Education_Score']=train_data.swifter.allow_dask_on_strings().apply(lambda x: round(float(x['Education_Score']), 2),axis=1)
train_data['Education_Score'].dtypes

Dask Apply: 100%|██████████| 24/24 [00:02<00:00, 11.03it/s]


dtype('float64')

In [42]:
# check no of uniques in Age
train_data.Age.value_counts().nlargest(5)

None    51612
41       1843
40       1760
42       1676
43       1628
Name: Age, dtype: int64

In [43]:
# we have 51k odd None vales. 
# for now, lets drop the age column and comeback to it later
# train_data.drop('Age', axis=1, inplace=True)
# train_data.dtypes

In [44]:
# replace None with 0 since we assume none = no education
test_data.loc[test_data['Education_Score'] == 'None', 'Education_Score'] = 0
test_data.Education_Score.value_counts()

0              32822
82               120
75               105
80               103
79               100
               ...  
69.5225            1
71.93333333        1
67.93333333        1
71.52333333        1
84.4               1
Name: Education_Score, Length: 92, dtype: int64

In [45]:
# we can now convert to Education_Score to float
test_data['Education_Score']=test_data.swifter.allow_dask_on_strings().apply(lambda x: round(float(x['Education_Score']), 2),axis=1)
test_data['Education_Score'].dtypes

Dask Apply: 100%|██████████| 24/24 [00:01<00:00, 14.86it/s]


dtype('float64')

In [46]:
test_data.Age.value_counts().nlargest(5)

None    26313
41        725
40        627
39        565
42        512
Name: Age, dtype: int64

In [47]:
# test_data.drop('Age', axis=1, inplace=True)
# test_data.dtypes

In [48]:
train_data.dtypes

Var1                     int64
Var2                     int64
Var3                     int64
Var4                     int64
Var5                     int64
Category1             category
Category2             category
Category3             category
outcome_favourable    category
Online_Follower       category
LinkedIn_Shared       category
Twitter_Shared        category
Facebook_Shared       category
Income                category
Education_Score        float64
Age                   category
dtype: object

In [49]:
test_data.dtypes

Var1                  int64
Var2                  int64
Var3                  int64
Var4                  int64
Var5                  int64
Category1          category
Category2          category
Category3          category
Online_Follower    category
LinkedIn_Shared    category
Twitter_Shared     category
Facebook_Shared    category
Income             category
Education_Score     float64
Age                category
dtype: object

In [50]:
X = train_data[train_data.columns.difference(['outcome_favourable'])]
X.head()

Unnamed: 0,Age,Category1,Category2,Category3,Education_Score,Facebook_Shared,Income,LinkedIn_Shared,Online_Follower,Twitter_Shared,Var1,Var2,Var3,Var4,Var5
0,,First,E,2,0.0,0,,0,0,0,0,0,0,0,0
1,,Second,A,2,0.0,0,,0,0,0,0,0,0,0,0
2,,Second,A,2,0.0,0,,0,0,0,0,0,0,0,0
3,,Second,D,2,0.0,0,,0,0,0,0,0,0,0,0
4,,Third,G,2,0.0,0,,0,0,0,0,0,0,0,0


In [51]:
y = train_data['outcome_favourable']
y.head()

0    0
1    1
2    0
3    1
4    1
Name: outcome_favourable, dtype: category
Categories (2, int64): [0, 1]

In [52]:
cat_attr = list(X.select_dtypes('category').columns)
num_attr = list(X.columns.difference(cat_attr))

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [54]:
print('shape of X_train:', X_train.shape)
print('shape of X_valid:', X_valid.shape)
print('shape of y_train:', y_train.shape)
print('shape of y_valid:', y_valid.shape)

shape of X_train: (50436, 15)
shape of X_valid: (24842, 15)
shape of y_train: (50436,)
shape of y_valid: (24842,)


In [55]:
y.value_counts(normalize=True)

0    0.727224
1    0.272776
Name: outcome_favourable, dtype: float64

In [82]:
y.value_counts()

0    54744
1    20534
Name: outcome_favourable, dtype: int64

In [56]:
y_train.value_counts(normalize=True)

0    0.72712
1    0.27288
Name: outcome_favourable, dtype: float64

In [57]:
y_valid.value_counts(normalize=True)

0    0.727437
1    0.272563
Name: outcome_favourable, dtype: float64

In [58]:
x_train_num = X_train[num_attr].reset_index()
x_train_num.drop('index', axis=1, inplace=True)
x_train_num.head()

Unnamed: 0,Education_Score,Var1,Var2,Var3,Var4,Var5
0,0.0,0,0,0,0,0
1,0.0,0,0,0,0,0
2,0.0,0,0,0,0,0
3,0.0,0,0,0,0,0
4,0.0,2,0,0,0,0


In [59]:
x_valid_num = X_valid[num_attr].reset_index()
x_valid_num.drop('index', axis=1, inplace=True)
x_valid_num.head()

Unnamed: 0,Education_Score,Var1,Var2,Var3,Var4,Var5
0,0.0,5,1,0,0,3
1,0.0,0,0,0,0,0
2,0.0,0,0,0,0,0
3,0.0,0,0,0,0,0
4,0.0,0,0,0,0,0


In [60]:
x_test_num = test_data[num_attr].reset_index()
x_test_num.drop('index', axis=1, inplace=True)
x_test_num.head()

Unnamed: 0,Education_Score,Var1,Var2,Var3,Var4,Var5
0,0.0,1,0,0,0,2
1,0.0,1,0,0,0,2
2,0.0,1,0,0,0,2
3,0.0,1,0,0,0,2
4,0.0,1,0,0,0,2


In [61]:
x_full_num = X[num_attr].reset_index()
x_full_num.drop('index', axis=1, inplace=True)
x_full_num.head()

Unnamed: 0,Education_Score,Var1,Var2,Var3,Var4,Var5
0,0.0,0,0,0,0,0
1,0.0,0,0,0,0,0
2,0.0,0,0,0,0,0
3,0.0,0,0,0,0,0
4,0.0,0,0,0,0,0


In [62]:
from sklearn.preprocessing import OneHotEncoder
## LabelEncoder
onehotencoder = OneHotEncoder(handle_unknown='ignore')
## Fit method
# onehotencoder = onehotencoder.fit(X_train[cat_attr])
onehotencoder = onehotencoder.fit(X[cat_attr])
## Get names for new columns
ohe_cat_col_names = onehotencoder.get_feature_names(cat_attr)
## Print the above columns names
ohe_cat_col_names

array(['Age_31', 'Age_32', 'Age_33', 'Age_34', 'Age_35', 'Age_36',
       'Age_37', 'Age_38', 'Age_39', 'Age_40', 'Age_41', 'Age_42',
       'Age_43', 'Age_44', 'Age_45', 'Age_46', 'Age_47', 'Age_48',
       'Age_49', 'Age_50', 'Age_51', 'Age_52', 'Age_53', 'Age_54',
       'Age_55', 'Age_56', 'Age_57', 'Age_58', 'Age_59', 'Age_60',
       'Age_61', 'Age_62', 'Age_63', 'Age_64', 'Age_65', 'Age_66',
       'Age_67', 'Age_68', 'Age_69', 'Age_70', 'Age_71', 'Age_72',
       'Age_73', 'Age_74', 'Age_75', 'Age_76', 'Age_77', 'Age_78',
       'Age_80', 'Age_None', 'Category1_First', 'Category1_Second',
       'Category1_Third', 'Category2_A', 'Category2_B', 'Category2_C',
       'Category2_D', 'Category2_E', 'Category2_F', 'Category2_G',
       'Category3_1', 'Category3_2', 'Facebook_Shared_0',
       'Facebook_Shared_1', 'Income_0', 'Income_1', 'Income_2',
       'Income_3', 'Income_4', 'Income_5', 'Income_6', 'Income_None',
       'LinkedIn_Shared_0', 'LinkedIn_Shared_1', 'Online_Follower_

In [63]:
## Encode x_train category columns 
x_train_cat_ohe = onehotencoder.transform(X_train[cat_attr]).toarray()
x_train_cat_ohe = pd.DataFrame(x_train_cat_ohe, columns=ohe_cat_col_names)
print_shape(x_train_cat_ohe)
x_train_cat_ohe.head()

shape of the df(x_train_cat_ohe): (50436, 78)


Unnamed: 0,Age_31,Age_32,Age_33,Age_34,Age_35,Age_36,Age_37,Age_38,Age_39,Age_40,...,Income_4,Income_5,Income_6,Income_None,LinkedIn_Shared_0,LinkedIn_Shared_1,Online_Follower_0,Online_Follower_1,Twitter_Shared_0,Twitter_Shared_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [64]:
## Encode y_valid category columns 
x_valid_cat_ohe = onehotencoder.transform(X_valid[cat_attr]).toarray()
x_valid_cat_ohe = pd.DataFrame(x_valid_cat_ohe, columns=ohe_cat_col_names)
print_shape(x_valid_cat_ohe)
x_valid_cat_ohe.head()

shape of the df(x_valid_cat_ohe): (24842, 78)


Unnamed: 0,Age_31,Age_32,Age_33,Age_34,Age_35,Age_36,Age_37,Age_38,Age_39,Age_40,...,Income_4,Income_5,Income_6,Income_None,LinkedIn_Shared_0,LinkedIn_Shared_1,Online_Follower_0,Online_Follower_1,Twitter_Shared_0,Twitter_Shared_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [65]:
## Encode test category columns 
x_test_cat_ohe = onehotencoder.transform(test_data[cat_attr]).toarray()
x_test_cat_ohe = pd.DataFrame(x_test_cat_ohe, columns=ohe_cat_col_names)
print_shape(x_test_cat_ohe)
x_test_cat_ohe.head()

shape of the df(x_test_cat_ohe): (35249, 78)


Unnamed: 0,Age_31,Age_32,Age_33,Age_34,Age_35,Age_36,Age_37,Age_38,Age_39,Age_40,...,Income_4,Income_5,Income_6,Income_None,LinkedIn_Shared_0,LinkedIn_Shared_1,Online_Follower_0,Online_Follower_1,Twitter_Shared_0,Twitter_Shared_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [66]:
## Encode full train category columns 
x_full_cat_ohe = onehotencoder.transform(X[cat_attr]).toarray()
x_full_cat_ohe = pd.DataFrame(x_full_cat_ohe, columns=ohe_cat_col_names)
print_shape(x_full_cat_ohe)
x_full_cat_ohe.head()

shape of the df(x_full_cat_ohe): (75278, 78)


Unnamed: 0,Age_31,Age_32,Age_33,Age_34,Age_35,Age_36,Age_37,Age_38,Age_39,Age_40,...,Income_4,Income_5,Income_6,Income_None,LinkedIn_Shared_0,LinkedIn_Shared_1,Online_Follower_0,Online_Follower_1,Twitter_Shared_0,Twitter_Shared_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [67]:
X_train = pd.concat([x_train_cat_ohe, x_train_num], axis=1)
X_valid = pd.concat([x_valid_cat_ohe, x_valid_num], axis=1)
X_test = pd.concat([x_test_num, x_test_cat_ohe], axis=1)
X_train_full = pd.concat([x_full_num, x_full_cat_ohe], axis=1)
print_shape(X_train)
print_shape(X_valid)
print_shape(X_test)
print_shape(X_train_full)

shape of the df(X_train): (50436, 84)
shape of the df(X_valid): (24842, 84)
shape of the df(X_test): (35249, 84)
shape of the df(X_train_full): (75278, 84)


In [68]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_valid_scaled = pd.DataFrame(scaler.transform(X_valid), columns=X_valid.columns, index=X_valid.index)


# Model building

### Model: plain vanilla NB on scaled data

In [69]:
from sklearn.naive_bayes import GaussianNB
# model object
nb = GaussianNB()
# fit on train
nb.fit(X_train_scaled, y_train)
# predict on train
y_train_pred = nb.predict(X_train_scaled)
# predict on test
y_valid_pred = nb.predict(X_valid_scaled)
# print report
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# print rocauc score - train
y_train_proba = nb.predict_proba(X_train_scaled)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = nb.predict_proba(X_valid_scaled)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.84      0.70      0.76     36673
           1       0.45      0.65      0.53     13763

    accuracy                           0.69     50436
   macro avg       0.64      0.67      0.65     50436
weighted avg       0.73      0.69      0.70     50436


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.84      0.70      0.76     18071
           1       0.45      0.65      0.53      6771

    accuracy                           0.68     24842
   macro avg       0.64      0.67      0.65     24842
weighted avg       0.73      0.68      0.70     24842

roc_auc_score for train: 0.742223872427412
roc_auc_score for valid: 0.7401369265478139


### Model: plain vanilla NB

In [70]:
from sklearn.naive_bayes import GaussianNB
# model object
nb = GaussianNB()
# fit on train
nb.fit(X_train, y_train)
# predict on train
y_train_pred = nb.predict(X_train)
# predict on test
y_valid_pred = nb.predict(X_valid)
# print report
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# print rocauc score - train
y_train_proba = nb.predict_proba(X_train)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = nb.predict_proba(X_valid)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.78      0.81      0.80     36673
           1       0.44      0.38      0.41     13763

    accuracy                           0.70     50436
   macro avg       0.61      0.60      0.60     50436
weighted avg       0.69      0.70      0.69     50436


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.78      0.81      0.80     18071
           1       0.44      0.39      0.42      6771

    accuracy                           0.70     24842
   macro avg       0.61      0.60      0.61     24842
weighted avg       0.69      0.70      0.69     24842

roc_auc_score for train: 0.7422194839468184
roc_auc_score for valid: 0.7401297754444859


### Plain vanilla Logistic Regression

In [71]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1500)
log_reg.fit(X_train, y_train)
y_train_pred = log_reg.predict(X_train)
y_valid_pred = log_reg.predict(X_valid)
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# keeping non_scaled since there wasn't big diff
# print rocauc score - train
y_train_proba = log_reg.predict_proba(X_train)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = log_reg.predict_proba(X_valid)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     36673
           1       0.60      0.51      0.55     13763

    accuracy                           0.77     50436
   macro avg       0.71      0.69      0.70     50436
weighted avg       0.76      0.77      0.77     50436


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     18071
           1       0.60      0.52      0.56      6771

    accuracy                           0.77     24842
   macro avg       0.71      0.69      0.70     24842
weighted avg       0.77      0.77      0.77     24842

roc_auc_score for train: 0.8063125911874012
roc_auc_score for valid: 0.8063433449352015


### Plain vanilla Logistic Regression with scaled data

In [72]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1500)
log_reg.fit(X_train_scaled, y_train)
y_train_pred = log_reg.predict(X_train_scaled)
y_valid_pred = log_reg.predict(X_valid_scaled)
# print classification score report
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# print rocauc score - train
y_train_proba = log_reg.predict_proba(X_train_scaled)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = log_reg.predict_proba(X_valid_scaled)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     36673
           1       0.60      0.51      0.55     13763

    accuracy                           0.77     50436
   macro avg       0.71      0.69      0.70     50436
weighted avg       0.76      0.77      0.77     50436


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     18071
           1       0.60      0.52      0.56      6771

    accuracy                           0.77     24842
   macro avg       0.71      0.69      0.70     24842
weighted avg       0.77      0.77      0.77     24842

roc_auc_score for train: 0.8064543183074023
roc_auc_score for valid: 0.8060497533233036


### RandomForest plain vanilla

In [73]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100, oob_score=True)
random_forest.fit(X_train, y_train)
y_train_pred = random_forest.predict(X_train)
y_valid_pred = random_forest.predict(X_valid)
# print classification score report
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# print rocauc score - train
y_train_proba = random_forest.predict_proba(X_train_scaled)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = random_forest.predict_proba(X_valid_scaled)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')



               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.86      0.91      0.88     36673
           1       0.71      0.60      0.65     13763

    accuracy                           0.82     50436
   macro avg       0.78      0.75      0.77     50436
weighted avg       0.82      0.82      0.82     50436


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.82      0.87      0.84     18071
           1       0.59      0.50      0.54      6771

    accuracy                           0.77     24842
   macro avg       0.70      0.68      0.69     24842
weighted avg       0.76      0.77      0.76     24842

roc_auc_score for train: 0.8103320491040903
roc_auc_score for valid: 0.7955992412507742


### RandomForest plain vanilla with GridCV

In [74]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

In [75]:
grid_rf = {'criterion':['gini', 'entropy'], 'max_depth': [16, 18, 20], 'min_samples_split': [4], 
           'class_weight':['balanced'], 'min_impurity_decrease':[0.002]}
rf_model = RandomForestClassifier(n_estimators=300)
gs_obj_rf = GridSearchCV(estimator=rf_model, param_grid=grid_rf, cv = kfold, n_jobs=-1, verbose=5)
gs_obj_rf.fit(X_train_scaled, y_train)
y_train_pred = gs_obj_rf.predict(X_train_scaled)
y_valid_pred = gs_obj_rf.predict(X_valid_scaled)
# print classification score report
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# print rocauc score - train
y_train_proba = gs_obj_rf.predict_proba(X_train_scaled)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = gs_obj_rf.predict_proba(X_valid_scaled)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  30 | elapsed:   14.7s remaining:   16.8s
[Parallel(n_jobs=-1)]: Done  21 out of  30 | elapsed:   15.6s remaining:    6.7s
[Parallel(n_jobs=-1)]: Done  28 out of  30 | elapsed:   19.3s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   19.3s finished



               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.87      0.80      0.83     36673
           1       0.56      0.70      0.62     13763

    accuracy                           0.77     50436
   macro avg       0.72      0.75      0.73     50436
weighted avg       0.79      0.77      0.78     50436


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.88      0.80      0.83     18071
           1       0.56      0.70      0.63      6771

    accuracy                           0.77     24842
   macro avg       0.72      0.75      0.73     24842
weighted avg       0.79      0.77      0.78     24842

roc_auc_score for train: 0.7986122808084953
roc_auc_score for valid: 0.795469532495435


In [76]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

model = BaggingClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores_train = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
scores_valid = cross_val_score(model, X_valid, y_valid, scoring='roc_auc', cv=cv, n_jobs=-1)


# summarize performance
print('Mean ROC AUC: %.3f' % np.mean(scores_train))
print('Mean ROC AUC: %.3f' % np.mean(scores_valid))

Mean ROC AUC: 0.788
Mean ROC AUC: 0.777


In [77]:
gs_obj_rf.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=18,
                       min_impurity_decrease=0.002, min_samples_split=4,
                       n_estimators=300)

In [78]:
grid_rf = {'criterion':['gini', 'entropy'], 'max_depth': [16, 18, 20], 'min_samples_split': [4], 
           'class_weight':['balanced'], 'min_impurity_decrease':[0.002]}
rf_model = RandomForestClassifier(n_estimators=300)
gs_obj_rf = GridSearchCV(estimator=rf_model, param_grid=grid_rf, cv = kfold, n_jobs=-1, verbose=5)
gs_obj_rf.fit(X_train_full, y)

y_test_proba = gs_obj_rf.predict_proba(X_test)[:,1]
prediction = pd.DataFrame(y_test_proba, columns=['Outcome'])
patient_id = pd.DataFrame(test_raw['Patient_ID'], columns=['Patient_ID'])
health_camp_id = pd.DataFrame(test_raw['Health_Camp_ID'], columns=['Health_Camp_ID'])
output = pd.concat([patient_id, health_camp_id, prediction], axis=1).to_csv('~/Downloads/VA_Data/output/prediction_5.csv', index=None)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  30 | elapsed:   19.7s remaining:   22.5s
[Parallel(n_jobs=-1)]: Done  21 out of  30 | elapsed:   21.4s remaining:    9.2s
[Parallel(n_jobs=-1)]: Done  28 out of  30 | elapsed:   26.1s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   26.3s finished


In [79]:
log_reg = LogisticRegression(max_iter=1500)
log_reg.fit(X_train_full, y)

y_test_proba = log_reg.predict_proba(X_test)[:,1]
prediction = pd.DataFrame(y_test_proba, columns=['Outcome'])
patient_id = pd.DataFrame(test_raw['Patient_ID'], columns=['Patient_ID'])
health_camp_id = pd.DataFrame(test_raw['Health_Camp_ID'], columns=['Health_Camp_ID'])
output = pd.concat([patient_id, health_camp_id, prediction], axis=1).to_csv('~/Downloads/VA_Data/output/prediction_6.csv', index=None)

In [80]:
# grid_rf = {'criterion':['entropy', 'gini'], 'max_depth': [40, 45, 50], 'min_samples_split': [8, 10, 15], 
#            'warm_start': [True], 'min_impurity_decrease':[0.0002, 0.0001]}
# rf_model = RandomForestClassifier(n_estimators=100)
# gs_obj_rf = GridSearchCV(estimator=rf_model, param_grid=grid_rf, cv = kfold, n_jobs=-1, verbose=5)

# gs_obj_rf.fit(X_train_full, y)
# y_test_proba = gs_obj_rf.predict_proba(X_test)[:,1]
# prediction = pd.DataFrame(y_test_proba, columns=['Outcome'])
# patient_id = pd.DataFrame(test_raw['Patient_ID'], columns=['Patient_ID'])
# health_camp_id = pd.DataFrame(test_raw['Health_Camp_ID'], columns=['Health_Camp_ID'])
# output = pd.concat([patient_id, health_camp_id, prediction], axis=1).to_csv('~/Downloads/VA_Data/output/prediction_4.csv', index=None)

### 1. Up-sample Minority Class


In [86]:
from sklearn.utils import resample
df_majority = train_data[train_data.outcome_favourable==0]
df_minority = train_data[train_data.outcome_favourable==1]

df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=54744,    # to match majority class
                                 random_state=123) # reproducible results
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.outcome_favourable.value_counts()

1    54744
0    54744
Name: outcome_favourable, dtype: int64

In [87]:
y = df_upsampled.outcome_favourable
X = df_upsampled.drop('outcome_favourable', axis=1)

In [88]:
X.dtypes

Var1                  int64
Var2                  int64
Var3                  int64
Var4                  int64
Var5                  int64
Category1          category
Category2          category
Category3          category
Online_Follower    category
LinkedIn_Shared    category
Twitter_Shared     category
Facebook_Shared    category
Income             category
Education_Score     float64
Age                category
dtype: object

In [89]:
cat_attr = list(X.select_dtypes('category').columns)
num_attr = list(X.columns.difference(cat_attr))
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
x_train_num = X_train[num_attr].reset_index()
x_train_num.drop('index', axis=1, inplace=True)
x_train_num.head()
x_valid_num = X_valid[num_attr].reset_index()
x_valid_num.drop('index', axis=1, inplace=True)
x_valid_num.head()

Unnamed: 0,Education_Score,Var1,Var2,Var3,Var4,Var5
0,0.0,0,0,0,0,0
1,79.0,5,0,0,0,3
2,0.0,0,0,0,0,0
3,0.0,0,0,0,0,0
4,83.0,0,0,0,0,0


In [91]:
from sklearn.preprocessing import OneHotEncoder
## LabelEncoder
onehotencoder = OneHotEncoder(handle_unknown='ignore')
## Fit method
# onehotencoder = onehotencoder.fit(X_train[cat_attr])
onehotencoder = onehotencoder.fit(X_train[cat_attr])
## Get names for new columns
ohe_cat_col_names = onehotencoder.get_feature_names(cat_attr)
## Print the above columns names
ohe_cat_col_names

array(['Category1_First', 'Category1_Second', 'Category1_Third',
       'Category2_A', 'Category2_B', 'Category2_C', 'Category2_D',
       'Category2_E', 'Category2_F', 'Category2_G', 'Category3_1',
       'Category3_2', 'Online_Follower_0', 'Online_Follower_1',
       'LinkedIn_Shared_0', 'LinkedIn_Shared_1', 'Twitter_Shared_0',
       'Twitter_Shared_1', 'Facebook_Shared_0', 'Facebook_Shared_1',
       'Income_0', 'Income_1', 'Income_2', 'Income_3', 'Income_4',
       'Income_5', 'Income_6', 'Income_None', 'Age_31', 'Age_32',
       'Age_33', 'Age_34', 'Age_35', 'Age_36', 'Age_37', 'Age_38',
       'Age_39', 'Age_40', 'Age_41', 'Age_42', 'Age_43', 'Age_44',
       'Age_45', 'Age_46', 'Age_47', 'Age_48', 'Age_49', 'Age_50',
       'Age_51', 'Age_52', 'Age_53', 'Age_54', 'Age_55', 'Age_56',
       'Age_57', 'Age_58', 'Age_59', 'Age_60', 'Age_61', 'Age_62',
       'Age_63', 'Age_64', 'Age_65', 'Age_66', 'Age_67', 'Age_68',
       'Age_69', 'Age_70', 'Age_71', 'Age_72', 'Age_73', 'Age_74

In [92]:
## Encode x_train category columns 
x_train_cat_ohe = onehotencoder.transform(X_train[cat_attr]).toarray()
x_train_cat_ohe = pd.DataFrame(x_train_cat_ohe, columns=ohe_cat_col_names)
print_shape(x_train_cat_ohe)
x_train_cat_ohe.head()

shape of the df(x_train_cat_ohe): (87590, 78)


Unnamed: 0,Category1_First,Category1_Second,Category1_Third,Category2_A,Category2_B,Category2_C,Category2_D,Category2_E,Category2_F,Category2_G,...,Age_71,Age_72,Age_73,Age_74,Age_75,Age_76,Age_77,Age_78,Age_80,Age_None
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
## Encode x_train category columns 
x_valid_cat_ohe = onehotencoder.transform(X_valid[cat_attr]).toarray()
x_valid_cat_ohe = pd.DataFrame(x_valid_cat_ohe, columns=ohe_cat_col_names)
print_shape(x_valid_cat_ohe)
x_valid_cat_ohe.head()

shape of the df(x_valid_cat_ohe): (21898, 78)


Unnamed: 0,Category1_First,Category1_Second,Category1_Third,Category2_A,Category2_B,Category2_C,Category2_D,Category2_E,Category2_F,Category2_G,...,Age_71,Age_72,Age_73,Age_74,Age_75,Age_76,Age_77,Age_78,Age_80,Age_None
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
X_train = pd.concat([x_train_cat_ohe, x_train_num], axis=1)
X_valid = pd.concat([x_valid_cat_ohe, x_valid_num], axis=1)
print_shape(X_train)
print_shape(X_valid)

shape of the df(X_train): (87590, 84)
shape of the df(X_valid): (21898, 84)


In [95]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1500)
log_reg.fit(X_train, y_train)
y_train_pred = log_reg.predict(X_train)
y_valid_pred = log_reg.predict(X_valid)
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# keeping non_scaled since there wasn't big diff
# print rocauc score - train
y_train_proba = log_reg.predict_proba(X_train)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = log_reg.predict_proba(X_valid)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.76      0.77      0.76     43665
           1       0.76      0.75      0.76     43925

    accuracy                           0.76     87590
   macro avg       0.76      0.76      0.76     87590
weighted avg       0.76      0.76      0.76     87590


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.76      0.76      0.76     11079
           1       0.75      0.75      0.75     10819

    accuracy                           0.75     21898
   macro avg       0.75      0.75      0.75     21898
weighted avg       0.75      0.75      0.75     21898

roc_auc_score for train: 0.8073016687238385
roc_auc_score for valid: 0.8036496970838568


In [96]:
grid_rf = {'criterion':['gini', 'entropy'], 'max_depth': [16, 18, 20], 'min_samples_split': [4], 
           'class_weight':['balanced'], 'min_impurity_decrease':[0.002]}
rf_model = RandomForestClassifier(n_estimators=300)
gs_obj_rf = GridSearchCV(estimator=rf_model, param_grid=grid_rf, cv = kfold, n_jobs=-1, verbose=5)
gs_obj_rf.fit(X_train, y_train)
y_train_pred = gs_obj_rf.predict(X_train)
y_valid_pred = gs_obj_rf.predict(X_valid)
# print classification score report
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# print rocauc score - train
y_train_proba = gs_obj_rf.predict_proba(X_train)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = gs_obj_rf.predict_proba(X_valid)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  30 | elapsed:   31.1s remaining:   35.5s
[Parallel(n_jobs=-1)]: Done  21 out of  30 | elapsed:   33.7s remaining:   14.4s
[Parallel(n_jobs=-1)]: Done  28 out of  30 | elapsed:   41.0s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   41.0s finished



               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.73      0.80      0.76     43665
           1       0.78      0.70      0.74     43925

    accuracy                           0.75     87590
   macro avg       0.75      0.75      0.75     87590
weighted avg       0.75      0.75      0.75     87590


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.73      0.80      0.76     11079
           1       0.77      0.70      0.73     10819

    accuracy                           0.75     21898
   macro avg       0.75      0.75      0.75     21898
weighted avg       0.75      0.75      0.75     21898

roc_auc_score for train: 0.8060904784128604
roc_auc_score for valid: 0.8025595171635822


In [98]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=100
                         , learning_rate=1)
boosting_model = abc.fit(X_train, y_train)

y_train_pred = boosting_model.predict(X_train)
y_valid_pred = boosting_model.predict(X_valid)
# print classification score report
classification_report_train_test(y_train, y_train_pred, y_valid, y_valid_pred)
# print rocauc score - train
y_train_proba = boosting_model.predict_proba(X_train)
y_train_proba = [p[1] for p in y_train_proba]
print(f'roc_auc_score for train: {roc_auc_score(y_train, y_train_proba)}')
# print rocauc score - validation
y_valid_proba = boosting_model.predict_proba(X_valid)
y_valid_proba = [p[1] for p in y_valid_proba]
print(f'roc_auc_score for valid: {roc_auc_score(y_valid, y_valid_proba)}')


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.75      0.77      0.76     43665
           1       0.77      0.75      0.76     43925

    accuracy                           0.76     87590
   macro avg       0.76      0.76      0.76     87590
weighted avg       0.76      0.76      0.76     87590


            CLASSIFICATION REPORT FOR VALIDATION DATA
            
              precision    recall  f1-score   support

           0       0.75      0.76      0.76     11079
           1       0.75      0.74      0.75     10819

    accuracy                           0.75     21898
   macro avg       0.75      0.75      0.75     21898
weighted avg       0.75      0.75      0.75     21898

roc_auc_score for train: 0.8082575778579096
roc_auc_score for valid: 0.8045023196805845


In [101]:
abc = AdaBoostClassifier(n_estimators=100, learning_rate=1)
boosting_model = abc.fit(X_train_full, y)

y_test_proba = boosting_model.predict_proba(X_test)[:,1]
prediction = pd.DataFrame(y_test_proba, columns=['Outcome'])
patient_id = pd.DataFrame(test_raw['Patient_ID'], columns=['Patient_ID'])
health_camp_id = pd.DataFrame(test_raw['Health_Camp_ID'], columns=['Health_Camp_ID'])
output = pd.concat([patient_id, health_camp_id, prediction], axis=1).to_csv('~/Downloads/VA_Data/output/prediction_7.csv', index=None)

ValueError: Found input variables with inconsistent numbers of samples: [75278, 109488]