In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *


## Data Colllection

In [2]:
train = pd.read_csv('Train/Train.csv')
test = pd.read_csv('test_l0Auv8Q.csv')
health_camp = pd.read_csv('Train/Health_Camp_Detail.csv')
camp_1 = pd.read_csv('Train/First_Health_Camp_Attended.csv')
camp_2 = pd.read_csv('Train/Second_Health_Camp_Attended.csv')
camp_3 = pd.read_csv('Train/Third_Health_Camp_Attended.csv')
patient_profile = pd.read_csv('Train/Patient_Profile.csv')

In [3]:
camp_1.isnull().sum()

Patient_ID           0
Health_Camp_ID       0
Donation             0
Health_Score         0
Unnamed: 4        6218
dtype: int64

In [4]:
train = pd.merge(train, patient_profile, on = 'Patient_ID', how = 'left')
test = pd.merge(test, patient_profile, on = 'Patient_ID', how = 'left')

In [5]:
#Concat the health camp id and the patient id to get unique ids for target 
for c in [camp_1, camp_2, camp_3, train]:
  c['id'] = c['Patient_ID'].astype('str') + c['Health_Camp_ID'].astype('str')

# traget variable for camp 3 - visiting a stall
camp_3 = camp_3[camp_3['Number_of_stall_visited'] > 0]

# make new dataframe to contain the postive target 
all_patients_in_camp = pd.Series(camp_1['id'].tolist() + camp_2['id'].tolist() + camp_3['id'].tolist()).unique()

# get the target variable - initialise target variable to 0
train['target'] = 0
# whenever the id - concat of health id and patient id - is in the all patient list( positive outcome ), make target as 1
train.loc[train['id'].isin(all_patients_in_camp), 'target'] = 1

In [6]:
train = pd.merge(train, health_camp, on = 'Health_Camp_ID', how = 'left')
test = pd.merge(test, health_camp, on = 'Health_Camp_ID', how = 'left')

## Feature Engineering 

In [7]:
train['Category2'].nunique()

7

In [8]:
#converting None to np.nan
train[['Income', 'Education_Score', 'Age']] = train[['Income', 'Education_Score', 'Age']].apply(lambda x: x.str.replace('None', 'NaN').astype('float'))

In [9]:
test[['Income', 'Education_Score', 'Age']] = test[['Income', 'Education_Score', 'Age']].apply(lambda x: x.str.replace('None', 'NaN').astype('float'))

In [10]:
train[['City_Type','Employer_Category']]=train[['City_Type','Employer_Category']].apply(lambda x: pd.factorize(x)[0])
test[['City_Type','Employer_Category']]=test[['City_Type','Employer_Category']].apply(lambda x: pd.factorize(x)[0])

In [11]:
from sklearn.preprocessing import LabelEncoder
train['cat1_cat2'] = train['Category1'].astype(str) + '_' + train['Category2'].astype(str)
test['cat1_cat2'] = test['Category1'].astype(str) + '_' + test['Category2'].astype(str)

le = LabelEncoder()
train['cat1_cat2'] = le.fit_transform(train['cat1_cat2'])
test['cat1_cat2'] = le.transform(test['cat1_cat2'])

In [12]:


def getCountVar(compute_df, count_df, var_name, count_var="v1"):
    grouped_df = count_df.groupby(var_name, as_index=False).agg('size').reset_index()
    grouped_df.columns = [var_name, "var_count"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["var_count"])

for col in ["Patient_ID", "Health_Camp_ID"]:
    print("Count : ", col)
    full_df = pd.concat([train, test])
    train["Count_"+col] = getCountVar(train, full_df, col)
    test["Count_"+col] = getCountVar(test, full_df, col)



Count :  Patient_ID
Count :  Health_Camp_ID


In [13]:
for df_tmp in [train, test]:
  for c in ['Health_Camp_ID']:
    # mapper = train
    df_tmp[c + '_freq'] = df_tmp[c].map(df_tmp[c].value_counts(normalize=True))

In [14]:
cat1_col=pd.get_dummies(train['Category1'],drop_first=True)
train=pd.concat([train,cat1_col],axis=1)
train.drop(['Category1'],axis=1,inplace=True)

In [15]:
cat1_col=pd.get_dummies(test['Category1'],drop_first=True)
test=pd.concat([test,cat1_col],axis=1)
test.drop(['Category1'],axis=1,inplace=True)

In [16]:
cat2_col=pd.get_dummies(train['Category2'],drop_first=True)
train=pd.concat([train,cat2_col],axis=1)
train.drop(['Category2'],axis=1,inplace=True)

In [17]:
cat2_col=pd.get_dummies(test['Category2'],drop_first=True)
test=pd.concat([test,cat2_col],axis=1)
test.drop(['Category2'],axis=1,inplace=True)

In [18]:
city_type_col=pd.get_dummies(test['City_Type'],drop_first=True)
test=pd.concat([test,city_type_col],axis=1)
test.drop(['City_Type'],axis=1,inplace=True)

In [19]:
city_type_col=pd.get_dummies(train['City_Type'],drop_first=True)
train=pd.concat([train,city_type_col],axis=1)
train.drop(['City_Type'],axis=1,inplace=True)

In [20]:
emp_cat_col=pd.get_dummies(train['Employer_Category'],drop_first=True)
train=pd.concat([train,emp_cat_col],axis=1)
train.drop(['Employer_Category'],axis=1,inplace=True)

In [21]:
test.columns

Index([          'Patient_ID',       'Health_Camp_ID',    'Registration_Date',
                       'Var1',                 'Var2',                 'Var3',
                       'Var4',                 'Var5',      'Online_Follower',
            'LinkedIn_Shared',       'Twitter_Shared',      'Facebook_Shared',
                     'Income',      'Education_Score',                  'Age',
          'First_Interaction',    'Employer_Category',      'Camp_Start_Date',
              'Camp_End_Date',            'Category3',            'cat1_cat2',
           'Count_Patient_ID', 'Count_Health_Camp_ID',  'Health_Camp_ID_freq',
                     'Second',                'Third',                    'C',
                          'D',                    'E',                    'F',
                          'G',                      0,                      1,
                            2,                      3,                      4,
                            5,                      

In [22]:
emp_cat_col=pd.get_dummies(test['Employer_Category'],drop_first=True)
test=pd.concat([test,emp_cat_col],axis=1)
test.drop(['Employer_Category'],axis=1,inplace=True)

In [23]:
import datetime as dt

D_COL = 'Registration_Date'
for df_tmp in [train, test]:
  df_tmp[D_COL] = pd.to_datetime(df_tmp[D_COL])


train["First_Interaction"] = pd.to_datetime(train["First_Interaction"],dayfirst=True)
train["Days_Before_Interact"] = train["Registration_Date"] - train["First_Interaction"]
train["Days_Before_Interact"] = train["Days_Before_Interact"].dt.days
train["Days_Before_Interact"] = train["Days_Before_Interact"].fillna(99999).astype(int)

test["First_Interaction"] = pd.to_datetime(test["First_Interaction"],dayfirst=True)
test["Days_Before_Interact"] = test["Registration_Date"] - train["First_Interaction"]
test["Days_Before_Interact"] = test["Days_Before_Interact"].dt.days
test["Days_Before_Interact"] = test["Days_Before_Interact"].fillna(99999).astype(int)

train['Camp_Start_Date'] = pd.to_datetime(train['Camp_Start_Date'])
train['Camp_End_Date'] = pd.to_datetime(train['Camp_End_Date'])
train['total_days_of_campaign'] = (train['Camp_End_Date'] - train['Camp_Start_Date']).dt.days

test['Camp_Start_Date'] = pd.to_datetime(test['Camp_Start_Date'])
test['Camp_End_Date'] = pd.to_datetime(test['Camp_End_Date'])
test['total_days_of_campaign'] = (test['Camp_End_Date'] - test['Camp_Start_Date']).dt.days

train['reg_start_diff'] = (train['Camp_Start_Date'] - train['Registration_Date']).dt.days
test['reg_start_diff'] = (test['Camp_Start_Date'] - test['Registration_Date']).dt.days

train['days_for_camp_end'] = (train['Registration_Date'] - train['Camp_End_Date']).dt.days
test['days_for_camp_end'] = (test['Registration_Date'] - test['Camp_End_Date']).dt.days

train['point_in_camp'] = 1- train['days_for_camp_end']/train['total_days_of_campaign']
test['point_in_camp'] = 1- test['days_for_camp_end']/test['total_days_of_campaign']

train["Diff_CampStart_FirstInteraction"] = np.abs(train["Camp_Start_Date"] - train["First_Interaction"]).dt.days
test["Diff_CampStart_FirstInteraction"] = np.abs(test["Camp_Start_Date"] - test["First_Interaction"]).dt.days

In [24]:
test['B']=0

In [25]:
def impute_na(df, variable):

    # extract the random sample to fill the na
    random_sample = df[variable].dropna().sample(df[variable].isnull().sum(), random_state=0,replace=True)
    # pandas needs to have the same index in order to merge datasets
    random_sample.index = df[df[variable].isnull()].index
    df.loc[df[variable].isnull(), variable] = random_sample

In [26]:
impute_na(train,'Education_Score')
impute_na(train,'Age')
impute_na(test,'Education_Score')
impute_na(test,'Age')

In [27]:
test_min_date = test[D_COL].min()
### Getting a train and validation split, similar to test data

trn = train[train[D_COL] < test_min_date]
val = train[train[D_COL] >= test_min_date]

In [28]:
TARGET_COL = 'target'
features = [c for c in trn.columns if c not in ['Patient_ID', 'Health_Camp_ID', 'Registration_Date', TARGET_COL, 'id','Camp_Start_Date', 'Camp_End_Date', 'First_Interaction']]
len(features)

54

In [29]:
trn.fillna(-999,inplace=True)
val.fillna(-999,inplace=True)
test.fillna(-999,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [30]:
from lightgbm import LGBMClassifier

clf = LGBMClassifier(n_estimators=100, learning_rate=0.04, random_state=27)

clf.fit(trn[features], trn[TARGET_COL], eval_set=[(val[features], val[TARGET_COL])], verbose=50,
        eval_metric = 'auc', early_stopping_rounds = 10000)

preds = clf.predict_proba(test[features])[:, 1]

Training until validation scores don't improve for 10000 rounds
[50]	valid_0's auc: 0.824866	valid_0's binary_logloss: 0.487503
[100]	valid_0's auc: 0.824305	valid_0's binary_logloss: 0.485817
Did not meet early stopping. Best iteration is:
[60]	valid_0's auc: 0.825227	valid_0's binary_logloss: 0.485629


In [31]:
ss = pd.read_csv('sample_submmission.csv')
ss['Outcome'] = preds
SUB_FILE_NAME = 'submission_jantahack_healthcare.csv'
ss.to_csv(SUB_FILE_NAME, index=False)


In [33]:
from sklearn.linear_model import LogisticRegression


lr = LogisticRegression(max_iter=4000,fit_intercept=True,C=0.7,class_weight={1:0.727224,0:0.272776})
lr.fit(trn[features], trn[TARGET_COL])


LogisticRegression(C=0.7, class_weight={0: 0.272776, 1: 0.727224},
                   max_iter=4000)

In [34]:
preds1 = lr.predict_proba(test[features])[:, 1]

In [36]:
sub = pd.read_csv('sample_submmission.csv')
SUB_FILE_NAME = 'submission_jantahack_healthcare_final.csv'
sub["Outcome"] = preds1*0.3 + preds*0.8
sub.to_csv(SUB_FILE_NAME, index=False)
