In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Importing Dataset

In [2]:
train = pd.read_csv("train.csv")
submission = pd.read_csv("sample_submmission.csv")
test = pd.read_csv("test.csv")

patient_info = pd.read_csv("Patient_Profile.csv")
camp_info = pd.read_csv("Health_Camp_Detail.csv")
first = pd.read_csv("First_Health_Camp_Attended.csv")
second = pd.read_csv("Second_Health_Camp_Attended.csv")
third = pd.read_csv("Third_Health_Camp_Attended.csv")

# Preparing train and test data

In [3]:
first.drop(['Unnamed: 4', 'Donation'], axis=1, inplace=True)
second.rename(columns = {'Health Score':'Health_Score'}, inplace = True)
third['Health_Score'] = third['Number_of_stall_visited']/7.0
third.drop(['Number_of_stall_visited', 'Last_Stall_Visited_Number'], axis=1, inplace=True)

combined_attended_camps = pd.concat([first, second, third], sort=False, ignore_index=True)

#-------------------------------------------

train_merged_data = pd.merge(train, patient_info, on='Patient_ID', how='left')
train_merged_data_2 = pd.merge(train_merged_data, camp_info, on='Health_Camp_ID', how='left')

test_merged_data = pd.merge(test, patient_info, on='Patient_ID', how='left')
test_final_merged_data = pd.merge(test_merged_data, camp_info, on='Health_Camp_ID', how='left')

#---------------------------------------------

train_final_merged_data = pd.merge(train_merged_data_2, combined_attended_camps, 
                                   on=['Patient_ID', 'Health_Camp_ID'], how='left')
train_final_merged_data.Health_Score.replace(np.nan, 0, inplace=True)

#-----------------------------------------------

#train_final_merged_data.shape, test_final_merged_data.shape

labels = train_final_merged_data.iloc[:, -1]
X_train = train_final_merged_data.iloc[:, :-1]
data = pd.concat([X_train, test_final_merged_data], sort=False, ignore_index=True)

**Modifying labels for classification**

In [4]:
m = len(labels)

for i in range(m-1):
    if labels[i] > 0.0:
        labels[i] = 1.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [5]:
data.Income.replace('None', 7, inplace=True)   # We are going to treat this column as categorical

#------------------------------------------------------

# Over 80% values are missing or same, that is why dropping them
data.drop(['Education_Score', 'Employer_Category', 'Category3'], axis=1, inplace=True) 

#------------------------------------------------------

data.City_Type.replace(np.nan, 'J', inplace=True) # We are going to treat this column as categorical

#------------------------------------------------------

data.Age.replace('None', 0, inplace=True)
data.Age = data.Age.astype(np.int64)

age = data.Age.copy()

refined_ages = []
l = len(age)

for i in range(l-1):
    if age[i] > 0:
        refined_ages.append(age[i])
        
refined_ages = pd.Series(refined_ages)
median = refined_ages.median()
#refined_ages.hist(bins=10)

data.Age.replace(0, median, inplace=True)

#-----------------------------------------------------------

data['Registration_Date'] = pd.to_datetime(data['Registration_Date'])
data['Camp_End_Date'] = pd.to_datetime(data['Camp_End_Date'])
data['Camp_Start_Date'] = pd.to_datetime(data['Camp_Start_Date'])
data['First_Interaction'] = pd.to_datetime(data['First_Interaction'])

data['decision_duration'] = (data['Registration_Date'] - data['First_Interaction']).dt.days
data['camp_duration'] = (data['Camp_End_Date'] - data['Camp_Start_Date']).dt.days

data.decision_duration.fillna(data.decision_duration.median(), inplace=True)

data.drop(['Registration_Date', 'Camp_End_Date', 'Camp_Start_Date', 'First_Interaction'], axis=1, inplace=True)

data.decision_duration = data.decision_duration.astype(np.float64)
data.camp_duration = data.camp_duration.astype(np.float64)

# Data Preprocessing

**Applying categorical encoding**

In [6]:
data_transformed = pd.get_dummies(data, columns=['Var3', 'Var4', 'Income', 'City_Type', 'Category1', 'Category2'])

**Splitting the data back**

In [7]:
final_train = data_transformed[:train_final_merged_data.shape[0]]
final_test = data_transformed[train_final_merged_data.shape[0]:]

In [8]:
from sklearn.model_selection import train_test_split

X_training, X_cv, y_training, y_cv = train_test_split(final_train, labels, test_size=0.2, random_state=21, shuffle=False)

In [9]:
X_training.shape, X_cv.shape

((60222, 50), (15056, 50))

In [10]:
X_training_identifiers = X_training.iloc[:, [0,1]]
X_cv_identifiers = X_cv.iloc[:, [0,1]]
X_training = X_training.iloc[:, 2:]
X_cv = X_cv.iloc[:, 2:]

final_test_identifiers = final_test.iloc[:, [0,1]]
final_test = final_test.iloc[:, 2:]

**Feature Scaling**

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

scaled_train_columns = sc.fit_transform(X_training.iloc[:, [0, 1, 2, 7, 8, 9]].values)
scaled_cv_columns = sc.transform(X_cv.iloc[:, [0, 1, 2, 7, 8, 9]].values)
scaled_test_columns = sc.transform(final_test.iloc[:, [0, 1, 2, 7, 8, 9]].values)

X_training.loc[:, ['Var1', 'Var2', 'Var5', 'Age', 'decision_duration', 'camp_duration']] = scaled_train_columns
X_cv.loc[:, ['Var1', 'Var2', 'Var5', 'Age', 'decision_duration', 'camp_duration']] = scaled_cv_columns
final_test.loc[:, ['Var1', 'Var2', 'Var5', 'Age', 'decision_duration', 'camp_duration']] =scaled_test_columns


# Training the data and getting results for cross-validation set

In [12]:
X_training.shape, X_cv.shape, final_test.shape

((60222, 48), (15056, 48), (35249, 48))

In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()

param_grid = [{'max_depth': [ 4, 8, 12, 16, 20],
               'min_samples_split': [10, 8, 6, 4, 2],
             }]

grid_search = GridSearchCV(dt_clf, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_training, y_training)

final_model = grid_search.best_estimator_

In [18]:
y_train_pred = final_model.predict(X_training)
train_roc_auc_score = roc_auc_score(y_training, y_train_pred)
print(train_roc_auc_score)

y_cv_pred = final_model.predict(X_cv)
cv_roc_auc_score = roc_auc_score(y_cv, y_cv_pred)
print(cv_roc_auc_score)

0.7307875251462969
0.7181335636936669


# Testing the model on Test Dataset

In [19]:
predictions_test = final_model.predict(final_test)
predictions_test = pd.Series(predictions_test)

frame = {'Patient_ID': test.Patient_ID, 'Health_Camp_ID':test.Health_Camp_ID, 
         'Outcome': predictions_test}
submission = pd.DataFrame(frame)
submission.to_csv('Decision_Tree_submission.csv', index=False)