In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv('train_file.csv')
df_test = pd.read_csv('test_file.csv')

In [3]:
df_train['hatype'].value_counts()

1    1704
0    1202
Name: hatype, dtype: int64

### TARGET VARIABLE => hatype : Type of Migraine experienced - Aura =0, No Aura=1

In [4]:
df_train.head()

Unnamed: 0,patient_id,time,dos,age,airq,medication,headache,sex,hatype
0,1,-7.0,757.0,30.0,18.0,continuing,yes,female,0
1,2,-11.0,753.0,30.0,9.0,continuing,yes,female,0
2,3,-6.0,758.0,30.0,19.0,continuing,yes,female,0
3,4,29.0,793.0,30.0,21.0,continuing,yes,female,0
4,5,22.0,786.0,30.0,21.0,continuing,yes,female,0


In [5]:
df_test.head()

Unnamed: 0,patient_id,time,dos,age,airq,medication,headache,sex
0,1,23.0,262,54,20.0,continuing,yes,female
1,2,31.0,795,30,15.0,continuing,yes,female
2,3,28.0,792,30,22.0,continuing,no,female
3,4,26.0,790,30,12.0,continuing,yes,female
4,5,34.0,798,30,20.0,continuing,yes,female


In [6]:
df_train['medication'].value_counts()

continuing    1674
reduced        681
none           535
Name: medication, dtype: int64

In [7]:
df_test['medication'].value_counts()

continuing    691
reduced       297
Name: medication, dtype: int64

##### Train data has an extra subcategory "none" for the variable 'medication'. This can create a problem while predicting values over test data if we convert the "medication" variable to dummy variables. So, we will convert these "none" values to nan values and then impute these using mode. You can choose any other way to handle such a case where there are any extra subcategories in any categorical variable for train dataset.

In [8]:
df_train.loc[df_train['medication'] == 'none','medication'] = np.nan

In [9]:
df_train['medication'].value_counts()

continuing    1674
reduced        681
Name: medication, dtype: int64

### Checking datatypes of the variables

In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906 entries, 0 to 2905
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   patient_id  2906 non-null   int64  
 1   time        2899 non-null   float64
 2   dos         2901 non-null   float64
 3   age         2898 non-null   float64
 4   airq        2905 non-null   float64
 5   medication  2355 non-null   object 
 6   headache    2905 non-null   object 
 7   sex         2906 non-null   object 
 8   hatype      2906 non-null   int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 204.5+ KB


### Checking for null values

In [11]:
df_train.isnull().sum()

patient_id      0
time            7
dos             5
age             8
airq            1
medication    551
headache        1
sex             0
hatype          0
dtype: int64

In [12]:
df_test.isnull().sum()

patient_id     0
time           2
dos            0
age            0
airq           0
medication    10
headache       1
sex            0
dtype: int64

### Imputing categorical values

In [13]:
df_train.loc[df_train['medication'].isnull() == True,'medication'] = df_train['medication'].mode()[0]
df_test.loc[df_test['medication'].isnull() == True,'medication'] = df_train['medication'].mode()[0]

df_train.loc[df_train['headache'].isnull() == True,'headache'] = df_train['headache'].mode()[0]
df_test.loc[df_test['headache'].isnull() == True,'headache'] = df_train['headache'].mode()[0]

df_train.loc[df_train['sex'].isnull() == True,'sex'] = df_train['sex'].mode()[0]
df_test.loc[df_test['sex'].isnull() == True,'sex'] = df_train['sex'].mode()[0]

### Dropping unnecessary columns

In [14]:
df_train.drop('patient_id',axis = 1,inplace = True)
test_file_ids = df_test['patient_id'] ## necessary step
df_test.drop('patient_id',axis = 1,inplace = True)

### Encoding Categorical values

In [15]:
df_train = pd.get_dummies(df_train,drop_first = True)
df_train.head()

Unnamed: 0,time,dos,age,airq,hatype,medication_reduced,headache_yes,sex_male
0,-7.0,757.0,30.0,18.0,0,0,1,0
1,-11.0,753.0,30.0,9.0,0,0,1,0
2,-6.0,758.0,30.0,19.0,0,0,1,0
3,29.0,793.0,30.0,21.0,0,0,1,0
4,22.0,786.0,30.0,21.0,0,0,1,0


In [16]:
df_test = pd.get_dummies(df_test,drop_first = True)
df_test.head()

Unnamed: 0,time,dos,age,airq,medication_reduced,headache_yes,sex_male
0,23.0,262,54,20.0,0,1,0
1,31.0,795,30,15.0,0,1,0
2,28.0,792,30,22.0,0,0,0
3,26.0,790,30,12.0,0,1,0
4,34.0,798,30,20.0,0,1,0


In [17]:
cols = df_train.columns

### Imputing continuous values

In [18]:
from sklearn.impute import SimpleImputer
SI = SimpleImputer(strategy='median')
df_train = SI.fit_transform(df_train)
df_train = pd.DataFrame(df_train,columns=cols)

df_test.loc[df_test['time'].isnull() == True,'time'] = df_train['time'].median()

### Splitting training dataset into train and validation set

In [19]:
X = df_train.drop(['hatype'],axis = 1)
y = df_train['hatype']

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X,y,test_size = 0.2,random_state = 0)

### Model training - Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='gini',max_depth=10,random_state=0)
dt_model.fit(X_train,y_train)

y_pred_train = dt_model.predict(X_train)
y_pred_val = dt_model.predict(X_validation)

In [22]:
from sklearn import metrics
dt_acc_train = metrics.accuracy_score(y_train,y_pred_train)
dt_acc_val = metrics.accuracy_score(y_validation,y_pred_val)
print("Train Accuracy:",dt_acc_train)
print("Validation Accuracy:",dt_acc_val)
print()

dt_roc_train = metrics.roc_auc_score(y_train,y_pred_train)
dt_roc_val = metrics.roc_auc_score(y_validation,y_pred_val)
print("Train ROC AUC score:",dt_roc_train)
print("Validation ROC AUC score:",dt_roc_val)
print()

dt_clr_train = metrics.classification_report(y_train,y_pred_train)
dt_clr_val = metrics.classification_report(y_validation,y_pred_val)
print("Train Classification Report:\n",dt_clr_train)
print("Validation Classification Report:\n",dt_clr_val)

Train Accuracy: 0.898881239242685
Validation Accuracy: 0.8917525773195877

Train ROC AUC score: 0.886778009107069
Validation ROC AUC score: 0.8758522727272728

Train Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.81      0.87       972
         1.0       0.88      0.96      0.92      1352

    accuracy                           0.90      2324
   macro avg       0.91      0.89      0.89      2324
weighted avg       0.90      0.90      0.90      2324

Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.80      0.85       230
         1.0       0.88      0.95      0.91       352

    accuracy                           0.89       582
   macro avg       0.90      0.88      0.88       582
weighted avg       0.89      0.89      0.89       582



### Model training - Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=10,criterion='gini',max_depth=10,random_state=0)
rf_model.fit(X_train,y_train)

y_pred_train = rf_model.predict(X_train)
y_pred_val = rf_model.predict(X_validation)

In [24]:
from sklearn import metrics
dt_acc_train = metrics.accuracy_score(y_train,y_pred_train)
dt_acc_val = metrics.accuracy_score(y_validation,y_pred_val)
print("Train Accuracy:",dt_acc_train)
print("Validation Accuracy:",dt_acc_val)
print()

dt_roc_train = metrics.roc_auc_score(y_train,y_pred_train)
dt_roc_val = metrics.roc_auc_score(y_validation,y_pred_val)
print("Train ROC AUC score:",dt_roc_train)
print("Validation ROC AUC score:",dt_roc_val)
print()

dt_clr_train = metrics.classification_report(y_train,y_pred_train)
dt_clr_val = metrics.classification_report(y_validation,y_pred_val)
print("Train Classification Report:\n",dt_clr_train)
print("Validation Classification Report:\n",dt_clr_val)

Train Accuracy: 0.9707401032702238
Validation Accuracy: 0.9484536082474226

Train ROC AUC score: 0.9692134195339324
Validation ROC AUC score: 0.9453310276679842

Train Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.96      0.96       972
         1.0       0.97      0.98      0.97      1352

    accuracy                           0.97      2324
   macro avg       0.97      0.97      0.97      2324
weighted avg       0.97      0.97      0.97      2324

Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.93      0.93       230
         1.0       0.95      0.96      0.96       352

    accuracy                           0.95       582
   macro avg       0.95      0.95      0.95       582
weighted avg       0.95      0.95      0.95       582



### Predicting on test data using RF model and creating a submission file that needs to be uploaded to the GL Hackathon Platform

In [25]:
final_predictions = rf_model.predict(df_test)
submission=pd.DataFrame([test_file_ids,final_predictions]).T
submission.rename(columns={"Unnamed 0": "hatype"},inplace=True)
submission.to_csv('submission.csv',index = False)

##### Once this "submission.csv" file is created, you are supposed to submit it on the GL Hackathon Platform. Once it is submitted, the platform calculates the performance metric as per the hackathon  and your score will reflect on Leaderboard with your respective rank.