### Data Collection And Preparation

### Importing the libraries

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost
import pickle as pkl
from sklearn.model_selection import cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings ("ignore")

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, keeping one encoder per column.

    ``columns`` lists the column names to encode. When it is ``None``, every
    column of the frame handed to a method is used instead.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode, or None for all


    def _selected(self, X):
        """Return the configured columns, or all columns of ``X`` if unset."""
        if self.columns is None:
            return X.columns
        return self.columns


    def _apply(self, X, method):
        """Return a copy of ``X`` with ``encoder.<method>`` applied per column."""
        result = X.copy()
        for name in self._selected(X):
            result[name] = getattr(self.encoders[name], method)(X[name])
        return result


    def fit(self, X, y=None):
        """Fit a fresh ``LabelEncoder`` on each selected column; return ``self``."""
        fitted = self.encoders = {}
        for name in self._selected(X):
            fitted[name] = LabelEncoder().fit(X[name])
        return self


    def transform(self, X):
        """Return a copy of ``X`` whose selected columns are encoded."""
        return self._apply(X, "transform")


    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded copy."""
        fitted_self = self.fit(X, y)
        return fitted_self.transform(X)


    def inverse_transform(self, X):
        """Return a copy of ``X`` whose selected columns are decoded again."""
        return self._apply(X, "inverse_transform")

## Read the Datasets

In [None]:
# Load the training split from a CSV in the working directory and preview it.
train = pd.read_csv('panic_disorder_dataset_training.csv')
train.head()

In [None]:
# Load the testing split from a CSV in the working directory and preview it.
test = pd.read_csv('panic_disorder_dataset_testing.csv')
test.head()

In [None]:
# Report (rows, columns) for both splits.
for label, frame in (('Train data shape:', train), ('Test data shape:', test)):
    print(label, frame.shape)

In [None]:
# Column names, dtypes and non-null counts of the training split.
train.info()

In [None]:
# Column names, dtypes and non-null counts of the testing split.
test.info()

## Handling Missing Values

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Number of missing values in each testing column.
test.isnull().sum()

In [None]:
# Per-column flag: does the training column contain any missing value?
train.isnull().any()

In [None]:
# Per-column flag: does the testing column contain any missing value?
test.isnull().any()

In [None]:
# Distinct values (including NaN, if present) of Medical History in the training split.
train["Medical History"].unique()

In [None]:
# Distinct values (including NaN, if present) of Psychiatric History in the training split.
train["Psychiatric History"].unique()

In [None]:
# Distinct values (including NaN, if present) of Substance Use in the training split.
train["Substance Use"].unique()

In [None]:
# Distinct values (including NaN, if present) of Medical History in the testing split.
test["Medical History"].unique()

In [None]:
# Distinct values (including NaN, if present) of Psychiatric History in the testing split.
test["Psychiatric History"].unique()

In [None]:
# Distinct values (including NaN, if present) of Substance Use in the testing split.
test["Substance Use"].unique()

### Fill null values

In [None]:
# Replace missing Medical History entries with the explicit category "none".
# The result is assigned back to the frame: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
train["Medical History"] = train["Medical History"].fillna("none")
train["Medical History"].unique()

In [None]:
# Replace missing Psychiatric History entries with the explicit category "none".
# The result is assigned back to the frame: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
train["Psychiatric History"] = train["Psychiatric History"].fillna("none")
train["Psychiatric History"].unique()

In [None]:
# Replace missing Substance Use entries with the explicit category "none".
# The result is assigned back to the frame: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
train["Substance Use"] = train["Substance Use"].fillna("none")
train["Substance Use"].unique()

In [None]:
# Replace missing Medical History entries with the explicit category "none".
# The result is assigned back to the frame: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
test["Medical History"] = test["Medical History"].fillna("none")
test["Medical History"].unique()

In [None]:
# Replace missing Psychiatric History entries with the explicit category "none".
# The result is assigned back to the frame: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
test["Psychiatric History"] = test["Psychiatric History"].fillna("none")
test["Psychiatric History"].unique()

In [None]:
# Replace missing Substance Use entries with the explicit category "none".
# The result is assigned back to the frame: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
test["Substance Use"] = test["Substance Use"].fillna("none")
test["Substance Use"].unique()

### Checking null values again

In [None]:
# Re-check the training split for missing values after filling.
train.isnull().any()

In [None]:
# Re-check the testing split for missing values after filling.
test.isnull().any()

## Handling Categorical Values

In [None]:
# Names of the training columns stored with dtype object (the categorical ones)
cat_cols = [name for name in train.columns if train[name].dtype == 'object']
cat_cols

In [None]:
#Label Encoding
multi = MultiColumnLabelEncoder(columns= cat_cols)

# Learn the category -> integer mapping from the training data only.
train = multi.fit_transform(train)

# Encode the test data with the mapping learned from train. Refitting on test
# could assign different integers to the same category and would leave `multi`
# holding test-fitted encoders for any later inverse_transform of train.
# NOTE: LabelEncoder.transform raises ValueError if test contains a category
# that never occurs in train.
test = multi.transform(test)

## SMOTE on Panic Disorder Diagnosis - Over Sampling The Data

In [None]:
# Class balance of the target in the training split, then the testing split.
for frame in (train, test):
    print(frame["Panic Disorder Diagnosis"].value_counts())

In [None]:
# Seeded so the synthetic minority samples (and every metric computed from
# them) are reproducible; the seed matches the one used for the classifiers.
smote=SMOTE(random_state=1234)

In [None]:
# Training features: every column except the row identifier and the target.
x_train = train.drop(columns=['Participant ID', 'Panic Disorder Diagnosis'])
# Training target.
y_train = train["Panic Disorder Diagnosis"]

In [None]:
# Testing features: every column except the row identifier and the target.
x_test = test.drop(columns=['Participant ID', 'Panic Disorder Diagnosis'])
# Testing target.
y_test = test["Panic Disorder Diagnosis"]

In [None]:
# Oversample the training data only; the test split is left untouched.
# NOTE(review): most features are label-encoded categories; plain SMOTE may
# synthesise fractional codes between them - consider SMOTENC; confirm.
x_res_train,y_res_train = smote.fit_resample(x_train,y_train)

In [None]:
# Class counts before and after resampling.
for labels in (y_train, y_res_train):
    print(labels.value_counts())

In [None]:
# Display the resampled training features.
x_res_train

In [None]:
# Display the resampled training target.
y_res_train

## Feature Selection

In [None]:
# Chi-squared test of every resampled feature against the target
f_p_values=chi2(x_res_train,y_res_train)
f_p_values
# The first array holds the chi-squared statistics and the second array the p-values.

In [None]:
# p-values of the chi-squared test, labelled with the feature names
p_values=pd.Series(f_p_values[1])
p_values.index= x_train.columns
# Rank by p-value (smallest, i.e. strongest association with the target, first).
# sort_index would only order the features alphabetically by name.
p_values.sort_values(ascending=True)
# The ranking is used to pick out the moderately important and least important feature/s.

# Exploratory Data Analysis

## Descriptive Statistics

In [None]:
# Decode the label-encoded training frame back to its original category values.
# NOTE(review): this only round-trips correctly while `multi` holds encoders
# whose mapping matches the one used to encode `train` - confirm.
train_invt = multi.inverse_transform(train)

# Summary statistics for every column, numeric and categorical alike.
train_invt.describe(include='all')

## Age

In [None]:
# Histogram of Age in the decoded training data.
plt.hist(train_invt['Age'])

In [None]:
# Age distribution as a density histogram with a KDE curve on top.
# sns.distplot is deprecated and scheduled for removal from seaborn;
# histplot(kde=True, stat="density") is its documented replacement.
sns.histplot(train_invt.Age, kde=True, stat="density")

## Gender

In [None]:
# Rows per Gender value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Gender'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Family History

In [None]:
# Rows per Family History value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Family History'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Personal History

In [None]:
# Rows per Personal History value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Personal History'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Current Stressors

In [None]:
# Rows per Current Stressors value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Current Stressors'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Symptoms

In [None]:
# Rows per Symptoms value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Symptoms'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Severity

In [None]:
# Rows per Severity value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Severity'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Impact on Life

In [None]:
# Rows per Impact on Life value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Impact on Life'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Demographics

In [None]:
# Rows per Demographics value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Demographics'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Medical History

In [None]:
# Rows per Medical History value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Medical History'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Psychiatric History

In [None]:
# Rows per Psychiatric History value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Psychiatric History'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Substance Use

In [None]:
# Rows per Substance Use value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Substance Use'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Coping Mechanisms

In [None]:
# Rows per Coping Mechanisms value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Coping Mechanisms'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Social Support

In [None]:
# Rows per Social Support value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Social Support'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Lifestyle Factors

In [None]:
# Rows per Lifestyle Factors value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Lifestyle Factors'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

## Panic Disorder Diagnosis

In [None]:
# Rows per target value, drawn as a bar chart with the count written on each bar.
y_axis = train_invt['Panic Disorder Diagnosis'].value_counts()
x_axis = y_axis.index.values
bars = plt.bar(x_axis,y_axis)
plt.bar_label(bars)

In [None]:
# Same target counts drawn with seaborn, using a green/red palette.
sns.countplot(data=train_invt,x='Panic Disorder Diagnosis',palette=['green','red'])


### Univariate Plots - All Variables in One Grid

In [None]:
plt.figure(figsize=(20,20))

# One panel per feature on a 5x3 grid: the first four columns are drawn as
# histograms, the remaining eleven as pie charts of their value counts.
hist_cols = ['Age', 'Gender', 'Family History', 'Personal History']
pie_cols = [
    'Current Stressors',
    'Symptoms',
    'Severity',
    'Impact on Life',
    'Demographics',
    'Medical History',
    'Psychiatric History',
    'Substance Use',
    'Coping Mechanisms',
    'Social Support',
    'Lifestyle Factors',
]

for panel, col in enumerate(hist_cols + pie_cols, start=1):
    plt.subplot(5, 3, panel)
    if col in hist_cols:
        plt.hist(train_invt[col])
    else:
        y_axis = train_invt[col].value_counts()
        plt.pie(y_axis, labels=y_axis.index)
    # Title each panel with its own column name, so a title can never drift
    # from the data it describes (the Severity and Impact on Life panels were
    # previously titled 'Age' and 'Severity').
    plt.title(col)



## Splitting the Data

In [None]:
# Shapes of the (resampled) training pair and of the testing pair.
for features, target in ((x_res_train, y_res_train), (x_test, y_test)):
    print(features.shape, target.shape)

In [None]:
# Display the resampled training features.
x_res_train

In [None]:
# Display the resampled training target.
y_res_train

## Writing a Function to Train the Models

In [None]:
#temp_x = x_res_train
#temp_y = y_res_train

def train_models_eval(x_res_train, y_res_train, fts, x_eval=None, y_eval=None):
    """Fit five baseline classifiers and print their hold-out metrics.

    For each model, in the order Random Forest, Decision Tree, KNN, Extra
    Trees and XGBoost, the model is fitted on ``x_res_train[fts]`` and the
    confusion matrix, classification report and accuracy score on the
    evaluation data are printed.

    Args:
        x_res_train: Training feature frame; only the columns in ``fts`` are used.
        y_res_train: Training target aligned with ``x_res_train``.
        fts: List of feature column names to train and evaluate on.
        x_eval: Evaluation feature frame. Defaults to the notebook-level ``x_test``.
        y_eval: Evaluation target. Defaults to the notebook-level ``y_test``.

    Returns:
        The fitted estimators as the tuple ``(rf, dtf, knn, etc, xgb)``.
    """
    # Fall back to the notebook's hold-out split so existing three-argument
    # calls behave exactly as before.
    if x_eval is None:
        x_eval = x_test
    if y_eval is None:
        y_eval = y_test

    # (banner, estimator) pairs in reporting and return order.
    candidates = [
        ("\n---›RANDOM FOREST", RandomForestClassifier(random_state=1234)),
        ("\n---›DECISION TREE", DecisionTreeClassifier(random_state=1234)),
        (" \n--->KNN", KNeighborsClassifier()),
        ("\n---›EXTRAS TREES CLASSIFIER", ExtraTreesClassifier(random_state=1234)),
        ("\n---›XGBOOST", xgboost.XGBClassifier()),
    ]

    # Select the feature columns once instead of once per model.
    train_features = x_res_train[fts]
    eval_features = x_eval[fts]

    for banner, model in candidates:
        print(banner)
        model.fit(train_features, y_res_train)
        y_pred = model.predict(eval_features)
        print(confusion_matrix(y_eval, y_pred))
        print(classification_report(y_eval, y_pred))
        print("SCORE:", model.score(eval_features, y_eval))

    rf, dtf, knn, etc, xgb = (model for _, model in candidates)
    return rf, dtf, knn, etc, xgb

In [None]:
# Feature columns fed to the models. 'Age' is not in this list.
fts = [
    'Gender',
    'Family History',
    'Personal History',
    'Current Stressors',
    'Symptoms',
    'Severity',
    'Impact on Life',
    'Demographics',
    'Medical History',
    'Psychiatric History',
    'Substance Use',
    'Coping Mechanisms',
    'Social Support',
    'Lifestyle Factors',
]

# Fit the five baseline models on the resampled data and keep the fitted estimators.
rf, dtf, knn, etc, xgb = train_models_eval(x_res_train, y_res_train, fts)

In [None]:
# Spot check: KNN prediction for the test row at position 7, passed as a one-row frame.
knn.predict(x_test[fts].iloc[[7]])

In [None]:
# True label of the test row at position 7. Positional lookup matches the
# .iloc-based feature selection used for the prediction, whereas y_test[7]
# is label-based and only agrees while the index is the default 0..n-1.
y_test.iloc[7]

## Build Model

In [None]:
# Persist the fitted decision tree. The context manager closes the file handle
# even if pickling fails, instead of leaving it to the garbage collector.
with open('dtf.pkl', 'wb') as model_file:
    pkl.dump(dtf, model_file)

# Hyperparameter Tuning

### Decision Tree

In [None]:
# Current (default) hyperparameters of the baseline decision tree.
dtf.get_params()

In [None]:
# Candidate hyperparameter values for the decision tree
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 2, 3],
    max_features=[None, 'sqrt', 'log2'],
)

# Base estimator, seeded like the baseline tree
dt_classifier = DecisionTreeClassifier(random_state=1234)

# Exhaustive search over the grid with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the resampled training data
grid_search.fit(x_res_train[fts], y_res_train)

# Report the winning combination and its mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score", grid_search.best_score_)


In [None]:
# Evaluate the refitted best decision tree on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test[fts])
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("SCORE:", best_model.score(x_test[fts], y_test))

### Random Forest

In [None]:
# Current (default) hyperparameters of the baseline random forest.
rf.get_params()

In [None]:
# Candidate hyperparameter values for the random forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Base estimator, seeded like the baseline forest
rf_classifier = RandomForestClassifier(random_state=1234)

# Exhaustive search over the grid with 5-fold cross-validation on 4 workers
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=4,
)

# Run the search on the resampled training data
grid_search.fit(x_res_train[fts], y_res_train)

# Report the winning combination and its mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score", grid_search.best_score_)


In [None]:
# Evaluate the refitted best random forest on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test[fts])
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("SCORE:", best_model.score(x_test[fts], y_test))

### XGBoost

In [None]:
# Current (default) hyperparameters of the baseline XGBoost model.
xgb.get_params()

In [None]:
# Candidate hyperparameter values for XGBoost
param_grid = dict(
    min_child_weight=[10, 20],
    gamma=[0, 1.5, 2.0],
    colsample_bytree=[0.6, 0.8, 0.9],
    max_depth=[4, 5, 6],
)

# Base estimator; this rebinds `xgb`, replacing the baseline model fitted earlier
xgb = xgboost.XGBClassifier(
    learning_rate=0.5,
    n_estimators=100,
    objective='binary:logistic',
    nthread=3,
)

# Accuracy-scored search over the grid, 5-fold cross-validation, 4 workers
fitmodel = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    refit=True,
    scoring='accuracy',
    n_jobs=4,
    verbose=3,
)

# Run the search on the resampled training data
fitmodel.fit(x_res_train[fts], y_res_train)

# Report the winning combination and its mean cross-validated score
print("Best Hyperparameters:", fitmodel.best_params_)
print("Best Score", fitmodel.best_score_)


In [None]:
# Evaluate the refitted best XGBoost model on the held-out test split.
best_model = fitmodel.best_estimator_
y_pred = best_model.predict(x_test[fts])
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("SCORE:", best_model.score(x_test[fts], y_test))

In [None]:
# Evaluate `grid_search.best_estimator_` on the held-out test split.
# NOTE(review): the most recent assignment to `grid_search` visible in this
# file is the Random Forest search (the XGBoost search is stored in
# `fitmodel`), so this cell repeats the Random Forest evaluation - confirm
# whether that is intended.
y_pred = grid_search.best_estimator_.predict(x_test[fts])
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("SCORE:",grid_search.best_estimator_.score(x_test[fts],y_test))