# Import libraries and Data

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.tree import DecisionTreeClassifier 
import xgboost
import pickle as pkl
from sklearn.model_selection import cross_val_score
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas FutureWarnings about deprecated usage; consider narrowing the filter.
warnings.filterwarnings ("ignore")

In [3]:
# Read the training split from disk and show its first rows as a sanity check.
train = pd.read_csv(filepath_or_buffer='panic_disorder_dataset_training.csv')
train.head(n=5)

Unnamed: 0,Participant ID,Age,Gender,Family History,Personal History,Current Stressors,Symptoms,Severity,Impact on Life,Demographics,Medical History,Psychiatric History,Substance Use,Coping Mechanisms,Social Support,Lifestyle Factors,Panic Disorder Diagnosis
0,1,38,Male,No,Yes,Moderate,Shortness of breath,Mild,Mild,Rural,Diabetes,Bipolar disorder,,Socializing,High,Sleep quality,0
1,2,51,Male,No,No,High,Panic attacks,Mild,Mild,Urban,Asthma,Anxiety disorder,Drugs,Exercise,High,Sleep quality,0
2,3,32,Female,Yes,No,High,Panic attacks,Mild,Significant,Urban,Diabetes,Depressive disorder,,Seeking therapy,Moderate,Exercise,0
3,4,64,Female,No,No,Moderate,Chest pain,Moderate,Moderate,Rural,Diabetes,,,Meditation,High,Exercise,0
4,5,31,Male,Yes,No,Moderate,Panic attacks,Mild,Moderate,Rural,Asthma,,Drugs,Seeking therapy,Low,Sleep quality,0


In [4]:
# Read the testing split from disk and show its first rows as a sanity check.
test = pd.read_csv(filepath_or_buffer='panic_disorder_dataset_testing.csv')
test.head(n=5)

Unnamed: 0,Participant ID,Age,Gender,Family History,Personal History,Current Stressors,Symptoms,Severity,Impact on Life,Demographics,Medical History,Psychiatric History,Substance Use,Coping Mechanisms,Social Support,Lifestyle Factors,Panic Disorder Diagnosis
0,1,41,Male,Yes,No,High,Shortness of breath,Mild,Mild,Urban,Diabetes,Bipolar disorder,Alcohol,Seeking therapy,Low,Exercise,0
1,2,20,Female,Yes,No,Low,Shortness of breath,Mild,Significant,Urban,Asthma,Anxiety disorder,Drugs,Exercise,High,Diet,0
2,3,32,Male,Yes,Yes,High,Panic attacks,Severe,Mild,Rural,Heart disease,Bipolar disorder,Drugs,Meditation,Moderate,Exercise,0
3,4,41,Female,Yes,Yes,Moderate,Shortness of breath,Moderate,Significant,Urban,Heart disease,Anxiety disorder,,Exercise,High,Sleep quality,0
4,5,36,Female,Yes,No,High,Chest pain,Severe,Significant,Rural,Asthma,Depressive disorder,,Seeking therapy,Low,Exercise,0


In [5]:
# Report (rows, columns) for each split, one line per frame.
for caption, frame in (('Train data shape:', train), ('Test data shape:', test)):
    print(caption, frame.shape)

Train data shape: (100000, 17)
Test data shape: (20000, 17)


In [6]:
# Summarise column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Participant ID            100000 non-null  int64 
 1   Age                       100000 non-null  int64 
 2   Gender                    100000 non-null  object
 3   Family History            100000 non-null  object
 4   Personal History          100000 non-null  object
 5   Current Stressors         100000 non-null  object
 6   Symptoms                  100000 non-null  object
 7   Severity                  100000 non-null  object
 8   Impact on Life            100000 non-null  object
 9   Demographics              100000 non-null  object
 10  Medical History           74827 non-null   object
 11  Psychiatric History       75079 non-null   object
 12  Substance Use             66626 non-null   object
 13  Coping Mechanisms         100000 non-null  object
 14  Socia

In [7]:
# Summarise column dtypes and non-null counts of the testing frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Participant ID            20000 non-null  int64 
 1   Age                       20000 non-null  int64 
 2   Gender                    20000 non-null  object
 3   Family History            20000 non-null  object
 4   Personal History          20000 non-null  object
 5   Current Stressors         20000 non-null  object
 6   Symptoms                  20000 non-null  object
 7   Severity                  20000 non-null  object
 8   Impact on Life            20000 non-null  object
 9   Demographics              20000 non-null  object
 10  Medical History           14999 non-null  object
 11  Psychiatric History       15011 non-null  object
 12  Substance Use             13383 non-null  object
 13  Coping Mechanisms         20000 non-null  object
 14  Social Support        

# Check null values

In [9]:
# Number of missing entries in each training column.
train.isna().sum()

Participant ID                  0
Age                             0
Gender                          0
Family History                  0
Personal History                0
Current Stressors               0
Symptoms                        0
Severity                        0
Impact on Life                  0
Demographics                    0
Medical History             25173
Psychiatric History         24921
Substance Use               33374
Coping Mechanisms               0
Social Support                  0
Lifestyle Factors               0
Panic Disorder Diagnosis        0
dtype: int64

In [10]:
# Number of missing entries in each testing column.
test.isna().sum()

Participant ID                 0
Age                            0
Gender                         0
Family History                 0
Personal History               0
Current Stressors              0
Symptoms                       0
Severity                       0
Impact on Life                 0
Demographics                   0
Medical History             5001
Psychiatric History         4989
Substance Use               6617
Coping Mechanisms              0
Social Support                 0
Lifestyle Factors              0
Panic Disorder Diagnosis       0
dtype: int64

In [11]:
# True for every training column that contains at least one missing entry.
train.isna().any()

Participant ID              False
Age                         False
Gender                      False
Family History              False
Personal History            False
Current Stressors           False
Symptoms                    False
Severity                    False
Impact on Life              False
Demographics                False
Medical History              True
Psychiatric History          True
Substance Use                True
Coping Mechanisms           False
Social Support              False
Lifestyle Factors           False
Panic Disorder Diagnosis    False
dtype: bool

In [12]:
# True for every testing column that contains at least one missing entry.
test.isna().any()

Participant ID              False
Age                         False
Gender                      False
Family History              False
Personal History            False
Current Stressors           False
Symptoms                    False
Severity                    False
Impact on Life              False
Demographics                False
Medical History              True
Psychiatric History          True
Substance Use                True
Coping Mechanisms           False
Social Support              False
Lifestyle Factors           False
Panic Disorder Diagnosis    False
dtype: bool

In [13]:
# Distinct values (NaN included) of the training "Medical History" column, in order of first appearance.
pd.unique(train["Medical History"])

array(['Diabetes', 'Asthma', nan, 'Heart disease'], dtype=object)

In [14]:
# Distinct values (NaN included) of the training "Psychiatric History" column, in order of first appearance.
pd.unique(train["Psychiatric History"])

array(['Bipolar disorder', 'Anxiety disorder', 'Depressive disorder', nan],
      dtype=object)

In [15]:
# Distinct values (NaN included) of the training "Substance Use" column, in order of first appearance.
pd.unique(train["Substance Use"])

array([nan, 'Drugs', 'Alcohol'], dtype=object)

In [16]:
# Distinct values (NaN included) of the testing "Medical History" column, in order of first appearance.
pd.unique(test["Medical History"])

array(['Diabetes', 'Asthma', 'Heart disease', nan], dtype=object)

In [17]:
# Distinct values (NaN included) of the testing "Psychiatric History" column, in order of first appearance.
pd.unique(test["Psychiatric History"])

array(['Bipolar disorder', 'Anxiety disorder', 'Depressive disorder', nan],
      dtype=object)

In [18]:
# Distinct values (NaN included) of the testing "Substance Use" column, in order of first appearance.
pd.unique(test["Substance Use"])

array(['Alcohol', 'Drugs', nan], dtype=object)

# Fill null values

In [20]:
# Treat missing "Medical History" entries as their own explicit "none" category.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: pandas deprecates that chained in-place pattern, and with Copy-on-Write
# (the default from pandas 3.0) it updates a temporary object and `train` keeps its NaNs.
train["Medical History"] = train["Medical History"].fillna("none")
train["Medical History"].unique()

array(['Diabetes', 'Asthma', 'none', 'Heart disease'], dtype=object)

In [21]:
# Treat missing "Psychiatric History" entries as their own explicit "none" category.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: pandas deprecates that chained in-place pattern, and with Copy-on-Write
# (the default from pandas 3.0) it updates a temporary object and `train` keeps its NaNs.
train["Psychiatric History"] = train["Psychiatric History"].fillna("none")
train["Psychiatric History"].unique()

array(['Bipolar disorder', 'Anxiety disorder', 'Depressive disorder',
       'none'], dtype=object)

In [22]:
# Treat missing "Substance Use" entries as their own explicit "none" category.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: pandas deprecates that chained in-place pattern, and with Copy-on-Write
# (the default from pandas 3.0) it updates a temporary object and `train` keeps its NaNs.
train["Substance Use"] = train["Substance Use"].fillna("none")
train["Substance Use"].unique()

array(['none', 'Drugs', 'Alcohol'], dtype=object)

In [23]:
# Treat missing "Medical History" entries as their own explicit "none" category.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: pandas deprecates that chained in-place pattern, and with Copy-on-Write
# (the default from pandas 3.0) it updates a temporary object and `test` keeps its NaNs.
test["Medical History"] = test["Medical History"].fillna("none")
test["Medical History"].unique()

array(['Diabetes', 'Asthma', 'Heart disease', 'none'], dtype=object)

In [24]:
# Treat missing "Psychiatric History" entries as their own explicit "none" category.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: pandas deprecates that chained in-place pattern, and with Copy-on-Write
# (the default from pandas 3.0) it updates a temporary object and `test` keeps its NaNs.
test["Psychiatric History"] = test["Psychiatric History"].fillna("none")
test["Psychiatric History"].unique()

array(['Bipolar disorder', 'Anxiety disorder', 'Depressive disorder',
       'none'], dtype=object)

In [25]:
# Treat missing "Substance Use" entries as their own explicit "none" category.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: pandas deprecates that chained in-place pattern, and with Copy-on-Write
# (the default from pandas 3.0) it updates a temporary object and `test` keeps its NaNs.
test["Substance Use"] = test["Substance Use"].fillna("none")
test["Substance Use"].unique()

array(['Alcohol', 'Drugs', 'none'], dtype=object)

# Check null values again

In [27]:
# Re-check after filling: every training column should now report False.
train.isna().any()

Participant ID              False
Age                         False
Gender                      False
Family History              False
Personal History            False
Current Stressors           False
Symptoms                    False
Severity                    False
Impact on Life              False
Demographics                False
Medical History             False
Psychiatric History         False
Substance Use               False
Coping Mechanisms           False
Social Support              False
Lifestyle Factors           False
Panic Disorder Diagnosis    False
dtype: bool

In [28]:
# Re-check after filling: every testing column should now report False.
test.isna().any()

Participant ID              False
Age                         False
Gender                      False
Family History              False
Personal History            False
Current Stressors           False
Symptoms                    False
Severity                    False
Impact on Life              False
Demographics                False
Medical History             False
Psychiatric History         False
Substance Use               False
Coping Mechanisms           False
Social Support              False
Lifestyle Factors           False
Panic Disorder Diagnosis    False
dtype: bool

# Label Encoding

In [30]:
# Names of the training columns stored with the object dtype (the categorical features)
cat_cols = [name for name in train.columns if train[name].dtype == 'object']
cat_cols

['Gender',
 'Family History',
 'Personal History',
 'Current Stressors',
 'Symptoms',
 'Severity',
 'Impact on Life',
 'Demographics',
 'Medical History',
 'Psychiatric History',
 'Substance Use',
 'Coping Mechanisms',
 'Social Support',
 'Lifestyle Factors']

In [31]:
#Label Encoding
# Build one category -> integer code mapping per object column from the training
# data and apply that same mapping to both frames. Encoding each frame from its
# own unique() order assigns different codes to the same category (for example,
# in "Medical History" the value 'none' is third in train but fourth in test),
# so a model fitted on train would misread the test features.
le = {}
for column in train.columns:
    if train[column].dtype == object:
        # Codes follow the order of first appearance in train, as before.
        le[column] = {
            category: code
            for code, category in enumerate(train[column].unique())
        }

        # A category present only in test would silently become NaN under map().
        unseen = set(test[column].unique()) - set(le[column])
        if unseen:
            raise ValueError(
                f"Unseen categories in test column {column!r}: "
                f"{sorted(map(str, unseen))}"
            )

        train[column] = train[column].map(le[column])
        test[column] = test[column].map(le[column])

# SMOTE on Panic Disorder Diagnosis

In [33]:
# Class balance of the target column before any resampling.
train["Panic Disorder Diagnosis"].value_counts()

Panic Disorder Diagnosis
0    95715
1     4285
Name: count, dtype: int64

In [34]:
from imblearn.over_sampling import SMOTE
# Fix the seed so the synthetic minority samples are identical on every run.
# NOTE(review): plain SMOTE interpolates between feature values, and every feature
# except Age is an integer-coded category here; imblearn's SMOTENC is intended for
# mixed numeric/categorical data — consider switching.
smote = SMOTE(random_state=42)

In [35]:
# Separate the target from the predictors; the participant identifier is not used as a feature.
y = train["Panic Disorder Diagnosis"]
x = train.drop(columns=['Participant ID', 'Panic Disorder Diagnosis'])

In [36]:
# Oversample with SMOTE; x_res / y_res hold the original rows plus the synthetic ones.
# NOTE(review): if x_res / y_res are later passed to cross_val_score or split for
# validation, synthetic rows can leak across folds — confirm, and resample inside each fold instead.
x_res,y_res = smote.fit_resample(x,y)

In [37]:
# Class counts before (y) and after (y_res) resampling.
for labels in (y, y_res):
    print(labels.value_counts())

Panic Disorder Diagnosis
0    95715
1     4285
Name: count, dtype: int64
Panic Disorder Diagnosis
0    95715
1    95715
Name: count, dtype: int64


In [38]:
# Preview the resampled feature matrix (pandas truncates the display).
x_res

Unnamed: 0,Age,Gender,Family History,Personal History,Current Stressors,Symptoms,Severity,Impact on Life,Demographics,Medical History,Psychiatric History,Substance Use,Coping Mechanisms,Social Support,Lifestyle Factors
0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,51,0,0,1,1,1,0,0,1,1,1,1,1,0,0
2,32,1,1,1,1,1,0,1,1,0,2,0,2,1,1
3,64,1,0,1,0,2,1,2,0,0,3,0,3,0,1
4,31,0,1,1,0,1,0,2,0,1,3,1,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191425,52,0,1,0,1,2,2,1,0,2,1,1,2,0,0
191426,34,0,1,0,0,1,1,1,0,0,2,1,0,0,0
191427,54,1,0,0,1,1,2,2,0,2,2,2,0,0,0
191428,47,0,0,0,1,1,2,0,0,3,0,2,1,2,0


In [39]:
# Preview the resampled target (pandas truncates the display).
y_res

0         0
1         0
2         0
3         0
4         0
         ..
191425    1
191426    1
191427    1
191428    1
191429    1
Name: Panic Disorder Diagnosis, Length: 191430, dtype: int64

In [40]:
# Preview the original, un-resampled feature matrix for comparison.
x

Unnamed: 0,Age,Gender,Family History,Personal History,Current Stressors,Symptoms,Severity,Impact on Life,Demographics,Medical History,Psychiatric History,Substance Use,Coping Mechanisms,Social Support,Lifestyle Factors
0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,51,0,0,1,1,1,0,0,1,1,1,1,1,0,0
2,32,1,1,1,1,1,0,1,1,0,2,0,2,1,1
3,64,1,0,1,0,2,1,2,0,0,3,0,3,0,1
4,31,0,1,1,0,1,0,2,0,1,3,1,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,22,0,1,1,1,2,0,0,0,3,3,0,0,2,2
99996,57,1,0,0,2,1,2,0,0,3,2,0,3,0,2
99997,20,0,1,1,0,1,2,2,0,3,0,0,2,2,1
99998,56,1,1,0,1,2,2,0,0,0,1,1,0,0,0


In [41]:
# Preview the original, un-resampled target for comparison.
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    1
99999    0
Name: Panic Disorder Diagnosis, Length: 100000, dtype: int64

In [42]:
# Disabled code kept as a bare string so it never executes: it samples as many
# majority-class rows as there are minority-class rows (random undersampling).
# It refers to x_train, y_train and Counter, none of which are defined or imported
# in the cells shown above, and it uses .append on pandas objects, which was
# removed in pandas 2.0 — it would need rewriting (pd.concat) before being re-enabled.
"""
indices = np.random.choice(x_train[y_train['Panic Disorder Diagnosis']==0].index,
                           size=x_train[y_train['Panic Disorder Diagnosis']==1].shape[0],
                           replace=False)
temp_x = x_train[y_train['Panic Disorder Diagnosis']==0].loc[indices]
temp_x = temp_x.append(x_train[y_train['Panic Disorder Diagnosis']==1])
temp_y = y_train[y_train['Panic Disorder Diagnosis']==0].loc[indices]
temp_y = temp_y.append(y_train[y_train['Panic Disorder Diagnosis']==1])
print(x_train.shape, y_train.shape)
print("Before balancing", Counter(y_train['Panic Disorder Diagnosis']))
print(temp_x.shape, temp_y.shape)
print("After balancing", Counter(temp_y['Panic Disorder Diagnosis']))
"""

'\nindices = np.random.choice(x_train[y_train[\'Panic Disorder Diagnosis\']==0].index,\n                           size=x_train[y_train[\'Panic Disorder Diagnosis\']==1].shape[0],\n                           replace=False)\ntemp_x = x_train[y_train[\'Panic Disorder Diagnosis\']==0].loc[indices]\ntemp_x = temp_x.append(x_train[y_train[\'Panic Disorder Diagnosis\']==1])\ntemp_y = y_train[y_train[\'Panic Disorder Diagnosis\']==0].loc[indices]\ntemp_y = temp_y.append(y_train[y_train[\'Panic Disorder Diagnosis\']==1])\nprint(x_train.shape, y_train.shape)\nprint("Before balancing", Counter(y_train[\'Panic Disorder Diagnosis\']))\nprint(temp_x.shape, temp_y.shape)\nprint("After balancing", Counter(temp_y[\'Panic Disorder Diagnosis\']))\n'