In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Single place to edit when the dataset is mounted somewhere other than the Kaggle default.
DATA_DIR = '/kaggle/input/medical-student-mental-health'

# Main data table.
df = pd.read_csv(DATA_DIR + '/Data Carrard et al. 2022 MedTeach.csv')
# The codebook file is semicolon-delimited, hence the explicit separator.
codebook = pd.read_csv(DATA_DIR + '/Codebook Carrard et al. 2022 MedTeach.csv', sep=';')

In [None]:
# Preview the first five rows of the codebook.
codebook.head(5)

In [None]:
def create_groups(value):
    """Parse a codebook value-label string into a ``{code: label}`` dictionary.

    Parameters
    ----------
    value : str
        Semicolon-separated ``code=label`` pairs, e.g. ``"1=Man; 2=Woman"``.

    Returns
    -------
    dict
        Integer codes mapped to their (whitespace-stripped) label strings.

    Raises
    ------
    ValueError
        If a non-empty entry has no ``=`` or its code is not an integer.
    """
    grouped_dict = dict()
    for entry in value.split(";"):
        entry = entry.strip()
        if not entry:
            # A trailing or doubled ';' produces an empty piece; there is nothing to parse.
            continue
        # Split on the first '=' only, so labels that themselves contain '=' stay intact.
        code, sep, label = entry.partition("=")
        if not sep:
            raise ValueError("expected 'code=label', got {!r}".format(entry))
        grouped_dict[int(code)] = label.strip()
    return grouped_dict

In [None]:
# Row positions in the codebook of the coded (categorical) variables; the same positions
# are later used to select the matching columns of the data frame.
ids = [2, 3, 4, 5, 6, 8, 9]
# The first codebook column supplies the names used as keys of `maps`.
columns = codebook.iloc[:, 0].values
# The third codebook column holds the "code=label; ..." strings parsed by create_groups.
# Indexing both columns by the same position removes the need for a manual counter.
maps = {columns[idx]: create_groups(codebook.iloc[idx, 2]) for idx in ids}

Map the encoded values in the dataset to their descriptive labels so that the pie charts and frequency counts below are clearly labelled.

In [None]:
# Display the parsed {column name: {code: label}} dictionaries.
maps

In [None]:
# Preview the first five rows of the data table.
df.head(5)

In [None]:
# (number of rows, number of columns) of the data table.
df.shape

In [None]:
# Independent copy of just the coded columns (positions listed in `ids`), so the raw frame stays untouched.
df_classes = df.iloc[:, ids].copy()

In [None]:
# Rebuild from the raw frame on every run so this cell is safe to re-execute: mapping
# already-decoded labels through the integer-keyed dictionaries a second time would
# turn every value into NaN.
df_classes = df.iloc[:, ids].copy()
for col in df_classes.columns:
    # Replace each integer code with its label from the codebook mapping.
    df_classes[col] = df_classes[col].map(maps[col])

In [None]:
def _pie_row(frame, names, figsize=(20, 25)):
    """Draw one pie chart per column in `names`, side by side in a new figure.

    Each pie shows the value frequencies of the column, labelled with the values
    and their percentage share. Returns the figure and its array of axes.
    """
    fig, axes = plt.subplots(ncols=len(names), nrows=1, figsize=figsize)
    # plt.subplots returns a bare Axes (not an array) when there is a single column.
    axes = np.atleast_1d(axes)
    for ax, name in zip(axes, names):
        freqs = frame[name].value_counts()
        ax.pie(freqs, labels=freqs.index, autopct="%0.2f%%")
        ax.set_title(name)
    return fig, axes


cols = df_classes.columns
# Two rows of charts: the first three columns, then whatever remains. The number of
# axes now follows the number of columns instead of being hard-coded.
fig, axes = _pie_row(df_classes, cols[:3])
fig, axes = _pie_row(df_classes, cols[3:])

In [None]:
# Number of rows per decoded 'glang' label, most frequent first.
lang_freqs = df_classes['glang'].value_counts()
lang_freqs

Mental health satisfaction among different groups

In [None]:
# Column positions treated as numerical: 1, then 10 through 19.
numerical = [1, *range(10, 20)]
# Pairwise plot grid over those columns, coloured by the 'psyt' column.
sns.pairplot(data=df, vars=df.columns[numerical], hue='psyt')

In [None]:
# The suggestion is to build a binary classifier that predicts whether a student will take therapy or not. The dataset
# is most likely imbalanced with respect to that target, so we will later need to apply SMOTE (or another technique)
# to balance the classes.
# I am planning to apply GaussianNB and tree-based models to the classification task.
# We will use the feature importances of the Random Forest classifier to determine the most important features.

In [None]:
def model_evaluation(model, test_sample, truth, categorical=True):
    """Report a fitted classifier's accuracy on held-out data and visualise its predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test_sample : array
        Feature matrix to predict on. The non-categorical branch indexes it as
        ``test_sample[:, i]``, so it must be a 2-D array there.
    truth : array-like
        True labels for ``test_sample``.
    categorical : bool, default True
        If True, draw a confusion-matrix heatmap titled with the accuracy.
        If False, print the accuracy and draw a pairplot of the features
        coloured by the predicted label.

    Returns
    -------
    None
    """
    yhat = model.predict(test_sample)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = accuracy_score(truth, yhat)
    text = "Accuracy score for the model {}%".format(round(score*100, 2))
    if categorical:
        # Rows are true labels and columns are predictions; swapping the arguments
        # would silently transpose the matrix.
        cm = confusion_matrix(truth, yhat)
        # fmt="d" keeps the counts as integers; the default format switches to
        # scientific notation for counts of 100 or more.
        ax = sns.heatmap(cm, annot=True, fmt="d")
        ax.set_xlabel("Predicted label")
        ax.set_ylabel("True label")
        ax.set_title(text)
    else:
        # One column per feature, named by position, plus the predicted label for colouring.
        dd = dict()
        for i in range(test_sample.shape[1]):
            name = "data:{}".format(i)
            dd[name] = test_sample[:, i]
        dd['predicted'] = yhat
        print(text)
        # Local name chosen so it does not shadow the notebook-level `df`.
        frame = pd.DataFrame(dd)
        sns.pairplot(frame, hue='predicted', vars=frame.columns[:-1])
        # Title the whole grid; plt.title would only label the last subplot.
        plt.suptitle(text, y=1.02)
        plt.show()

Determining Important Features

In [None]:
# Feature columns by position: 1 through 19, skipping 9, which is used as the target.
temp = [*range(1, 9), *range(10, 20)]
X = df.iloc[:, temp].values
y = df.iloc[:, 9].values

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

# Spread of each feature's importance across the individual trees, used as error bars.
per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)

forest_importances = pd.Series(forest.feature_importances_, index=df.columns[temp])

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()

Select the features whose mean decrease in impurity is greater than 0.04, and apply the SMOTE technique to remove the class imbalance.

In [None]:
# Feature columns by position: 1 through 19, skipping 9.
temp = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
x = df.iloc[:, temp].values
y = df.loc[:, 'psyt'].values
# Seed the sampler so the synthetic minority samples, and every result derived from
# them, are identical from run to run.
oversample = SMOTE(random_state=42)
x_all, y_all = oversample.fit_resample(x, y)
# NOTE(review): oversampling before the train/test split means synthetic points
# interpolated from rows that later land in the test set can end up in the training
# set. Consider splitting first and resampling only the training portion.
# NOTE(review): SMOTE interpolates between neighbours, so integer-coded categorical
# columns may receive fractional values; SMOTENC may be more appropriate. Confirm.

In [None]:
# Same importance analysis as before, this time on the SMOTE-resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=42
)

forest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# One row of importances per tree; the column-wise standard deviation gives the error bars.
tree_importances = np.vstack([est.feature_importances_ for est in forest.estimators_])
std = tree_importances.std(axis=0)

forest_importances = pd.Series(data=forest.feature_importances_, index=df.columns[temp])

fig, ax = plt.subplots()
forest_importances.plot(kind="bar", yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Append the resampled target as the last column, then restore the original column
# names: the feature columns listed in `temp`, followed by column 9.
temp_os = np.column_stack((x_all, y_all))
df_os = pd.DataFrame(temp_os, columns=df.columns[temp + [9]])

In [None]:
# Remove column positions 4 and 5 from the feature list (presumably the 'glang' and
# 'part' columns mentioned in the closing notes; verify against df.columns).
# Filtering by value instead of popping by list position keeps this cell idempotent:
# re-running it no longer strips two additional features each time.
dropped_cols = (4, 5)
temp = [col for col in temp if col not in dropped_cols]
# NOTE(review): the models below are trained on the original frame, not on the
# SMOTE-resampled data (x_all / df_os). Confirm that this is intended.
x, y = df.iloc[:, temp].values, df.loc[:, 'psyt'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

In [None]:
# Seeded so the reported accuracy, and the comparison with the other models, is
# reproducible across runs.
rfc = RandomForestClassifier(max_depth=20, max_features='sqrt', random_state=42)
rfc.fit(x_train, y_train)
model_evaluation(rfc, x_test, y_test)

In [None]:
# Gaussian naive Bayes baseline, trained and evaluated on the same split as the other models.
gnb = GaussianNB().fit(x_train, y_train)
model_evaluation(model=gnb, test_sample=x_test, truth=y_test)

In [None]:
# max_features="sqrt" makes each split consider a random subset of features, so the
# tree is seeded to keep its accuracy reproducible across runs.
dtc = DecisionTreeClassifier(max_depth=20, max_features="sqrt", random_state=42)
dtc.fit(x_train, y_train)
model_evaluation(dtc, x_test, y_test)

As we can see, the Random Forest classifier outperforms the classical Decision Tree algorithm.
I decided to use tree classifiers and the GaussianNB algorithm because I assumed that all of the features in the dataset are important, both categorical and numerical. If we used algorithms that rely on a linear, numerical feature space, such as SVMs, KNN, Logistic Regression, etc., we would need mostly numerical inputs rather than categorical ones. However, as I expected, some of the features would make the classification task messier: glang (because it has a very large number of categories) and the part column (because it has a higher amount of impurity). That is why I removed them before proceeding with the classification tasks.