In [1]:
# Input dataset; the path is resolved relative to the notebook's working directory.
csvfile = 'dataV2_cohortV3.csv'

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

In [3]:
# Read the raw CSV named by `csvfile` into a DataFrame.
df = pd.read_csv(csvfile)

In [4]:
# Text-to-number encoding for the "depressed" burnout item.
# Labels are listed from lowest to highest; each label's position in the list
# is its numeric code (0 for 'Never' up to 5 for 'Always').
# NOTE(review): labels must match the CSV text exactly (including the capital
# "O" in 'Very Often'); Series.map yields NaN for any label that is not listed.
depressed_levels = [
    'Never',
    'Almost never',
    'Rarely',
    'Sometimes',
    'Very Often',
    'Always',
]

mapping_dict = {
    'WELLNESS_malach_pines_burnout_measure_depressed': {
        label: code for code, label in enumerate(depressed_levels)
    }
}

In [5]:
# Build the analysis frame: short aliases for the survey columns plus a numeric
# depression score, keeping only those four columns.
#   *C columns hold the original text responses.
#   DepressionN holds the numeric code looked up in mapping_dict (NaN when the
#   response text has no entry there).
depressed_col = 'WELLNESS_malach_pines_burnout_measure_depressed'
keep_cols = ['OnlineGamesC', 'SocialFriendsC', 'DepressionC', 'DepressionN']

df = df.assign(
    OnlineGamesC=df['CONNECTION_activities_onlinegames_p3m'],
    SocialFriendsC=df['CONNECTION_social_time_friends_p7d_grouped'],
    DepressionC=df[depressed_col],
    DepressionN=df[depressed_col].map(mapping_dict[depressed_col]),
)[keep_cols]

In [6]:
# Drop every row with a missing value in any remaining column. This also
# removes rows whose depression response had no numeric code (DepressionN NaN).
# NOTE(review): this runs before the pd.Categorical conversion, which can
# itself introduce NaN for labels outside its category lists.
df = df.dropna()

In [7]:
# Declare the level order of each predictor explicitly. The first level in each
# list ("Not in the past three months" / "No time") is intended to act as the
# reference category in the regression formulas.
online_games_levels = [
    'Not in the past three months',
    'Less than monthly',
    'Monthly',
    'A few times a month',
    'Weekly',
    'A few times a week',
    'Daily or almost daily',
]
social_friends_levels = ['No time', 'Less than 1 hour', '1 to 4 hours', '5 or more hours']

df['OnlineGamesC'] = pd.Categorical(df['OnlineGamesC'],
                                    categories=online_games_levels,
                                    ordered=True)
df['SocialFriendsC'] = pd.Categorical(df['SocialFriendsC'],
                                      categories=social_friends_levels,
                                      ordered=True)

# pd.Categorical silently converts any label that is not in `categories` to NaN.
# The notebook's dropna() runs before this conversion, so without the line
# below such rows would survive, be counted as True by a later `!=` comparison
# against the reference level, and show up as all-zero dummy rows.
df = df.dropna(subset=['OnlineGamesC', 'SocialFriendsC'])


In [8]:
# Binary versions of the two predictors: True when the response is anything
# other than the reference level.
df["OnlineGamesB"] = df["OnlineGamesC"].ne("Not in the past three months")
df['SocialFriendsB'] = df['SocialFriendsC'].ne('No time')

In [None]:
# Show the prepared frame (text, numeric and binary columns) as the cell output.
df

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs. The tuple below displays both split sizes.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
len(train_df), len(test_df)

In [None]:
# OLS on the training split: numeric depression score regressed on the
# multi-level categorical predictors OnlineGamesC and SocialFriendsC.
ols_model_c = smf.ols("DepressionN ~ C(OnlineGamesC) + C(SocialFriendsC)", data=train_df)

fitted_ols_model_c = ols_model_c.fit()
# Display the regression results
fitted_ols_model_c.summary()

In [None]:
# OLS on the training split using the binary online-games flag (OnlineGamesB)
# together with the multi-level SocialFriendsC predictor.
ols_model_b = smf.ols("DepressionN ~ C(OnlineGamesB) + C(SocialFriendsC)", data=train_df)

fitted_ols_model_b = ols_model_b.fit()

# Display the regression results
fitted_ols_model_b.summary()


In [13]:
# Decision tree capped at depth 10; the fixed random_state makes the fit reproducible.
clf = DecisionTreeClassifier(max_depth=10, random_state=10)

# Predictor columns shared by the train and test design matrices.
feature_cols = ['OnlineGamesC', 'SocialFriendsC', 'OnlineGamesB', 'SocialFriendsB']

# Target is the text depression label; predictors are dummy-encoded.
y_train = train_df['DepressionC']
X_train = pd.get_dummies(train_df[feature_cols])

y_test = test_df['DepressionC']
# The test split is encoded on its own, so force it onto the training columns:
# any dummy column absent from the test split is added as zeros, and the column
# order is guaranteed to match what the classifier is fitted on.
X_test = pd.get_dummies(test_df[feature_cols]).reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Fit the decision tree on the dummy-encoded training predictors.
clf.fit(X_train, y_train)

In [None]:
# Draw the fitted tree on a very large canvas so deep nodes stay legible.
fig, ax = plt.subplots(figsize=(100, 100))
plot_tree(
    clf,
    ax=ax,
    fontsize=22,
    # Pass plain Python lists: some scikit-learn releases validate these
    # arguments strictly and reject a pandas Index / numpy array.
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in clf.classes_],
)
plt.show()

In [None]:
# Plot the confusion matrix of the tree's predictions on the test split and
# keep the underlying count matrix in `cm`.
cm_display = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
cm = cm_display.confusion_matrix

In [None]:
# Predicted depression labels for the test split.
y_pred = clf.predict(X_test)

# Metric calculations below are currently disabled.
# NOTE(review): per the scikit-learn docs, pos_label is ignored for multiclass
# targets unless average='binary', so the "specificity" line would return the
# same macro recall as "sensitivity" -- confirm before re-enabling.
# accuracy = accuracy_score(y_test, y_pred)
# sensitivity = recall_score(y_test, y_pred, average='macro')
# specificity = recall_score(y_test, y_pred, pos_label='Almost never', average='macro')
# precision = precision_score(y_test, y_pred, average='macro')

In [None]:
# print(f"accuracy: {accuracy}")
# print(f"sensitivity: {sensitivity}")
# print(f"specificity: {specificity}")
# print(f"precision: {precision}")