In [13]:
# Path of the CSV file that is read into `df` further down; being a relative
# path, it is resolved against the notebook's working directory.
csvfile = "dataV2_cohortV3.csv"

In [14]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import seaborn as sns
import statsmodels.formula.api as smf
from mpl_toolkits.mplot3d import Axes3D

In [15]:
# Load the raw data from the CSV file named by `csvfile` into a DataFrame.
df = pd.read_csv(csvfile)

In [16]:
# Numeric encodings for the text answers of the three survey columns used in
# this analysis. Keys of the outer dict are column names in `df`; each inner
# dict maps an answer's exact text to the number that replaces it.
mapping_dict = {
    # How often online games were played (column suffix "p3m"; the answers
    # refer to the past three months). Larger number = more frequent.
    'CONNECTION_activities_onlinegames_p3m': {
        'Not in the past three months': 0,
        'Less than monthly': 0.5,
        'Monthly': 1,
        'A few times a month': 2,
        'Weekly': 4,
        'A few times a week': 10,
        'Daily or almost daily': 20
    },
    # Grouped time spent with friends (column suffix "p7d").
    'CONNECTION_social_time_friends_p7d_grouped': {
        'No time': 0,
        'Less than 1 hour': 0.5,
        '1 to 4 hours': 2.5,
        '5 or more hours': 5
    },
    # Self-reported "depressed" item, coded 0 (Never) to 5 (Always).
    # NOTE(review): the scale goes straight from 'Sometimes' to 'Very Often';
    # confirm the survey has no 'Often' option. Any answer whose text is not
    # listed here (including case differences) is mapped to NaN downstream.
    'WELLNESS_malach_pines_burnout_measure_depressed': {
        'Never': 0,
        'Almost never': 1,
        'Rarely': 2,
        'Sometimes': 3,
        'Very Often': 4,
        'Always': 5
    }
}


# Select just the columns being recoded and copy that slice, instead of
# deep-copying every column of `df` first and discarding most of them.
# The explicit .copy() keeps `mapped_df` independent of `df`.
mapped_df = df[list(mapping_dict)].copy()

In [17]:
# Map values and rename

def _to_numeric_scale(labels, scale, source_column):
    """Translate the text answers of one survey column into numeric scores.

    Parameters
    ----------
    labels : pandas.Series
        Text answers for a single survey question.
    scale : dict
        Answer text -> numeric score.
    source_column : str
        Name of the survey column; used only in the warning message.

    Returns
    -------
    pandas.Series
        Numeric scores aligned with ``labels``. Missing answers and answers
        that are not keys of ``scale`` come back as NaN.

    Warns
    -----
    UserWarning
        If any non-missing answer is absent from ``scale``. Without this,
        such rows would turn into NaN unnoticed and disappear as soon as
        missing values are dropped.
    """
    scores = labels.map(scale)
    # Series.map yields NaN for keys the dict does not contain, so a NaN score
    # on a non-missing answer means the answer text was not in `scale`.
    unexpected = labels[scores.isna() & labels.notna()]
    if not unexpected.empty:
        warnings.warn(
            f"{source_column}: {len(unexpected)} answer(s) not present in the "
            f"mapping were set to NaN: {sorted(unexpected.astype(str).unique())}",
            stacklevel=2,
        )
    return scores


games_col = 'CONNECTION_activities_onlinegames_p3m'
friends_col = 'CONNECTION_social_time_friends_p7d_grouped'
depressed_col = 'WELLNESS_malach_pines_burnout_measure_depressed'

# Create columns for OnlineGames and SocialFriends with 'C' (text) and numeric values
mapped_df['OnlineGamesC'] = mapped_df[games_col]
mapped_df['OnlineGamesN'] = _to_numeric_scale(mapped_df[games_col], mapping_dict[games_col], games_col)

mapped_df['SocialFriendsC'] = mapped_df[friends_col]
mapped_df['SocialFriendsN'] = _to_numeric_scale(mapped_df[friends_col], mapping_dict[friends_col], friends_col)

# Create 'Depression' column with numeric values only
mapped_df['Depression'] = _to_numeric_scale(mapped_df[depressed_col], mapping_dict[depressed_col], depressed_col)

# Drop the original columns from mapped_df, leaving only the five derived ones
mapped_df = mapped_df.drop(columns=list(mapping_dict.keys()))

In [18]:
# Keep only complete rows: a row is removed when any of its columns is missing
mapped_df = mapped_df.dropna(axis=0, how="any")

In [19]:
# Show the recoded frame after incomplete rows were dropped. dropna() does not
# renumber the index, so row labels are the ones from the original frame.
mapped_df

Unnamed: 0,OnlineGamesC,OnlineGamesN,SocialFriendsC,SocialFriendsN,Depression
0,Not in the past three months,0.0,5 or more hours,5.0,2.0
1,Not in the past three months,0.0,5 or more hours,5.0,1.0
2,Not in the past three months,0.0,5 or more hours,5.0,1.0
3,Weekly,4.0,1 to 4 hours,2.5,2.0
4,Weekly,4.0,5 or more hours,5.0,1.0
...,...,...,...,...,...
844,Not in the past three months,0.0,1 to 4 hours,2.5,5.0
845,Not in the past three months,0.0,1 to 4 hours,2.5,4.0
846,Not in the past three months,0.0,Less than 1 hour,0.5,5.0
847,Not in the past three months,0.0,5 or more hours,5.0,3.0


In [20]:
# Give each text column an explicit, ordered set of categories. The first
# label in each list ("Not in the past three months" / "No time") becomes the
# reference category.
category_orders = {
    'OnlineGamesC': [
        'Not in the past three months',
        'Less than monthly',
        'Monthly',
        'A few times a month',
        'Weekly',
        'A few times a week',
        'Daily or almost daily',
    ],
    'SocialFriendsC': [
        'No time',
        'Less than 1 hour',
        '1 to 4 hours',
        '5 or more hours',
    ],
}

for column_name, ordered_labels in category_orders.items():
    mapped_df[column_name] = pd.Categorical(
        mapped_df[column_name],
        categories=ordered_labels,
        ordered=True,
    )


In [21]:
# OLS regression of Depression on the two categorical predictors
# (OnlineGamesC and SocialFriendsC), specified through the formula interface
categorical_formula = "Depression ~ C(OnlineGamesC) + C(SocialFriendsC)"
model_categorical = smf.ols(formula=categorical_formula, data=mapped_df).fit()

# Last expression of the cell: render the regression summary tables
model_categorical.summary()


0,1,2,3
Dep. Variable:,Depression,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.034
Method:,Least Squares,F-statistic:,3.646
Date:,"Sat, 16 Nov 2024",Prob (F-statistic):,0.000183
Time:,21:55:40,Log-Likelihood:,-1166.2
No. Observations:,687,AIC:,2352.0
Df Residuals:,677,BIC:,2398.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8305,0.122,23.195,0.000,2.591,3.070
C(OnlineGamesC)[T.Less than monthly],0.2377,0.230,1.034,0.301,-0.214,0.689
C(OnlineGamesC)[T.Monthly],-0.0532,0.407,-0.131,0.896,-0.852,0.746
C(OnlineGamesC)[T.A few times a month],0.3543,0.250,1.416,0.157,-0.137,0.846
C(OnlineGamesC)[T.Weekly],0.5936,0.290,2.045,0.041,0.024,1.164
C(OnlineGamesC)[T.A few times a week],0.0816,0.297,0.275,0.783,-0.501,0.664
C(OnlineGamesC)[T.Daily or almost daily],0.0457,0.203,0.225,0.822,-0.352,0.443
C(SocialFriendsC)[T.Less than 1 hour],-0.5672,0.162,-3.495,0.001,-0.886,-0.249
C(SocialFriendsC)[T.1 to 4 hours],-0.5875,0.150,-3.912,0.000,-0.882,-0.293

0,1,2,3
Omnibus:,22.001,Durbin-Watson:,1.433
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10.683
Skew:,-0.016,Prob(JB):,0.00479
Kurtosis:,2.39,Cond. No.,8.97


In [22]:
# OLS regression of Depression on the numeric encodings of the predictors
# (OnlineGamesN and SocialFriendsN), specified through the formula interface
numeric_formula = "Depression ~ OnlineGamesN + SocialFriendsN"
model_numeric = smf.ols(formula=numeric_formula, data=mapped_df).fit()

# Last expression of the cell: render the regression summary tables
model_numeric.summary()


0,1,2,3
Dep. Variable:,Depression,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.02
Method:,Least Squares,F-statistic:,7.988
Date:,"Sat, 16 Nov 2024",Prob (F-statistic):,0.000372
Time:,21:55:40,Log-Likelihood:,-1174.6
No. Observations:,687,AIC:,2355.0
Df Residuals:,684,BIC:,2369.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.5822,0.082,31.386,0.000,2.421,2.744
OnlineGamesN,0.0069,0.010,0.710,0.478,-0.012,0.026
SocialFriendsN,-0.1022,0.026,-3.914,0.000,-0.153,-0.051

0,1,2,3
Omnibus:,23.843,Durbin-Watson:,1.43
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11.689
Skew:,-0.076,Prob(JB):,0.0029
Kurtosis:,2.379,Cond. No.,9.47


In [23]:
import plotly.graph_objects as go
import numpy as np
# NOTE(review): numpy and plotly are already imported in the notebook's import
# cell, so the two imports above are redundant and could be dropped from here.

# Interactive 3D figure: observed data as points, the fitted numeric model as
# a semi-transparent surface.
fig = go.Figure()

# Observed data: one marker per respondent, coloured by the Depression score
fig.add_trace(go.Scatter3d(
    x=mapped_df['OnlineGamesN'],
    y=mapped_df['SocialFriendsN'],
    z=mapped_df['Depression'],
    mode='markers',
    marker=dict(color=mapped_df['Depression'], size=5, colorscale='Viridis'),
    name='Data points'
))

# Regular 100 x 100 grid spanning the observed range of both predictors
x_vals = np.linspace(mapped_df['OnlineGamesN'].min(), mapped_df['OnlineGamesN'].max(), 100)
y_vals = np.linspace(mapped_df['SocialFriendsN'].min(), mapped_df['SocialFriendsN'].max(), 100)
x_grid, y_grid = np.meshgrid(x_vals, y_vals)

# Predict Depression at every grid point with the numeric model. The grid is
# flattened into one row per point (column names must match the formula's
# variables), then the predictions are reshaped back to the grid's shape.
grid_points = pd.DataFrame({
    'OnlineGamesN': x_grid.ravel(),
    'SocialFriendsN': y_grid.ravel(),
})
z_grid = model_numeric.predict(grid_points).to_numpy().reshape(x_grid.shape)

# Add the regression surface to the plot
fig.add_trace(go.Surface(
    x=x_grid,
    y=y_grid,
    z=z_grid,
    opacity=0.5,
    colorscale='Blues',
    showscale=False,
    name='Regression Surface'
))

# Title and axis labels. The axis titles describe what the encoded variables
# measure; the previous "(Width)" / "(Length)" annotations did not apply to
# these variables.
fig.update_layout(
    title="3D Visualization of Depression vs OnlineGamesN & SocialFriendsN",
    scene=dict(
        xaxis_title='OnlineGamesN (online gaming frequency score)',
        yaxis_title='SocialFriendsN (time with friends score)',
        zaxis_title='Depression'
    ),
    showlegend=True,
    width=1200,
    height=800
)

fig.show()