In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the cleaned survey data and renumber the rows so the index starts at 1.
cleaned_data = pd.read_csv('cleaned_data.csv').reset_index(drop=True)
cleaned_data.index = cleaned_data.index + 1

# Preview only the six predictor columns used for modelling.
cleaned_data[[
    'gender',
    'family_history',
    'leave',
    'mental_health_consequence',
    'coworkers',
    'seek_help',
]]


Unnamed: 0,gender,family_history,leave,mental_health_consequence,coworkers,seek_help
1,Female,No,Somewhat easy,No,Some of them,Yes
2,Male,No,Don't know,Maybe,No,Don't know
3,Male,No,Somewhat difficult,No,Yes,No
4,Male,Yes,Somewhat difficult,Yes,Some of them,No
5,Male,No,Don't know,No,Some of them,Don't know
...,...,...,...,...,...,...
1247,Male,No,Somewhat easy,No,Some of them,No
1248,Male,Yes,Somewhat difficult,No,Some of them,No
1249,Male,Yes,Somewhat difficult,Yes,No,No
1250,Female,No,Don't know,Yes,No,No


### Predictive Modelling 

#### I have identified six predictors with statistically significant associations with receiving mental health treatment. The predictive modelling step uses the insights gained from the EDA and statistical analysis to build models that predict the outcome from these significant predictors. I chose a Logistic Regression model and a Random Forest model to predict whether individuals have received treatment for mental health issues.

#### Encode categorical variables

In [3]:
# Predictor columns to one-hot encode.
categorical_variables = [
    'gender',
    'family_history',
    'leave',
    'mental_health_consequence',
    'coworkers',
    'seek_help',
]

# Keep the target next to the predictors; only the predictors are expanded
# into dummy columns, and the first level of each is dropped.
encoded_data = pd.get_dummies(
    cleaned_data[[*categorical_variables, 'treatment']],
    columns=categorical_variables,
    drop_first=True,
)

print(encoded_data.head())

  treatment  gender_Male  gender_Other  family_history_Yes  \
1       Yes        False         False               False   
2        No         True         False               False   
3        No         True         False               False   
4       Yes         True         False                True   
5        No         True         False               False   

   leave_Somewhat difficult  leave_Somewhat easy  leave_Very difficult  \
1                     False                 True                 False   
2                     False                False                 False   
3                      True                False                 False   
4                      True                False                 False   
5                     False                False                 False   

   leave_Very easy  mental_health_consequence_No  \
1            False                          True   
2            False                         False   
3            False        

#### Split the Data

In [4]:
from sklearn.model_selection import train_test_split

# Encode the target as 0/1. The mapping also accepts values that are already
# 0/1, so re-running this cell leaves the column unchanged instead of turning
# every label into NaN (which would make the integer cast below fail).
treatment_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
encoded_data['treatment'] = encoded_data['treatment'].map(treatment_codes)

# Fail early, with a readable message, if any label could not be encoded.
unmapped_labels = encoded_data['treatment'].isna()
if unmapped_labels.any():
    raise ValueError(
        f"{int(unmapped_labels.sum())} row(s) have a 'treatment' value other than 'Yes'/'No'"
    )

# Features are every encoded column except the target.
X = encoded_data.drop('treatment', axis = 1)
y = encoded_data['treatment'].astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 1000 samples
Testing set size: 251 samples


##### Logistic Regression

In [5]:
# Fit a logistic regression (liblinear solver, fixed seed) on the training
# split, then predict labels for the held-out test split.
log_reg = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

In [6]:
# Print each evaluation metric for the logistic regression predictions,
# in the order: per-class report, overall accuracy, confusion matrix.
for _heading, _metric in (
    ("Classification Report:\n", classification_report),
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(_heading, _metric(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.72      0.72       131
           1       0.69      0.69      0.69       120

    accuracy                           0.71       251
   macro avg       0.70      0.70      0.70       251
weighted avg       0.71      0.71      0.71       251

Accuracy: 0.7051792828685259
Confusion Matrix:
 [[94 37]
 [37 83]]


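##### My Logistic Regression model's overall accuracy is approximately 70.52%, meaning it correctly predicts the treatment status for about 71% of the cases in the test set. The confusion matrix shows 94 true negatives and 83 true positives against 37 false positives and 37 false negatives, so the errors are evenly balanced between the two classes. This is reasonable performance for predicting mental health treatment status from these workplace and demographic predictors within the tech sector.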

##### Random Forest 

In [9]:
# Train a 100-tree random forest with a fixed seed on the training split,
# then predict labels for the held-out test split.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)

In [10]:
# Print each evaluation metric for the random forest predictions,
# in the order: per-class report, overall accuracy, confusion matrix.
for _heading, _metric in (
    ("Classification Report:\n", classification_report),
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(_heading, _metric(y_test, y_pred_rf))

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.68      0.67       131
           1       0.63      0.61      0.62       120

    accuracy                           0.65       251
   macro avg       0.64      0.64      0.64       251
weighted avg       0.65      0.65      0.64       251

Accuracy: 0.6454183266932271
Confusion Matrix:
 [[89 42]
 [47 73]]


##### My Random Forest model's overall accuracy is approximately 64.54%, meaning it correctly predicts the treatment status for about 65% of the cases in the test set, which is lower than the Logistic Regression model's 70.52%.

In [11]:
# Pair each encoded feature name with the fitted forest's feature_importances_
# value and list them from most to least important.
feature_importances = pd.Series(
    data=random_forest.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)
print("Feature Importances:\n", feature_importances)

Feature Importances:
 family_history_Yes               0.261061
gender_Male                      0.110656
coworkers_Some of them           0.088065
seek_help_No                     0.075599
leave_Somewhat easy              0.074005
seek_help_Yes                    0.062243
mental_health_consequence_No     0.061489
leave_Very difficult             0.055234
mental_health_consequence_Yes    0.055013
coworkers_Yes                    0.049668
leave_Somewhat difficult         0.049344
leave_Very easy                  0.044977
gender_Other                     0.012647
dtype: float64


##### Key Insights from Feature Importances:

**Family History:**

Having a family history of mental health issues is the most influential predictor in determining whether someone seeks treatment.

**Gender:**

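Gender has a notable influence: the `gender_Male` indicator, which separates male respondents from everyone else, is the second most important feature. Feature importances do not show the direction of an effect, so this ranking alone does not establish which gender is more likely to seek treatment.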

**Work Environment:**

Comfort in discussing mental health with some or all coworkers suggests the workplace environment's openness impacts treatment-seeking behavior.

**Seek Help:**

Whether the workplace provides resources or support for mental health (and whether individuals are aware of them) influences treatment-seeking.

**Ease of Taking Leave:**

The ease or difficulty of taking leave for mental health issues plays a role, indicating that organizational policies and culture around leave can affect individuals' decisions to seek treatment.

**Perceived Mental Health Consequences:**

Perceptions of potential consequences of disclosing mental health issues or seeking treatment at work also influence treatment-seeking behavior, highlighting the impact of stigma and workplace culture.