In [9]:
# Initial imports.
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [10]:
df = pd.read_csv(Path('./Resources/ftusa_data_ml.csv'))
df.head()

Unnamed: 0,gender_core,age_core,understand_contract,understand_contract.1,total_tenure_months,workplace_satis_core,living_wage_tf_core,conditions_rate_core,safety_core,accident_last_year,covid_safety_measures_core,return_next_season_core,pressure_to_work_core,factory_recommend_tf_core
0,0,30,1,0,132,3,0,3,4,0,4,1,0,1
1,1,30,1,0,108,3,0,2,4,0,4,1,0,1
2,0,24,1,0,67,4,1,3,4,0,4,1,0,1
3,0,58,1,0,79,4,1,3,4,0,4,1,0,0
4,0,38,1,0,108,4,1,2,4,0,4,1,0,1


In [11]:
df.count()

gender_core                   3612
age_core                      3612
understand_contract           3612
understand_contract.1         3612
total_tenure_months           3612
workplace_satis_core          3612
living_wage_tf_core           3612
conditions_rate_core          3612
safety_core                   3612
accident_last_year            3612
covid_safety_measures_core    3612
return_next_season_core       3612
pressure_to_work_core         3612
factory_recommend_tf_core     3612
dtype: int64

In [12]:
# Define the features set.
X = df.copy()
X = X.drop("factory_recommend_tf_core", axis=1)
X.head()

Unnamed: 0,gender_core,age_core,understand_contract,understand_contract.1,total_tenure_months,workplace_satis_core,living_wage_tf_core,conditions_rate_core,safety_core,accident_last_year,covid_safety_measures_core,return_next_season_core,pressure_to_work_core
0,0,30,1,0,132,3,0,3,4,0,4,1,0
1,1,30,1,0,108,3,0,2,4,0,4,1,0
2,0,24,1,0,67,4,1,3,4,0,4,1,0
3,0,58,1,0,79,4,1,3,4,0,4,1,0
4,0,38,1,0,108,4,1,2,4,0,4,1,0


In [13]:
# Define the target set.
y = df["factory_recommend_tf_core"].ravel()
y[:5]

array([1, 1, 1, 0, 1])

In [14]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [15]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [17]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [18]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [19]:
predictions

array([1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 1, 2, 1, 1,
       1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 0,
       2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 0, 2, 1, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 0, 0, 2, 2, 1, 2, 2, 1, 1, 1, 2,
       2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 0, 2, 2,
       2, 0, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 0, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2,
       1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       0, 2, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2,

In [25]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Demoter", "Actual Neutral", "Actual Promoter"], columns=["Predicted Demoter", "Predicted Neutral","Predicted Promoter"])

cm_df

Unnamed: 0,Predicted Demoter,Predicted Neutral,Predicted Promoter
Actual Demoter,27,40,31
Actual Neutral,18,87,140
Actual Promoter,11,97,452


In [26]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)


In [27]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted Demoter,Predicted Neutral,Predicted Promoter
Actual Demoter,27,40,31
Actual Neutral,18,87,140
Actual Promoter,11,97,452


Accuracy Score : 0.6267995570321152
Classification Report
              precision    recall  f1-score   support

           0       0.48      0.28      0.35        98
           1       0.39      0.36      0.37       245
           2       0.73      0.81      0.76       560

    accuracy                           0.63       903
   macro avg       0.53      0.48      0.50       903
weighted avg       0.61      0.63      0.61       903



In [28]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.03946073, 0.24774882, 0.01180007, 0.01207199, 0.24386261,
       0.07318193, 0.03518376, 0.09129115, 0.06049912, 0.02011921,
       0.0827321 , 0.0165372 , 0.06551131])

In [29]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.2477488193622353, 'age_core'),
 (0.24386261437789597, 'total_tenure_months'),
 (0.09129115042222219, 'conditions_rate_core'),
 (0.08273210077421381, 'covid_safety_measures_core'),
 (0.07318192656330712, 'workplace_satis_core'),
 (0.06551131104556596, 'pressure_to_work_core'),
 (0.06049911685582883, 'safety_core'),
 (0.039460734758302154, 'gender_core'),
 (0.03518375856091194, 'living_wage_tf_core'),
 (0.02011921464085408, 'accident_last_year'),
 (0.016537195730693115, 'return_next_season_core'),
 (0.012071985391112188, 'understand_contract.1'),
 (0.011800071516857231, 'understand_contract')]