In [1]:
# Initial imports.
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
df = pd.read_csv(Path('./Resources/test_data v2.csv'))
df.head()

Unnamed: 0,conditions_rate_core,workplace_satis_core,factory_recommend_tf_core
0,4,4,2
1,3,4,2
2,4,5,2
3,4,5,1
4,3,5,2


In [3]:
df.count()

conditions_rate_core         4517
workplace_satis_core         4517
factory_recommend_tf_core    4517
dtype: int64

In [4]:
# Define the features set.
X = df.copy()
X = X.drop("factory_recommend_tf_core", axis=1)
X.head()

Unnamed: 0,conditions_rate_core,workplace_satis_core
0,4,4
1,3,4
2,4,5
3,4,5
4,3,5


In [5]:
# Define the target set.
y = df["factory_recommend_tf_core"].ravel()
y[:5]

array([2, 2, 2, 1, 2])

In [6]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [7]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [9]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [10]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [11]:
predictions

array([2, 2, 3, ..., 2, 3, 3])

In [12]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1","Predicted 2"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,15,77,51
Actual 1,5,132,197
Actual 2,3,85,565


In [13]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)


In [14]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,15,77,51
Actual 1,5,132,197
Actual 2,3,85,565


Accuracy Score : 0.6300884955752213
Classification Report
              precision    recall  f1-score   support

           1       0.65      0.10      0.18       143
           2       0.45      0.40      0.42       334
           3       0.69      0.87      0.77       653

    accuracy                           0.63      1130
   macro avg       0.60      0.46      0.46      1130
weighted avg       0.62      0.63      0.59      1130



In [15]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.57192737, 0.42807263])

In [16]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.5719273710608529, 'conditions_rate_core'),
 (0.42807262893914716, 'workplace_satis_core')]