In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

## Reading and Cleaning Data

In [3]:
train_data = pd.read_csv('./data/train_data.csv')
# Removing index and name columns
train_data = train_data.drop(['Unnamed: 0', 'Name'], axis=1)
train_data_y = train_data['Attrition']
train_data_x = train_data.drop(['Attrition'], axis=1)

## Converting Categorical Features to Numerical

In [4]:
# Label Encoding method is used in this regard.
categorical = train_data_x.loc[:, train_data_x.dtypes == 'object']
for column_name in categorical.columns.values:
    categorical[column_name] = categorical[column_name].astype('category')
    categorical[column_name] = categorical[column_name].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical[column_name] = categorical[column_name].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical[column_name] = categorical[column_name].cat.codes


In [5]:
# Replacing numerical values with categorical ones
train_data_x = train_data_x.drop(list(categorical.columns.values), axis=1)
final_data = pd.concat([train_data_x, categorical], axis=1)

In [6]:
train_data_y = train_data_y.astype('category')
train_data_y = train_data_y.cat.codes

## Train and Test Splitting with 10% of Data for Test

In [7]:
x_train, x_test, y_train, y_test = train_test_split(final_data, train_data_y, test_size=0.10, random_state=42)

## Training Model

In [8]:
clf = RandomForestClassifier(n_estimators=50)
clf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=50)

## Predicting Model

In [9]:
y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)

In [11]:
precision, recall, F1, _ = precision_recall_fscore_support(y_test, y_test_pred, average='macro')
accuracy = clf.score(x_train, y_train)
print(f"Precision, recall, F1-score and accuracy for the test dataset are: {round(precision,3)}, {round(recall, 3)}, {round(F1, 3)}, {round(accuracy, 3)} respectively")

Precision, recall, F1-score and accuracy for the test dataset are: 0.944, 0.577, 0.604, 0.999 respectively
