In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix



In [9]:
# Read the absenteeism CSV and keep only the complete rows: any row that has
# at least one missing value is discarded. Row index labels are not reset.
data = pd.read_csv("absenteeism_1.csv").dropna()



In [10]:
# Encode the string-valued category columns as integer codes so they can be
# passed to scikit-learn. The code values are exactly those used originally.
ordinal_codes = {
    "Trans_expense_cat": {'Trans_low': 0, 'Trans_Med': 1, 'Trans_High': 2},
    "Dist_to_work": {'Dist_low': 0, 'Dist_Med': 1, 'Dist_High': 2},
    # NOTE(review): 'Age_Very_young' is coded 4 (3 is skipped, and it sorts
    # above 'Age_higher'). Kept as-is so existing results do not change -
    # confirm whether an ordered encoding (very young lowest) was intended.
    "Age_cat": {'Age_Young': 0, 'Age_Middle_Age': 1, 'Age_higher': 2, 'Age_Very_young': 4},
    "Abs_cat": {'Abs_low': 0, 'Abs_Med': 1, 'Abs_High': 2},
}

for column, codes in ordinal_codes.items():
    if pd.api.types.is_numeric_dtype(data[column]):
        # Already numeric (e.g. this cell was re-run): leave the column alone.
        continue

    # `.map` is used instead of `.replace` because pandas 2.2+ deprecates the
    # implicit object-to-int downcast that `.replace` relied on here.
    encoded = data[column].map(codes)

    # Anything not listed in the mapping becomes NaN; fail loudly here rather
    # than letting an unexpected category reach the model.
    unmapped = data.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in {column!r}: {list(unmapped)}")

    data[column] = encoded.astype("int64")


# Select predictors and target variable
predictors = [
    "Month_of_absence",
    "Day_of_the_week",
    "Social_drinker",
    "Social_smoker",
    "Pet",
    "Trans_expense_cat",
    "Dist_to_work",
    "Age_cat",
]
target = "Abs_cat"
X = data[predictors]
y = data[target]

In [11]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# shuffle (and therefore the split) the same on every run.
# NOTE(review): the split is not stratified on y - confirm that is intended.
split = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)
X_train, X_test, y_train, y_test = split

In [12]:
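# NOTE: random oversampling is currently disabled. Everything below is
# commented out, so the later cells fit on the original X_train / y_train.
# from imblearn.over_sampling import RandomOverSampler

# # Define the oversampler
# oversampler = RandomOverSampler(random_state=42)

# # Fit and apply the oversampler to the training data
# X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)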


In [13]:
# Fit a single decision tree on the training split as-is. The oversampling
# code is commented out, so no resampled data is involved here.
tree_seed = 42
clf = DecisionTreeClassifier(random_state=tree_seed)
clf.fit(X_train, y_train)

In [14]:
# Evaluate the fitted decision tree on the held-out test split.
# Abs_cat is encoded as 0 = Abs_low, 1 = Abs_Med, 2 = Abs_High.
class_labels = [0, 1, 2]
abs_high_code = 2

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Passing `labels` pins the order of the per-class precision array. Without it
# sklearn only reports labels that occur in y_test or y_pred, so a missing
# class would shift the positions and index 2 would no longer mean Abs_High.
prec = precision_score(
    y_test, y_pred, labels=class_labels, average=None, zero_division=0
)

# Same fixed label order for the confusion matrix
# (rows = true class, columns = predicted class).
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

# Print the results
print("Accuracy:", acc)
print("Precision for Abs_cat=Abs_high:", prec[class_labels.index(abs_high_code)])
print("Confusion matrix:")
print(conf_mat)

Accuracy: 0.5720720720720721
Precision for Abs_cat=Abs_high: 0.4691358024691358
Confusion matrix:
[[86  4 37]
 [ 9  3  6]
 [29 10 38]]


## Random forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 500-tree random forest on the same X_train / y_train split; the fixed
# random_state makes the fitted ensemble repeatable.
forest_params = {"n_estimators": 500, "random_state": 42}
rf = RandomForestClassifier(**forest_params)
rf.fit(X_train, y_train)



In [16]:
# Evaluate the fitted random forest on the held-out test split.
# Abs_cat is encoded as 0 = Abs_low, 1 = Abs_Med, 2 = Abs_High.
class_labels = [0, 1, 2]
abs_high_code = 2

y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Passing `labels` pins the order of the per-class precision array. Without it
# sklearn only reports labels that occur in y_test or y_pred, so a missing
# class would shift the positions and index 2 would no longer mean Abs_High.
prec = precision_score(
    y_test, y_pred, labels=class_labels, average=None, zero_division=0
)

# Same fixed label order for the confusion matrix
# (rows = true class, columns = predicted class).
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

print("Accuracy:", acc)
print("Precision for Abs_cat=Abs_high:", prec[class_labels.index(abs_high_code)])
print("Confusion matrix:")
print(conf_mat)

Accuracy: 0.6126126126126126
Precision for Abs_cat=Abs_high: 0.5125
Confusion matrix:
[[92  3 32]
 [ 8  3  7]
 [29  7 41]]
