In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import preprocessing

In [None]:
# Read the train and test CSV files and stack them row-wise into one frame.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file's own row labels are kept, so the same labels appear twice and any
# later label-based alignment or lookup becomes ambiguous.
df = pd.concat([train, test], axis=0, ignore_index=True)
# Drop the 'Unnamed: 0' and 'id' columns so they are not carried along as
# features (presumably an exported row index and a record identifier --
# confirm against the CSV files).
df.drop(['Unnamed: 0', 'id'], axis=1, inplace=True)

In [None]:
# Quick sanity check of the combined data: its dimensions, the number of
# missing values in each column, and how many rows are exact duplicates.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()
print("The Shape:", df.shape)
print("The NULL values:\n", missing_per_column)
print("Number of duplicated values", duplicate_row_count)

In [None]:
# Remove every row that contains at least one missing value.
# The frame is modified in place, so `df` keeps referring to the same object.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with its default
# arguments; the table is the cell's displayed output.
summary_stats = df.describe()
summary_stats

In [None]:
# Print a concise overview of the frame (index, columns, dtypes, memory use).
# verbose=None is the default and defers to the pandas display options.
df.info(verbose=None)

In [None]:
# Bar chart of how many rows fall into each 'satisfaction' class.
# The explicit figure/axes interface is used so the plot target is unambiguous.
sat_fig, sat_ax = plt.subplots(figsize=(10, 8), facecolor='azure')
sat_ax.set_title('Satisfaction')
sns.countplot(data=df, x='satisfaction', ax=sat_ax)
plt.tight_layout()
plt.show()

In [None]:
# Pull out the object-dtype columns; these are the ones treated as
# categorical variables and label-encoded in the next step.
df_categorical = df.select_dtypes(include='object')

In [None]:
# Label-encode every categorical column and put the encoded columns back
# into `df` in place of the original text columns.
enc = preprocessing.LabelEncoder()
# The same encoder instance is re-fitted on each column in turn, so after this
# line `enc` only remembers the classes of the last column it processed.
df_categorical = df_categorical.apply(lambda column: enc.fit_transform(column))
# Remove the original text columns, then append the encoded versions on the
# right-hand side of the frame.
non_categorical = df.drop(columns=df_categorical.columns)
df = pd.concat([non_categorical, df_categorical], axis=1)

In [None]:
# Show the column names of the dataset as a plain Python list.
# to_list has to be called: a bare `df.columns.to_list` evaluates to the bound
# method object, so the cell would display the method instead of the names.
df.columns.to_list()

In [None]:
# Separate the target column from the predictors:
# y holds 'satisfaction', X holds every remaining column.
target_col = 'satisfaction'
y = df[target_col]
X = df.drop(columns=target_col)

In [None]:
# Hold out 30% of the rows for testing and keep 70% for training.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# 1. Logistic Regression
# Build the classifier (at most 1000 solver iterations, fixed seed) and fit it
# on the training split in one chained call; fit() returns the estimator.
log = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
log

In [None]:
# Class predictions of the fitted Logistic Regression model for the held-out
# test features.
log_pred = log.predict(X=X_test)

In [None]:
# Report test-set performance of the Logistic Regression model.
# The header names this model only, so the numbers below cannot be mistaken
# for the Random Forest results.
# Each score compares the true test labels with `log_pred` and is rounded to
# three decimals; precision, recall and F1 are called with their defaults.
print("Logistic Regression Matrices:")
print("Accuracy Score:", round(metrics.accuracy_score(y_test, log_pred), 3))
print('Precission_score:', round(metrics.precision_score(y_test, log_pred), 3))
print('Recall score:', round(metrics.recall_score(y_test, log_pred), 3))
print('F1-score:', round(metrics.f1_score(y_test, log_pred), 3))

In [None]:
# 2. Random Forest Classifier
# Build the forest with a fixed seed and fit it on the training split in one
# chained call; fit() returns the estimator.
rand = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
rand

In [None]:
# Class predictions of the fitted Random Forest model for the held-out test
# features.
rand_pred = rand.predict(X=X_test)

In [None]:
# Report test-set performance of the Random Forest model.
# Each entry pairs the printed label with the sklearn scoring function; every
# score compares the true test labels with `rand_pred`, rounded to 3 decimals.
print("Random Forest Classifier Matrices:")
rand_scores = (
    ("Accuracy Score:", metrics.accuracy_score),
    ('Precission_score:', metrics.precision_score),
    ('Recall score:', metrics.recall_score),
    ('F1-score:', metrics.f1_score),
)
for label, score_fn in rand_scores:
    print(label, round(score_fn(y_test, rand_pred), 3))

In [None]:
# Confusion matrices of both models on the test set, drawn side by side.
log_cm = metrics.confusion_matrix(y_test, log_pred)
rand_cm = metrics.confusion_matrix(y_test, rand_pred)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), facecolor='azure')
# One row per panel: target axes, matrix, title, and whether to draw a colour
# bar (only the left panel shows one).
panels = (
    (ax1, log_cm, 'Logistic Regression Confusion Matrix', True),
    (ax2, rand_cm, 'Random Forest Classifier Confusion Matrix', False),
)
for axis, matrix, title, show_cbar in panels:
    sns.heatmap(matrix, annot=True, fmt='d', ax=axis, cmap='Blues', cbar=show_cbar)
    axis.set_title(title)
    axis.set_xlabel('Predicted')
    axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations among the training features and the
# target, computed on the training split only and rounded to two decimals.
train_with_target = pd.concat([X_train, y_train], axis=1)
corr = train_with_target.corr().round(2)
corr_fig, corr_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, ax=corr_ax)
corr_ax.set_title("Feature Correlation Matrix")
plt.tight_layout()
plt.show()