In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the test set first; its first CSV column is used as the row index.
row_test = pd.read_csv(filepath_or_buffer='test_dataset.csv', index_col=0)
# Load the training set with pandas' default integer index.
# NOTE(review): unlike the test set, no index column is consumed here -- confirm
# both files have matching feature columns after loading.
row_train = pd.read_csv(filepath_or_buffer="train_dataset.csv")

In [None]:
# Discard fully duplicated training rows, retaining the first occurrence of each
row_train = row_train.drop_duplicates(keep='first')

In [None]:
# Class balance of the target: number of rows per 'satisfaction' value
target_counts = row_train['satisfaction'].value_counts()
target_counts

In [None]:
# Fill missing values in 'Arrival Delay in Minutes' column with the median.
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place form
# operates on a temporary Series, emits a FutureWarning on recent pandas and
# leaves `row_train` unchanged once Copy-on-Write is active.
arrival_delay_median = row_train['Arrival Delay in Minutes'].median()
row_train['Arrival Delay in Minutes'] = row_train['Arrival Delay in Minutes'].fillna(arrival_delay_median)

In [None]:
# Integer codes for the categorical features: outer key is the column name,
# inner dict maps each raw label in that column to its numeric code.
category_mapping = {
    'Gender': {'Male': 1, 'Female': 2},
    'Customer Type': {'disloyal Customer': 1, 'Loyal Customer': 2},
    'Type of Travel': {'Business travel': 2, 'Personal Travel': 1},
    'Class': {'Eco': 1, 'Eco Plus': 2, 'Business': 3},
}
# Names of the encoded columns, in the same order as the mapping's keys
categorical_cols = list(category_mapping)

In [None]:
# Swap the raw category labels for their integer codes, column by column, in place
row_train.replace(to_replace=category_mapping, inplace=True)

In [None]:
# Collapse both delay columns into two buckets:
#   (-1, 10]   -> 0
#   (10, inf]  -> -10
# `bins` and `labels` are reused later when the test set is binned.
bins = [-1, 10, np.inf]
labels = [0, -10]
for delay_col in ('Departure Delay in Minutes', 'Arrival Delay in Minutes'):
    bucketed = pd.cut(row_train[delay_col], bins=bins, labels=labels)
    # pd.cut yields a categorical; convert the bucket labels to plain integers
    row_train[delay_col] = bucketed.astype(int)

In [None]:
# Split the training frame into the target vector and the feature matrix
y = row_train['satisfaction']
X = row_train.drop(columns='satisfaction')

In [None]:
# Scale the features (the fitted scaler is reused later to transform the test set).
# NOTE(review): the scaler is fitted on all training rows before cross-validation,
# so each validation fold contributes to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Random Forest classifier.
# A fixed random_state makes the cross-validation metrics below, and the model
# fitted later on the full data, reproducible across kernel restarts.
model_random = RandomForestClassifier(n_estimators=100, random_state=42)
# Out-of-fold predictions from 5-fold cross-validation: one prediction per training row
cross_random = cross_val_predict(model_random, X_scaled, y, cv=5)

# precision_score / recall_score are called with their defaults
# -- assumes 'satisfaction' is binary with positive label 1; TODO confirm.
print(f"Accuracy: {accuracy_score(y, cross_random):.2f}")
print(f"Precision: {precision_score(y, cross_random):.2f}")
print(f"Recall: {recall_score(y, cross_random):.2f}")
print(f"Confusion matrix:\n{confusion_matrix(y, cross_random)}")

In [None]:
# Fit the Random Forest model on the entire dataset
model_random.fit(X_scaled, y)

# Fill missing values in 'Arrival Delay in Minutes' column with the median.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on the
# selected column: that chained in-place form acts on a temporary Series, emits a
# FutureWarning on recent pandas and leaves `row_test` unchanged under Copy-on-Write.
# NOTE(review): the median is taken from the test set itself, not from the training data.
test_arrival_median = row_test['Arrival Delay in Minutes'].median()
row_test['Arrival Delay in Minutes'] = row_test['Arrival Delay in Minutes'].fillna(test_arrival_median)

# Map categorical columns to numeric values (same mapping as the training set)
row_test.replace(category_mapping, inplace=True)

# Bin 'Departure Delay in Minutes' and 'Arrival Delay in Minutes' columns
# using the same `bins` / `labels` that were applied to the training set
row_test['Departure Delay in Minutes'] = pd.cut(row_test['Departure Delay in Minutes'], bins=bins, labels=labels).astype(int)
row_test['Arrival Delay in Minutes'] = pd.cut(row_test['Arrival Delay in Minutes'], bins=bins, labels=labels).astype(int)

# Scale the test dataset with the scaler already fitted on the training features
# (transform only -- the scaler is not refitted here)
test_scaled = scaler.transform(row_test)

# Make predictions on the test dataset
prediction = model_random.predict(test_scaled)
print(prediction)

In [None]:
# Load the submission template; the first CSV column becomes the DataFrame index
sample_submission = pd.read_csv(filepath_or_buffer='sample_submission.csv', index_col=0)

In [None]:
# Assign the 'prediction' values to the 'satisfaction' column in the sample_submission DataFrame.
# Item assignment is used instead of attribute assignment (`df.satisfaction = ...`):
# attribute assignment only updates a column that already exists; otherwise it sets a
# plain Python attribute and the predictions would be missing from the saved CSV.
# NOTE(review): values are assigned positionally -- assumes sample_submission rows are in
# the same order as the test set rows; confirm.
sample_submission['satisfaction'] = prediction

# Save the updated sample_submission DataFrame to a new CSV file called 'PAPS_Solutions.csv'
sample_submission.to_csv('PAPS_Solutions.csv')