In [None]:
# Importing the required packages
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 1: load the training data straight from the project's GitHub repository
TRAIN_DATA_URL = "https://raw.githubusercontent.com/amirbek-akramov/AviaTicketsPredictorProject/main/train_data.csv"
train_df = pd.read_csv(TRAIN_DATA_URL)

# Show the loaded frame as the cell output
train_df

In [None]:
# Dimensions of the training DataFrame as a (rows, columns) tuple
train_df.shape

In [None]:
# Column names, dtypes and non-null counts of the training DataFrame
train_df.info()

In [None]:
# Summary statistics of the training DataFrame
train_df.describe()

In [None]:
# Missing-value count for every column of the training data
train_df.isna().sum()

In [None]:
# Step 2: load the test data from the same GitHub repository
TEST_DATA_URL = "https://raw.githubusercontent.com/amirbek-akramov/AviaTicketsPredictorProject/main/test_data.csv"
test_df = pd.read_csv(TEST_DATA_URL)

# Show the loaded frame as the cell output
test_df

In [None]:
# Dimensions of the test DataFrame as a (rows, columns) tuple
test_df.shape

In [None]:
# Column names, dtypes and non-null counts of the test DataFrame
test_df.info()

In [None]:
# Summary statistics of the test DataFrame
test_df.describe()

In [None]:
# Missing-value count for every column of the test data
test_df.isna().sum()

In [None]:
# List the training column names as a one-column table
pd.DataFrame(train_df.columns.to_list())

In [None]:
# Three-panel overview of the training data
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Left panel: price per source city, one bar per ticket class (error bars off)
sns.barplot(
    x='source_city',
    y='price',
    hue='class',
    data=train_df,
    palette='mako',
    errorbar=None,
    ax=axes[0],
)
axes[0].set_title('Price of flight to direct city')

# Middle panel: histogram of flight duration with a KDE curve on top
sns.histplot(train_df['duration'], bins=25, kde=True, ax=axes[1])
axes[1].set_title('Duration')

# Right panel: share of rows in each ticket class
class_counts = train_df['class'].value_counts()
class_counts.plot.pie(ax=axes[2], autopct='%1.1f%%')
axes[2].set_title('Class')
axes[2].set_ylabel(" ")  # replaces the y-axis label with a single space

In [None]:
# Three-panel figure: stop counts per class, price by stops and class,
# and the share of each destination city.
fig, axes = plt.subplots(1,3, figsize=(15, 5))

# Stops
sns.countplot(data=train_df, x='stops', hue='class', palette='mako', ax=axes[0])
axes[0].set_title("Stops in different classes")


# Classes
sns.violinplot(x='stops', y='price', hue='class', palette='mako', gap=.1, split=True, inner="quart", data=train_df, ax=axes[1])
axes[1].set_title('Classes')

# Destination city
# Count once and reuse; draw on axes[2] explicitly instead of relying on
# pyplot's implicit "current axes", so the pie always lands in the third panel.
destination_counts = train_df['destination_city'].value_counts()
colors = sns.color_palette('Set2', len(destination_counts))
axes[2].pie(destination_counts, labels=destination_counts.index, autopct='%1.1f%%', startangle=140, colors=colors, wedgeprops={'edgecolor': 'black'})
axes[2].set_title("Destination city")

In [None]:
# Histogram grid for the training DataFrame (50 bins per histogram)
train_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
str_columns =  ["airline", 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

# LabelEncoder
label_encoder = LabelEncoder()


for column in str_columns:
    train_df[column] = label_encoder.fit_transform(train_df[column])

train_df

In [None]:
# LabelEncoder
label_encoder = LabelEncoder()


for column in str_columns:
    test_df[column] = label_encoder.fit_transform(test_df[column])

test_df

In [None]:
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# otherwise `id` silently stays among the training features.
# errors='ignore' keeps this cell re-runnable once the column is gone.
# NOTE(review): assumes `id` is a row identifier with no predictive meaning — confirm.
train_df = train_df.drop(columns='id', errors='ignore')

# test_df keeps its `id` column because the submission cell reads test_df['id'];
# this line only previews the test data without the identifier.
test_df.drop(columns='id')

In [None]:
# Separate the target from the predictors, then hold out part of the rows
y = train_df['price']
X = train_df.drop(columns=['price'])

# 20% of the rows go to the held-out split; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Instantiate the two models to compare (they are fitted in the next cell).
# A fixed random_state makes the forests' random sampling repeatable, so the
# reported metrics are the same on every Restart & Run All.
RFR = RandomForestRegressor(random_state=0)
RFC = RandomForestClassifier(random_state=0)

In [None]:
# Fit both forests on the same training split.
RFR.fit(X_train, y_train)
# NOTE(review): y_train is taken from the `price` column, so the classifier
# treats every distinct price value as its own class label — confirm this
# comparison is intended.
RFC.fit(X_train, y_train)

In [None]:
# Score the held-out rows with each fitted forest
y_pred_rgr = RFR.predict(X_test)
y_pred_clf = RFC.predict(X_test)

In [None]:
# Evaluate the RandomForestClassifier predictions on the held-out split.
# The heading now names the model whose predictions (y_pred_clf) are scored.
print("\nModel Evaluation for Classifier:")
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE:", round(np.sqrt(metrics.mean_squared_error(y_test, y_pred_clf))))
print("MAE:", round(metrics.mean_absolute_error(y_test, y_pred_clf)))

In [None]:
# Evaluate the RandomForestRegressor predictions on the held-out split.
# The heading now names the model whose predictions (y_pred_rgr) are scored.
print("\nModel Evaluation for Regressor:")
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE:", round(np.sqrt(metrics.mean_squared_error(y_test, y_pred_rgr))))
print("MAE:", round(metrics.mean_absolute_error(y_test, y_pred_rgr)))

In [None]:
# Build the submission from predictions on the actual test set.
# y_pred_rgr holds predictions for X_test (the held-out part of train_df), so
# its rows do not correspond to the ids in test_df and must not be paired with them.
test_features = test_df[X.columns]  # same columns, in the same order, as the training features
submission = pd.DataFrame({
    'id': test_df['id'],
    'price': RFR.predict(test_features),
})

In [None]:
# Write the submission to disk; index=False leaves the DataFrame index out of the CSV
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back to check what was saved
df = pd.read_csv('submission.csv')
df