# **Project: Airfare Price Prediction**

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the train and test splits; the first column of each CSV becomes the index
train_set, test_set = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("train_data.csv", "test_data.csv")
)

## Data Analysis

In [3]:
# Prints the shape of the train_data and test_data datasets.
# print(train_set.shape,  test_set.shape)

# Displays the first few rows of the train_data and test_data datasets.
# train_set.head()
# test_set.head()

# Calculates and displays the number of missing values in each column of the train_data and test_data datasets.
# train_set.isnull().sum()
# test_set.isnull().sum()

# Displays a concise summary (column dtypes and non-null counts) of the train_data and test_data datasets.
# train_set.info()
# test_set.info()

# Displays descriptive statistics of the train_data and test_data datasets
# train_set.describe()
# test_set.describe()

## Data Processing

In [4]:
# Integer codes for the two ordered categorical columns, shared by train and test
# so both splits are always encoded the same way.
ORDINAL_ENCODINGS = {
    'stops': {'zero': 0, 'one': 1, 'two_or_more': 2},
    'class': {'Economy': 0, 'Business': 1},
}


def encode_ordinal_columns(frame):
    """Return a copy of `frame` with 'stops' and 'class' mapped to integer codes.

    A column that is already numeric is left untouched, so re-running this cell
    does not push previously encoded values through the string-keyed mapping
    (which would turn every value into NaN). Labels that are absent from the
    mapping become NaN, as with a plain ``Series.map``.
    """
    encoded = frame.copy()
    for column, mapping in ORDINAL_ENCODINGS.items():
        if not pd.api.types.is_numeric_dtype(encoded[column]):
            encoded[column] = encoded[column].map(mapping)
    return encoded


# Preprocess train data: split off the target, then encode the ordinal columns
X_train = encode_ordinal_columns(train_set.drop('price', axis=1))
y_train = train_set.price

# Preprocess test data. The name `test_set` is kept because later cells read it;
# the helper makes this rebinding idempotent across re-runs of the cell.
test_set = encode_ordinal_columns(test_set)

# Create a pipeline
# Columns are routed by dtype: numeric ones (including the freshly encoded
# 'stops' and 'class') go to the numeric branch, object ones to the categorical branch.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=[object]).columns.tolist()

# Numeric branch: fill gaps with the column median, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical branch: fill gaps with the most frequent label, then one-hot encode;
# categories unseen during fit are ignored at transform time instead of raising
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Model Training

In [5]:
# Full pipeline: preprocessing followed by a random forest.
# random_state is fixed so the forest, the metrics below and the final
# submission are reproducible across kernel restarts.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

# Out-of-fold predictions from 5-fold cross-validation: every training row is
# predicted by a pipeline that did not see it during fitting.
# cross_val_predict works on clones, so `model` itself is still unfitted afterwards.
cross_tree = cross_val_predict(model, X_train, y_train, cv=5)

# Calculate and print the Mean Absolute Error (MAE), Root Mean Squared Error (RMSE) and R-Squared
# RMSE is taken as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print(f'MAE: {mean_absolute_error(y_train, cross_tree):.2f}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_train, cross_tree)):.2f}')
print(f'R-squared: {r2_score(y_train, cross_tree):.2f}')

MAE: 1621.81
RMSE: 3436.89
R-squared: 0.98


In [6]:
# Train the complete pipeline (preprocessing + regressor) on every labelled row
model.fit(X_train, y_train)

# Push the test features through the preprocessing step fitted above
fitted_preprocessor = model.named_steps['preprocessor']
test_scaled = fitted_preprocessor.transform(test_set)

# Predict prices for the transformed test matrix with the fitted regressor
fitted_regressor = model.named_steps['regressor']
pred = fitted_regressor.predict(test_scaled)

In [7]:
# Load the submission template (first CSV column as index) and overwrite its
# price column with the predictions, assigned positionally
sample_solution = pd.read_csv('sample_solution.csv', index_col=0)
sample_solution['price'] = pred
sample_solution.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
1,53312.39
2,62256.96
3,23319.24
4,2359.37
5,5643.37


In [8]:
# Write the filled-in submission frame (index included) to disk as CSV
sample_solution.to_csv(path_or_buf='AAP_Solutions_V1.csv')