In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics

In [57]:
train_df = pd.read_csv('./train_data.csv', index_col=0)
test_df = pd.read_csv('./test_data.csv', index_col=0)
train_df.head(10)

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712
6,Air_India,AI-636,Delhi,Afternoon,one,Morning,Chennai,Business,20.42,5,45257
7,Vistara,UK-834,Chennai,Evening,one,Morning,Mumbai,Economy,18.42,20,5054
8,Air_India,AI-505,Bangalore,Morning,zero,Afternoon,Delhi,Business,2.58,30,32923
9,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Hyderabad,Economy,15.92,12,11383
10,Air_India,AI-538,Chennai,Night,one,Afternoon,Mumbai,Economy,16.5,18,4357


In [58]:
X = train_df.drop('price', axis=1)
y = train_df['price']

In [59]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 1 to 20000
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           20000 non-null  object 
 1   flight            20000 non-null  object 
 2   source_city       20000 non-null  object 
 3   departure_time    20000 non-null  object 
 4   stops             20000 non-null  object 
 5   arrival_time      20000 non-null  object 
 6   destination_city  20000 non-null  object 
 7   class             20000 non-null  object 
 8   duration          20000 non-null  float64
 9   days_left         20000 non-null  int64  
dtypes: float64(1), int64(1), object(8)
memory usage: 1.7+ MB


In [60]:
cat_attributes = X.select_dtypes(include='object').columns.to_list()
num_attributes = X.select_dtypes(include=['float64','int64']).columns.to_list()

cat_attributes

['airline',
 'flight',
 'source_city',
 'departure_time',
 'stops',
 'arrival_time',
 'destination_city',
 'class']

In [61]:
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('numeric', num_pipeline, num_attributes),
    ('categorical', cat_pipeline, cat_attributes)
])

In [62]:
X_prepared = preprocessor.fit_transform(X)

In [65]:
X_prepared

<20000x1347 sparse matrix of type '<class 'numpy.float64'>'
	with 200000 stored elements in Compressed Sparse Row format>

In [66]:
# split to train and test
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.2, random_state=42)

In [67]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [68]:
coefficients = lr_model.coef_
theta0 = lr_model.intercept_

print('coefficients:', coefficients)
print('theta0:', theta0)

coefficients: [   469.0562266   -1788.74984618  -1278.93151118 ...   -295.94535922
  22509.13527324 -22509.13527393]
theta0: 27935.287078587404


In [69]:
y_pred = lr_model.predict(X_test)

In [70]:
MAE = metrics.mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

print(f"MAE: {MAE:.2f}")
print(f"RMSE: {RMSE:.2f}")


MAE: 4376.56
RMSE: 6334.64


In [71]:
# MLPR
mlpr_model = MLPRegressor(max_iter=2000)
mlpr_model.fit(X_train, y_train)



In [72]:
mlpr_model_pred = mlpr_model.predict(X_test)

In [74]:
MAE = metrics.mean_absolute_error(y_test, mlpr_model_pred)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, mlpr_model_pred))

print(f"MAE: {MAE:.2f}")
print(f"RMSE: {RMSE:.2f}")


MAE: 3708.95
RMSE: 5640.85


In [75]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(max_depth=15)
rf_model.fit(X_train, y_train)
rf_model_pred = rf_model.predict(X_test)

MAE = metrics.mean_absolute_error(y_test, rf_model_pred)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, rf_model_pred))

print(f"MAE: {MAE:.2f}")
print(f"RMSE: {RMSE:.2f}")

MAE: 2012.19
RMSE: 3748.51


### with Linear regression:
- MAE: 4667.19
- RMSE: 7048.21

### with MLPRegressor:
- MAE: 3479.03
- RMSE: 5891.41


### with RandomForestRegressor:
- MAE: 3368.19
- RMSE: 5758.30


In [27]:
# submission
train_df.shape, test_df.shape

((20000, 7), (5000, 10))

In [76]:
test_df.head()

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.0,30
4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35


In [78]:
test_df_prepared = preprocessor.transform(test_df)

y_test_predicted = rf_model.predict(test_df_prepared)

submission = pd.DataFrame({
    'id':test_df.index,
    'price':y_test_predicted
})

submission.to_csv('submission.csv', index=False)