In [56]:
#IMPORT LIBRARIES
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import joblib

In [57]:
#Import the data
df=pd.read_csv("datasets/cleandf.csv")

In [58]:
pd.set_option('display.max_columns', None)

## FLIGHT RECOMMENDER

In [59]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

In [60]:
df.columns

Index(['Unnamed: 0', 'AIRLINENAME', 'CABINTYPE', 'DATEFLOWN', 'DATEPUB',
       'ENTERTAINMENTRATING', 'FOODRATING', 'GROUNDSERVICERATING',
       'ORIGINCOUNTRY', 'OVERALLSCORE', 'RECOMMENDED', 'REVIEW', 'ROUTE',
       'ORIGIN', 'DESTINY', 'SCALE_YN', 'SEATCOMFORTRATING', 'SERVICERATING',
       'SLUG', 'TITLE', 'TRAVELLER', 'PURPOSE', 'TRIPVERIFIED', 'VALUERATING',
       'WIFIRATING', 'UNIQUE_ID', 'LONGDISTANCE', 'YEAR'],
      dtype='object')

In [61]:
# Keep only the rating columns: the overall score plus the seven
# category ratings.
data = df[
    [
        "OVERALLSCORE",
        "ENTERTAINMENTRATING",
        "FOODRATING",
        "GROUNDSERVICERATING",
        "SEATCOMFORTRATING",
        "SERVICERATING",
        "VALUERATING",
        "WIFIRATING",
    ]
]

In [62]:
data.shape

(90008, 8)

In [63]:
# Target (dependent variable): the overall score.
y = data["OVERALLSCORE"]
# Features (independent variables): every remaining rating column.
X = data.drop(columns=["OVERALLSCORE"])

In [64]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Hyperparameter tuning of the random forest (kept for reference).
#
# The exhaustive search fits 324 candidates x 5 folds = 1620 models, so it is
# switched off by default. Set RUN_GRID_SEARCH = True to run it again.
RUN_GRID_SEARCH = False

# Parameter grid explored by the search
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    # 1.0 (use all features) replaces the former 'auto' option, which meant the
    # same thing for regressors and was removed in scikit-learn 1.3.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

if RUN_GRID_SEARCH:
    # Imported here so the cell stays runnable on its own when enabled
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestRegressor

    # Instantiate the model
    rf = RandomForestRegressor()

    # Configure the grid search (5-fold CV, all cores)
    grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)

    # Fit every candidate on the training split
    grid_search_rf.fit(X_train, y_train)

    # Results
    print(f"Best Parameters: {grid_search_rf.best_params_}")
    print(f"Best Score: {grid_search_rf.best_score_}")


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Score: 0.9095684249802594


In [65]:
from sklearn.ensemble import RandomForestRegressor

In [66]:
# Best hyperparameters reported by the grid search
best_params={'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
# Seed the forest so that re-running the notebook reproduces the same model
# (the train/test split is already seeded with random_state=42).
modelrf=RandomForestRegressor(**best_params, random_state=42)
modelrf.fit(X_train, y_train)

In [74]:
# Persist the fitted random forest to disk with joblib.
# NOTE(review): the file name says "class", but the object saved here is the
# RandomForestRegressor fitted in this notebook; the name is presumably kept
# because other code loads the model from this path - confirm before renaming.
joblib.dump(modelrf, "models/random_forest_class.joblib")

['models/random_forest_class.joblib']

In [75]:
#load the saved model
loaded_rf = joblib.load("models/random_forest_class.joblib")

In [76]:
#data=df[["ENTERTAINMENTRATING", "FOODRATING", "GROUNDSERVICERATING", "SEATCOMFORTRATING", "SERVICERATING", "VALUERATING", "WIFIRATING"]]

In [77]:
X.sample()

Unnamed: 0,ENTERTAINMENTRATING,FOODRATING,GROUNDSERVICERATING,SEATCOMFORTRATING,SERVICERATING,VALUERATING,WIFIRATING
72787,0,0,1,0,0,1,0


In [78]:
# Export the feature matrix X to CSV (the path suggests it feeds a Streamlit
# app - TODO confirm).
# NOTE: DataFrame.to_csv returns None when it is given a path, so
# df_forclassif is always None; the row index is written as an unnamed first column.
df_forclassif=X.to_csv("datasets/streamlitdb/df_forclassif.csv")

In [79]:
# Example ratings for one flight, in the column order of X:
# entertainment, food, ground service, seat comfort, service, value, wifi.
my_flight_values = [
    3, 3, 5, 5, 1, 4, 2,
]

In [80]:
df.OVERALLSCORE.value_counts()

OVERALLSCORE
1     38934
2      9341
10     8966
9      7365
8      6742
3      5519
7      4432
4      3087
5      2950
6      2672
Name: count, dtype: int64

In [81]:
# One-row frame holding the user's ratings, with the same columns as the training features
my_flight_df = pd.DataFrame(data=[my_flight_values], columns=X.columns)

# Predict the overall score and round it to the nearest whole score
predicted_score = round(loaded_rf.predict(my_flight_df)[0])
print(f"The Overall Score of this airline is {predicted_score} for you!!!")

# Average overall score of each airline
df1 = df.groupby(['AIRLINENAME'])['OVERALLSCORE'].mean().reset_index()

# Show the 5 airlines whose average score is closest to the prediction.
# Filtering on exact equality would only match airlines whose mean happens to
# be a whole number (and print an empty frame otherwise). Ranking by absolute
# distance always yields results; exact matches have distance 0, so they still
# come first, in their original row order (keep='first').
score_gap = (df1['OVERALLSCORE'] - predicted_score).abs()
df2 = df1.loc[score_gap.nsmallest(5, keep='first').index]
print(df2)

The Overall Score of this airline is 8 for you!!!
                  AIRLINENAME  OVERALLSCORE
30                  Air Costa           8.0
105             Badr Airlines           8.0
189             Felix Airways           8.0
239                      Jazz           8.0
297  Mann Yadanarpon Airlines           8.0


### Recommendation System

In [82]:
len(df.AIRLINENAME.unique())

514

In [83]:
df.sample()

Unnamed: 0.1,Unnamed: 0,AIRLINENAME,CABINTYPE,DATEFLOWN,DATEPUB,ENTERTAINMENTRATING,FOODRATING,GROUNDSERVICERATING,ORIGINCOUNTRY,OVERALLSCORE,RECOMMENDED,REVIEW,ROUTE,ORIGIN,DESTINY,SCALE_YN,SEATCOMFORTRATING,SERVICERATING,SLUG,TITLE,TRAVELLER,PURPOSE,TRIPVERIFIED,VALUERATING,WIFIRATING,UNIQUE_ID,LONGDISTANCE,YEAR
59978,60189,Ryanair,1,2019-11-01,18th November 2019,0,0,1,UnitedKingdom,1,0,Bologna to Stansted. When I did the check in f...,Bologna to Stansted,Bologna,Stansted,0,1,2,ryanair,have to pay one more time,2,1,1,2,0,ae7602c2-b08c-4f0e-adbb-909c93089a37,0,2019


In [84]:
#use the mean values for each airline
recommendator=df.groupby("AIRLINENAME").mean(numeric_only=True)

In [85]:
recommendator=recommendator.round(2)

In [86]:
num=df[["AIRLINENAME", "ENTERTAINMENTRATING", "FOODRATING", "GROUNDSERVICERATING", "OVERALLSCORE", "SEATCOMFORTRATING", "SERVICERATING", "VALUERATING", "WIFIRATING"]]

In [87]:
# Remove the columns that are not needed for this analysis, in a single call.
recommendator.drop(
    columns=["CABINTYPE", "TRAVELLER", "TRIPVERIFIED", "SCALE_YN", "RECOMMENDED"],
    inplace=True,
)


In [88]:
# Keep a single row per airline: the first row found for each AIRLINENAME.
num = num.drop_duplicates(subset="AIRLINENAME", keep="first")

In [89]:
num.reset_index(drop=True, inplace=True)

In [90]:
num.sample()

Unnamed: 0,AIRLINENAME,ENTERTAINMENTRATING,FOODRATING,GROUNDSERVICERATING,OVERALLSCORE,SEATCOMFORTRATING,SERVICERATING,VALUERATING,WIFIRATING
144,Central Mountain Air,1,1,1,1,1,1,1,1


In [91]:
num.shape

(514, 9)

In [92]:
num.drop('AIRLINENAME', axis=1, inplace=True)

In [93]:
recommendator['AIRLINE']=recommendator.index

In [94]:
recommendator.sample()

Unnamed: 0_level_0,Unnamed: 0,ENTERTAINMENTRATING,FOODRATING,GROUNDSERVICERATING,OVERALLSCORE,SEATCOMFORTRATING,SERVICERATING,PURPOSE,VALUERATING,WIFIRATING,LONGDISTANCE,YEAR,AIRLINE
AIRLINENAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Nature Air,54132.62,0.0,0.0,1.31,2.23,1.08,1.0,1.0,1.46,0.0,0.0,2017.69,Nature Air


In [95]:
num.shape

(514, 8)

In [96]:
df.reset_index(drop=True, inplace=True)
num.reset_index(drop=True, inplace=True)

In [97]:
recommendator.OVERALLSCORE.value_counts()

OVERALLSCORE
1.00     26
10.00    11
5.00     11
8.00     10
3.00      9
         ..
8.14      1
1.63      1
7.92      1
2.37      1
3.07      1
Name: count, Length: 283, dtype: int64

In [98]:
recommendator=recommendator.drop(columns="Unnamed: 0")

In [99]:
recommendator.reset_index()

Unnamed: 0,AIRLINENAME,ENTERTAINMENTRATING,FOODRATING,GROUNDSERVICERATING,OVERALLSCORE,SEATCOMFORTRATING,SERVICERATING,PURPOSE,VALUERATING,WIFIRATING,LONGDISTANCE,YEAR,AIRLINE
0,AB Aviation,0.00,2.00,2.00,3.67,2.67,2.67,1.00,2.33,0.00,0.00,2019.00,AB Aviation
1,ANA All Nippon Airways,3.34,3.86,4.26,8.14,4.01,4.50,0.89,4.21,1.17,0.85,2017.69,ANA All Nippon Airways
2,ASKY Airlines,0.59,1.35,1.59,2.29,1.82,1.65,0.76,1.35,0.24,0.35,2020.00,ASKY Airlines
3,ATA Airlines,0.00,1.50,1.50,1.50,1.00,1.50,0.50,2.00,0.00,0.00,2016.50,ATA Airlines
4,Adria Airways,0.82,1.46,3.05,4.77,3.13,3.10,0.74,2.62,0.49,0.38,2016.85,Adria Airways
...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,euroAtlantic Airways,1.33,2.00,1.67,2.13,1.40,1.93,0.93,1.47,0.67,0.87,2018.20,euroAtlantic Airways
510,fastjet,0.27,1.03,2.15,3.70,2.55,2.48,0.88,2.27,0.15,0.21,2017.21,fastjet
511,flyadeal,0.21,0.68,1.26,1.89,1.26,1.32,0.74,1.74,0.21,0.21,2020.26,flyadeal
512,flybe,0.00,2.25,3.50,5.75,2.50,3.25,1.00,3.25,0.00,0.00,2022.00,flybe


In [100]:
recommendator

Unnamed: 0_level_0,ENTERTAINMENTRATING,FOODRATING,GROUNDSERVICERATING,OVERALLSCORE,SEATCOMFORTRATING,SERVICERATING,PURPOSE,VALUERATING,WIFIRATING,LONGDISTANCE,YEAR,AIRLINE
AIRLINENAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AB Aviation,0.00,2.00,2.00,3.67,2.67,2.67,1.00,2.33,0.00,0.00,2019.00,AB Aviation
ANA All Nippon Airways,3.34,3.86,4.26,8.14,4.01,4.50,0.89,4.21,1.17,0.85,2017.69,ANA All Nippon Airways
ASKY Airlines,0.59,1.35,1.59,2.29,1.82,1.65,0.76,1.35,0.24,0.35,2020.00,ASKY Airlines
ATA Airlines,0.00,1.50,1.50,1.50,1.00,1.50,0.50,2.00,0.00,0.00,2016.50,ATA Airlines
Adria Airways,0.82,1.46,3.05,4.77,3.13,3.10,0.74,2.62,0.49,0.38,2016.85,Adria Airways
...,...,...,...,...,...,...,...,...,...,...,...,...
euroAtlantic Airways,1.33,2.00,1.67,2.13,1.40,1.93,0.93,1.47,0.67,0.87,2018.20,euroAtlantic Airways
fastjet,0.27,1.03,2.15,3.70,2.55,2.48,0.88,2.27,0.15,0.21,2017.21,fastjet
flyadeal,0.21,0.68,1.26,1.89,1.26,1.32,0.74,1.74,0.21,0.21,2020.26,flyadeal
flybe,0.00,2.25,3.50,5.75,2.50,3.25,1.00,3.25,0.00,0.00,2022.00,flybe


In [101]:
num.sample()

Unnamed: 0,ENTERTAINMENTRATING,FOODRATING,GROUNDSERVICERATING,OVERALLSCORE,SEATCOMFORTRATING,SERVICERATING,VALUERATING,WIFIRATING
319,0,0,0,1,0,0,1,0


In [102]:
recommendator.reset_index(drop=True, inplace=True)

In [103]:
# Save the per-airline table to CSV.
# NOTE: DataFrame.to_csv returns None when it is given a path, so df_forrecomm
# is always None. The index was reset (drop=True) just before this cell, so the
# unnamed first CSV column is a plain row counter and the airline name is
# carried only by the AIRLINE column.
df_forrecomm=recommendator.to_csv("datasets/streamlitdb/df_forrecomm.csv")

In [104]:
# Number of non-null reviews per airline, as a two-column frame
# (AIRLINENAME, REVIEW).
grouped_ailines = (
    df.groupby("AIRLINENAME")["REVIEW"]
    .count()
    .reset_index()
)

In [105]:
grouped_ailines.sort_values('REVIEW', ascending=False)

Unnamed: 0,AIRLINENAME,REVIEW
80,American Airlines,4912
410,Spirit Airlines,3972
468,United Airlines,3721
126,British Airways,2723
200,Frontier Airlines,2700
...,...,...
209,Grand Cru Airlines,1
189,Felix Airways,1
170,EWA Air,1
168,Dynamic International Airways,1


In [106]:
# Keep only the most-reviewed airlines: more than 2000 reviews each
# (6 airlines in this dataset).
has_many_reviews = grouped_ailines["REVIEW"] > 2000
grouped_ailines = grouped_ailines[has_many_reviews]

In [107]:
grouped_ailines.shape

(6, 2)

In [108]:
grouped_ailines.AIRLINENAME.unique()

array(['American Airlines', 'British Airways', 'Delta Air Lines',
       'Frontier Airlines', 'Spirit Airlines', 'United Airlines'],
      dtype=object)

In [109]:
recommendator.reset_index(drop=True, inplace=True)

In [110]:
#define the recommender itself:
def best_airline_recommender(airline_name=None, n_recommendations=5):
    """Recommend the airlines whose ratings are closest to a given airline.

    Parameters
    ----------
    airline_name : str, optional
        Airline to look up in ``recommendator["AIRLINE"]``. When omitted, the
        name is requested interactively with ``input()``.
    n_recommendations : int, default 5
        Number of airlines to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``AIRLINE`` and ``OVERALLSCORE`` of the recommended airlines,
        sorted by ``OVERALLSCORE`` (highest first) and indexed from 1.

    Raises
    ------
    ValueError
        If ``n_recommendations`` is smaller than 1 or the airline is unknown.

    Notes
    -----
    Uses the notebook-level frames ``recommendator`` and ``num`` and assumes
    that row *i* of both describes the same airline.
    NOTE(review): ``num`` keeps the first row per airline in the order the
    airlines appear in ``df``, while ``recommendator`` is ordered by the
    groupby on AIRLINENAME - the two only line up if ``df`` is sorted by
    airline name; confirm.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")

    if airline_name is None:
        airline_name = input("You want to be recommended with airlines similar to: ",)

    # Locate the airline by position: exact match first, then a forgiving
    # match that ignores case and surrounding whitespace.
    airline_names = recommendator["AIRLINE"]
    matches = (airline_names == airline_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        wanted = airline_name.strip().casefold()
        normalised = airline_names.str.strip().str.casefold()
        matches = (normalised == wanted).to_numpy().nonzero()[0]
    if len(matches) == 0:
        # Fail with a clear message instead of an opaque empty-array error
        # from the neighbour search.
        raise ValueError(
            f"Unknown airline {airline_name!r}; "
            "use a name from recommendator['AIRLINE']."
        )
    query_pos = int(matches[0])

    # Calculate similarities: ask for one extra neighbour because the queried
    # airline is normally its own nearest neighbour.
    n_neighbors = min(n_recommendations + 1, len(num))
    search = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(num)
    _, queried_indices = search.kneighbors(num.iloc[[query_pos]])

    # Drop the queried airline explicitly rather than assuming it comes first:
    # airlines with identical ratings tie at distance 0 in arbitrary order.
    neighbour_positions = [
        int(pos) for pos in queried_indices[0] if pos != query_pos
    ][:n_recommendations]

    # Top recommendations, best overall score first
    df_rec = recommendator.iloc[neighbour_positions]
    df_rec = df_rec.sort_values(by=['OVERALLSCORE'], ascending=False)
    df_rec.index = range(1, len(df_rec) + 1)
    df_rec = df_rec[['AIRLINE','OVERALLSCORE']]
    print("Then you should try out these airlines: ")
    return df_rec

In [111]:
best_airline_recommender() #ok! 

Then you should try out these airlines: 


Unnamed: 0,AIRLINE,OVERALLSCORE
1,Mahan Air,8.33
2,Germania Airline,5.33
3,Bamboo Airways,4.88
4,Juneyao Airlines,4.23
5,Ryanair,3.72
