<a href="https://colab.research.google.com/github/adityasharma10699/demopygit/blob/main/EireJet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset is
# later read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Importing the dataset and examining it.
# The CSV is read from the mounted Google Drive folder.
dataset = pd.read_csv("/content/drive/MyDrive/AS & ML/EireJet.csv")

# Lift pandas' column cap so wide frames are printed with every column
# instead of being elided with "...".
pd.set_option('display.max_columns', None)

print(dataset.head())      # first five rows
print(dataset.shape)       # (number of rows, number of columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
dataset.info()
print(dataset.describe())  # descriptive statistics of the columns

# Converting categorical features into numerical features.
# Each entry maps a text column to the integer code of every expected category.
# NOTE: Series.map() yields NaN for any value that is missing from its
# dictionary, so an unexpected category shows up as a reduced non-null count
# in the info() report printed at the end of this block.
category_codes = {
    'Frequent Flyer': {'Yes': 1, 'No': 0},
    'Type of Travel': {'Personal Travel': 1, 'Business travel': 0},
    'satisfaction': {'satisfied': 1, 'neutral or dissatisfied': 0},
    'Gender': {'Male': 0, 'Female': 1},
    'Class': {'Eco': 0, 'Eco Plus': 1, 'Business': 2},
}
for column, codes in category_codes.items():
    dataset[column] = dataset[column].map(codes)

# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the spurious "None" that print(dataset.info()) emitted.
dataset.info()

# categorical_features = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
# final_data = pd.get_dummies(dataset, columns = categorical_features)
# print(final_data.info())
# print(final_data.head(2))

# Split the frame into the target column (labels) and the predictor columns
# (features).
label_column = 'satisfaction'
Y = dataset[label_column]  # Labels
# Features: every column except the label and 'Arrival Delay in Minutes'.
X = dataset.drop(columns=[label_column, 'Arrival Delay in Minutes'])

# Sanity output: container types first, then shapes, then the feature frame.
for part in (X, Y):
    print(type(part))
for part in (X, Y):
    print(part.shape)
print(X)

# Standardise the feature columns so that each has mean 0 and variance 1.
# The scaler first learns the per-column statistics from X, then applies them
# to the same rows.
# NOTE(review): the scaler is fitted on all rows of X before the
# cross-validation further down, so rows that later serve as validation folds
# also contribute to the scaling statistics.
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

# Implementing Random Forest Classifier
# Tuning the random forest parameter 'n_estimators' and implementing
# cross-validation using Grid Search.
# The pipeline chains SMOTE oversampling ('balancing') with the forest
# ('classification'); grid keys address a step's parameter as
# '<step name>__<parameter>'.
model = Pipeline([
    ('balancing', SMOTE(random_state=101)),
    # max_features='sqrt' is what 'auto' meant for classifiers. The 'auto'
    # alias was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # raises an error instead of a (previously silenced) FutureWarning.
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])

# Candidate forest sizes evaluated by the grid search.
grid_param = {'classification__n_estimators': [100, 120, 130, 140, 150, 200]}

# 5-fold cross-validated search that ranks candidates by precision.
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='precision', cv=5)

"""
In the above GridSearchCV(), scoring parameter should be set as follows:
scoring = 'accuracy' when you want to maximize prediction accuracy
scoring = 'recall' when you want to minimize false negatives
scoring = 'precision' when you want to minimize false positives
scoring = 'f1' when you want to balance false positives and false negatives (place equal emphasis on minimizing both)
"""

# Run the cross-validated grid search on the scaled features and labels.
gd_sr.fit(X_scaled, Y)

# Hyper-parameter combination that achieved the best mean score.
best_parameters = gd_sr.best_params_
print(best_parameters)

# Mean cross-validated score of the best estimator.
best_result = gd_sr.best_score_
print(best_result)

# Feature importances of the best model's forest step, labelled with the
# feature column names and ordered from most to least important.
best_forest = gd_sr.best_estimator_.named_steps["classification"]
featimp = pd.Series(best_forest.feature_importances_, index=list(X.columns))
featimp = featimp.sort_values(ascending=False)
print(featimp)

# # Selecting features with higher significance and redefining feature set
# X_ = final_data[['OverTime','StockOptionLevel','MonthlyIncome','JobSatisfaction']]

# feature_scaler = StandardScaler()
# X_scaled_ = feature_scaler.fit_transform(X_)

# #Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# model = Pipeline([
#         ('balancing', SMOTE(random_state = 101)),
#         ('classification', RandomForestClassifier(criterion='entropy', max_features='auto', random_state=1) )
#     ])
# grid_param = {'classification__n_estimators': [200,250,300,350,400]}

# gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

# """
# In the above GridSearchCV(), scoring parameter should be set as follows:
# scoring = 'accuracy' when you want to maximize prediction accuracy
# scoring = 'recall' when you want to minimize false negatives
# scoring = 'precision' when you want to minimize false positives
# scoring = 'f1' when you want to balance false positives and false negatives (place equal emphasis on minimizing both)
# """

# gd_sr.fit(X_scaled_, Y)

# best_parameters = gd_sr.best_params_
# print(best_parameters)

# best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
# print(best_result)

   Gender Frequent Flyer  Age   Type of Travel     Class  Flight Distance  \
0    Male            Yes   13  Personal Travel  Eco Plus              460   
1    Male             No   25  Business travel  Business              235   
2  Female            Yes   26  Business travel  Business             1142   
3  Female            Yes   25  Business travel  Business              562   
4    Male            Yes   61  Business travel  Business              214   

   Inflight wifi service  Departure/Arrival time convenient  \
0                      3                                  4   
1                      3                                  2   
2                      2                                  2   
3                      2                                  5   
4                      3                                  3   

   Ease of Online booking  Gate location  Food and drink  Online boarding  \
0                       3              1               5                3   
1   