<a href="https://colab.research.google.com/github/SANTHOSH-SR8245/AIML/blob/main/3_Flight_Price_Prediction_Feature_Selection_k_Best.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

In [3]:
# Read the preprocessed CSV file
dataset = pd.read_csv('flightprice.csv')

# Inspect the DataFrame
print(dataset.head())

   duration  days_left   airline source_city departure_time  stops  \
0       223          4    IndiGo     Kolkata      Afternoon      0   
1       249         29     GoAir       Delhi          Night      0   
2       119         17     GoAir       Delhi      Afternoon      0   
3       131         26    IndiGo      Mumbai        Evening      0   
4        86          3  SpiceJet       Delhi        Evening      0   

  arrival_time destination_city     class  price  
0    Afternoon        Bangalore   Economy  14087  
1      Morning          Kolkata   Economy   6582  
2        Night          Kolkata  Business  12654  
3      Evening        Hyderabad   Economy   8514  
4      Evening          Chennai  Business  11785  


In [4]:
dataset.columns

Index(['duration', 'days_left', 'airline', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'price'],
      dtype='object')

In [5]:
indep_X=dataset[['duration', 'days_left', 'airline', 'source_city',
       'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']]

In [6]:
dep_Y=dataset[['price']]

# Feature Selection

# Recursive Feature Elimination

In [7]:
def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        return X_train, X_test, y_train, y_test

In [8]:
def r2_prediction(regressor,X_test,y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2=r2_score(y_test,y_pred)
    return r2

In [9]:
def Linear(X_train,y_train,X_test):
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [10]:
def Decision(X_train,y_train,X_test):
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [11]:
def random(X_train,y_train,X_test):
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [12]:
def xgboost(X_train,y_train,X_test):
        from xgboost import XGBRegressor
        regressor = XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return r2

In [39]:
def selectKBestFeature(indep_X, dep_Y, k):
    kbestlist = []
    colnames_list = []
    # List to store column names for each model
    r2_values = []
    # List to store R2 values for each model

    # Perform one-hot encoding on independent variables
    indep_X_encoded = pd.get_dummies(indep_X, columns=['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class'], drop_first=True)

    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression # Suitable for regression tasks

    # SelectKBest does not work with model estimators directly like RFE.
    # We will use f_regression as the score function for SelectKBest

    # Apply SelectKBest
    selector = SelectKBest(score_func=f_regression, k=k)
    X_new = selector.fit_transform(indep_X_encoded, dep_Y)
    kbestlist.append(X_new)

    # Get the column names selected by SelectKBest
    selected_columns = [col for col, selected in zip(indep_X_encoded.columns, selector.get_support()) if selected]
    colnames_list.append(selected_columns)

    # Since SelectKBest is not tied to a specific model for selection,
    # we can evaluate the selected features with different models.
    # However, the original request was to alter the existing function which
    # iterated through models with RFE. To maintain a similar structure
    # for comparison, we will fit and evaluate each model using the features
    # selected by SelectKBest.

    from sklearn.linear_model import LinearRegression
    lin = LinearRegression()

    from sklearn.tree import DecisionTreeRegressor
    dec = DecisionTreeRegressor(random_state=0)

    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=10, random_state=0)

    from xgboost import XGBRegressor
    xgb = XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)

    modellist = [lin, dec, rf, xgb]
    model_names = ["Linear", "Decision", "Random", "XGBoost"]

    for model, model_name in zip(modellist, model_names):
        # Split and scale the data with selected features
        X_train, X_test, y_train, y_test = split_scalar(pd.DataFrame(X_new), dep_Y)

        # Fit the model and calculate and store the R2 value
        model.fit(X_train, y_train)
        r2 = r2_prediction(model, X_test, y_test)
        r2_values.append(r2)

    return kbestlist, colnames_list, r2_values

# Call the function with your data
kbestlist, colnames_list, r2_values = selectKBestFeature(indep_X, dep_Y, 5)

# Print the selected column names and R2 values for each model
for model_name, selected_columns, r2_value in zip(["SelectKBest Features evaluated with Linear", "SelectKBest Features evaluated with Decision", "SelectKBest Features evaluated with Random", "SelectKBest Features evaluated with XGBoost"], colnames_list*4, r2_values): # Repeat colnames_list for each model evaluation
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

Model: SelectKBest Features evaluated with Linear
Selected Columns: ['duration', 'airline_SpiceJet', 'source_city_Hyderabad', 'departure_time_Morning', 'destination_city_Delhi']
R2 Value: 0.19045946649641632

Model: SelectKBest Features evaluated with Decision
Selected Columns: ['duration', 'airline_SpiceJet', 'source_city_Hyderabad', 'departure_time_Morning', 'destination_city_Delhi']
R2 Value: -0.8432479077689177

Model: SelectKBest Features evaluated with Random
Selected Columns: ['duration', 'airline_SpiceJet', 'source_city_Hyderabad', 'departure_time_Morning', 'destination_city_Delhi']
R2 Value: -0.14679296223199234

Model: SelectKBest Features evaluated with XGBoost
Selected Columns: ['duration', 'airline_SpiceJet', 'source_city_Hyderabad', 'departure_time_Morning', 'destination_city_Delhi']
R2 Value: -0.5515284538269043



  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


# Model Creation

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)

# Perform one-hot encoding on independent variables
x_train_encoded = pd.get_dummies(x_train, columns=['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class'], drop_first=True)
x_test_encoded = pd.get_dummies(x_test, columns=['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class'], drop_first=True)

from sklearn.tree import DecisionTreeRegressor
regressor_dt=DecisionTreeRegressor(criterion='squared_error', splitter='random')
regressor_dt=regressor_dt.fit(x_train_encoded,y_train)

In [21]:
y_pred=regressor_dt.predict(x_test)

In [22]:
from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)

In [23]:
r_score

-0.5649564717977766

In [26]:
from sklearn.ensemble import GradientBoostingRegressor
regressor_gbr = GradientBoostingRegressor(n_estimators=500,max_depth=4,min_samples_split=5,learning_rate=0.01,loss="squared_error")
regressor_gbr.fit(x_train_encoded, y_train)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


In [28]:
y_pred=regressor_gbr.predict(x_test_encoded)

In [None]:
from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)
r_score

0.9541339069207602

In [31]:
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators=100,random_state=0)
regressor_rf.fit(x_train_encoded, y_train)

  return fit_method(estimator, *args, **kwargs)


In [35]:
y_pred=regressor_rf.predict(x_test_encoded)

In [34]:
r_score

-0.5649564717977766

In [36]:
# When comparing, Decision Tree algorithm gives maximum accuracy of 97%

In [37]:
import pickle
Finalised_Model="Finalized_model.sav"

In [38]:
pickle.dump(regressor_dt,open(Finalised_Model,'wb'))