In [26]:
import numpy as np
import pandas as pd

In [30]:
df = pd.read_csv('transformed_data.csv')
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,day,month,Duration_min
0,3,0,7,0,8,3897.0,24,3,170
1,1,5,0,2,8,7662.0,1,5,445
2,4,3,2,2,8,13882.0,9,6,1140
3,3,5,0,1,8,6218.0,12,5,325
4,3,0,7,1,8,13302.0,1,3,285


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10462 entries, 0 to 10461
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          10462 non-null  int64  
 1   Source           10462 non-null  int64  
 2   Destination      10462 non-null  int64  
 3   Total_Stops      10462 non-null  int64  
 4   Additional_Info  10462 non-null  int64  
 5   Price            10462 non-null  float64
 6   day              10462 non-null  int64  
 7   month            10462 non-null  int64  
 8   Duration_min     10462 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 735.7 KB


In [34]:
from sklearn.model_selection import train_test_split

X = df.drop('Price', axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(8369, 8) (8369,) (2093, 8) (2093,)


In [51]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [40]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [52]:
models = [
    ['LinearRegression : ', LinearRegression()],
    ['Lasso : ', Lasso()],
    ['Ridge : ', Ridge()],
    ['ElasticNet :', ElasticNet()],
    ['KNeighborsRegressor : ', KNeighborsRegressor()],
    ['DecisionTreeRegressor : ', DecisionTreeRegressor()],
    ['RandomForestRegressor : ', RandomForestRegressor()],
    ['GradientBoostingRegressor : ', GradientBoostingRegressor()],
    ['XGBRegressor : ', XGBRegressor()],
    ['SVR : ', SVR()],
    ['AdaBoostRegressor : ', AdaBoostRegressor()]
]

In [53]:
for name, model in models:
    model=model
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name, (np.sqrt(r2_score(y_test, predictions))))

LinearRegression :  0.6633862466699987
Lasso :  0.663387586655805
Ridge :  0.6633890042293565
ElasticNet : 0.6075393258479974
KNeighborsRegressor :  0.7573152210707371
DecisionTreeRegressor :  0.8694715942992624
RandomForestRegressor :  0.9137567672365122
GradientBoostingRegressor :  0.895835960621256
XGBRegressor :  0.924780056687072
SVR :  0.40858493968202075
AdaBoostRegressor :  0.7135097574327856


In [55]:
algorithms = {
    'XGBRegressor' : {
        'model' : XGBRegressor(),
        'param' : {
            'learning_rate' : [0.1, 0.3, 0.5, 0.25],
            'max_depth' : [3, 5, 7],
            'gamma' : [0.1, 0.3, 0.5],
            'min_child_weight' : [3, 5, 7, 9],
        }
    },
    'RandomForestRegressor' : {
        'model' : RandomForestRegressor(),
        'param' : {
            'n_estimators' : [1000, 300, 500],
            'max_depth' : [9, 5, 7],
            'min_samples_split' : [2, 4, 6, 8]
        }
    },
    'GradientBoostingRegressor' : {
        'model' : GradientBoostingRegressor(),
        'param' : {
            'learning_rate' : [0.5, 0.1, 0.20, 0.30],
            'n_estimators' : [300, 500, 1000],
        }
    }
}

from sklearn.model_selection import GridSearchCV

In [56]:
score = []

for name, mp in algorithms.items() :
    rs = GridSearchCV(estimator = mp['model'], param_grid = mp['param'], cv = 10, n_jobs=-1, verbose=3)
    rs.fit(X_train, y_train)
    score.append({
        'model': name,
        'score' : rs.best_score_,
        'params' : rs.best_params_
    })

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits


In [57]:
final = pd.DataFrame(score, columns=['model', 'score', 'params'])
final

Unnamed: 0,model,score,params
0,XGBRegressor,0.858842,"{'gamma': 0.1, 'learning_rate': 0.3, 'max_dept..."
1,RandomForestRegressor,0.815423,"{'max_depth': 9, 'min_samples_split': 4, 'n_es..."
2,GradientBoostingRegressor,0.851813,"{'learning_rate': 0.3, 'n_estimators': 1000}"


In [61]:
for i in final.iloc[0, :]:
    print(i)

XGBRegressor
0.8588417530089341
{'gamma': 0.1, 'learning_rate': 0.3, 'max_depth': 7, 'min_child_weight': 9}
