In [1]:
import pandas as pd 
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import joblib 
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from feature_engine.datetime import DatetimeFeatures


from sklearn.svm import SVR

from xgboost import XGBRegressor

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ridge_regression
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import learning_curve

In [3]:
# Show every column when a wide DataFrame is displayed (None removes the cap).
pd.options.display.max_columns = None

In [4]:
# Ask scikit-learn transformers to return their default output container
# (the preprocessor output displayed further down is a NumPy array).
sklearn.set_config(transform_output="default")

In [5]:
# Load the pre-split train / validation / test sets, in that order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/val.csv', 'data/test.csv')
)

In [6]:
# Sanity check: (rows, columns) of the training split.
train_df.shape

(640, 10)

In [7]:
# Peek at the first rows to confirm the column names and value formats.
train_df.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Pakistan International Airlines,2019-06-21,Peshawar,Sukkur,10:20:00,11:50:00,90,0.0,In-flight meal not included,4995
1,AIR Blue,2019-05-18,Islamabad,Rahim Yar Khan,09:00:00,07:40:00,1360,1.0,No Info,8372
2,AIR Blue,2019-06-12,Quetta,Lahore,09:10:00,11:05:00,1555,2.0,No Info,6117
3,Pearl Air,2019-04-01,Quetta,Lahore,20:20:00,22:55:00,1595,1.0,No Info,7770
4,Pearl Air,2019-06-06,Quetta,Lahore,17:00:00,10:45:00,1065,1.0,No Info,9187


In [8]:
def spliting(train_df, target=None):
    """Split a DataFrame into a feature frame and a target series.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    target : str, optional
        Name of the target column. When omitted, the LAST column is taken as
        the target and every other column as a feature (the original,
        position-based behaviour).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without the target column and
        ``y`` is the target column as a Series.

    Raises
    ------
    KeyError
        If ``target`` is given but is not a column of ``train_df``.
    """
    if target is None:
        # Positional split: relies on the target being the final column.
        X_train = train_df.iloc[:, :-1]
        y_train = train_df.iloc[:, -1]
        return X_train, y_train

    if target not in train_df.columns:
        raise KeyError(f"target column {target!r} not found in the DataFrame")

    # Name-based split: robust to the target not being the last column.
    X_train = train_df.drop(columns=target)
    y_train = train_df[target]
    return X_train, y_train

In [9]:
# Fit on the train and validation rows stacked together; the test split is
# kept separate for the final evaluation.
data = pd.concat((train_df, val_df), axis="index")
X_train, y_train = spliting(data)
X_test, y_test = spliting(test_df)

In [10]:
# Column groups, named by the kind of preprocessing each one receives.
dt_cols = [
    'date_of_journey',
    'dep_time',
    'arrival_time',
]  # date / time-of-day columns
int_cols = [
    'duration',
    'total_stops',
]  # numeric columns
st_cols = [
    'airline',
    'source',
    'destination',
    'additional_info',
]  # string (categorical) columns


In [11]:
# Numeric columns: fill missing values with the median, then standardise.
int_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ('scalar', StandardScaler()),
])

# String columns: fill missing values with the most frequent value, one-hot
# encode, then scale the sparse indicator columns (with_mean=False keeps the
# matrix sparse because no centring is applied).
st_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": a category that was not present during fit is
    # encoded as all zeros instead of raising an error at transform time,
    # which would otherwise break scoring/prediction on unseen data.
    ('encoder', OneHotEncoder(sparse_output=True, handle_unknown="ignore")),
    ('scalar', StandardScaler(with_mean=False)),
])

In [12]:
# Journey date: impute with the most frequent value, derive calendar features
# (month, year, day of week, day of month), then standardise them.
dt_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        (
            "Date_extraction",
            DatetimeFeatures(
                features_to_extract=['month', 'year', 'day_of_week', 'day_of_month'],
                format='mixed',
            ),
        ),
        ('scalar', StandardScaler()),
    ]
)

# Departure / arrival times: same recipe, but only the hour and minute parts
# are extracted.
ms_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        (
            "Date_extraction",
            DatetimeFeatures(
                features_to_extract=['hour', 'minute'],
                format='mixed',
            ),
        ),
        ('scalar', StandardScaler()),
    ]
)


In [13]:
# Route each column group to its own pipeline. No `remainder` is given, so
# columns that are not listed below fall under ColumnTransformer's default
# remainder handling.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', int_transformer, int_cols),                    # numeric features
        ('str', st_transformer, st_cols),                      # categorical features
        ('dat', dt_transformer, ['date_of_journey']),          # calendar features
        ('time', ms_transformer, ['dep_time', 'arrival_time']),  # clock features
    ]
)

In [19]:
# Smoke test: fit the preprocessor on the training features and display the
# transformed matrix to confirm every branch runs end to end.
preprocessor.fit_transform(X_train)

array([[-1.09640877, -1.22355058,  0.        , ..., -0.16828778,
        -0.33178455,  1.49979843],
       [ 1.45958065,  0.30588765,  2.7816035 , ..., -1.25620881,
        -0.91578376,  0.89762089],
       [ 1.85203572,  1.83532587,  2.7816035 , ..., -0.7122483 ,
        -0.33178455, -1.21000049],
       ...,
       [-0.22093208,  0.30588765,  0.        , ...,  1.46359375,
        -0.18578475,  0.59653212],
       [ 1.85203572,  0.30588765,  0.        , ...,  1.46359375,
        -1.93778238,  1.19870966],
       [-0.92533861, -1.22355058,  0.        , ...,  0.10369247,
         1.12821347, -0.30673418]])

In [14]:
# Candidate regressors to compare, keyed by the label used when reporting.
# The tree ensembles are seeded so that re-running the notebook reproduces
# the same scores (an unseeded random forest changes from run to run).
algorithms = {
    "Linear": LinearRegression(),
    "SVR": SVR(),
    'Random_forest': RandomForestRegressor(n_estimators=100, random_state=42),
    "xgb": XGBRegressor(n_estimators=100, random_state=42),
}

In [15]:
    
# Fit every candidate behind the shared preprocessor and report its score on
# the training data and on the held-out test data.
# i is the algorithm label, j the estimator instance.
for i, j in algorithms.items():
    model = Pipeline(steps=[
        ('Pre', preprocessor),
        ('Model', j),
    ])
    model.fit(X_train, y_train)
    print(f"The score of the {i} is {model.score(X_train, y_train)}")
    # The label now includes the algorithm name, which was missing before.
    print(f"The test score of the {i} is {model.score(X_test, y_test)}")

The score of the Linear is 0.6908295316504267
The test score of the  is 0.7106316023957702
The score of the SVR is -0.016534799466854988
The test score of the  is 0.009493551775098696
The score of the Random_forest is 0.9639993999373503
The test score of the  is 0.7187317404924431
The score of the xgb is 0.9975748062133789
The test score of the  is 0.580072283744812


In [20]:
# The random forest has the best held-out test score in the comparison above
# (xgb scores highest on the training data but lowest of the tree models on
# the test data), so the random forest is the model we keep.
model1 = Pipeline(steps=[
    ('Pre', preprocessor),
    # Seeded so the persisted model can be reproduced exactly.
    ('Model', RandomForestRegressor(random_state=42)),
])

model1.fit(X_train, y_train)
# Label the two numbers explicitly; they were previously printed with the
# same, incomplete message.
print(f"The train score of the Random_forest is {model1.score(X_train, y_train)}")
print(f"The test score of the Random_forest is {model1.score(X_test, y_test)}")

The score of the  is 0.9659447583928306
The score of the  is 0.7211804976676646


In [21]:
# Persist the fitted pipeline (preprocessing + model) to disk with joblib.
joblib.dump(model1,'Model.joblib')

['Model.joblib']

In [22]:
# import numpy as np
# import pandas as pd
# import sklearn
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.linear_model import LinearRegression
# from sklearn.svm import SVR
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import r2_score
# from sklearn.model_selection import learning_curve

# from feature_engine.datetime import DatetimeFeatures

# from xgboost import XGBRegressor

# import joblib

# import matplotlib.pyplot as plt

# train_df = pd.read_csv("data/train.csv")
# val_df = pd.read_csv("data/val.csv")
# test_df = pd.read_csv("data/test.csv")

# def split_data(data):
# 	X = data.drop(columns="price")
# 	y = data.price.copy()
# 	return (X, y)

# X_train, y_train = split_data(train_df)
# X_val, y_val = split_data(val_df)
# X_test, y_test = split_data(test_df)

# dt_cols = ["date_of_journey", "dep_time", "arrival_time"]

# num_cols = ["duration", "total_stops"]

# cat_cols = [col for col in X_train.columns if (col not in dt_cols) and (col not in num_cols)]

# num_transformer = Pipeline(steps=[
# 	("imputer", SimpleImputer(strategy="median")),
# 	("scaler", StandardScaler())
# ])

# cat_transformer = Pipeline(steps=[
# 	("imputer", SimpleImputer(strategy="most_frequent")),
# 	("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
# ])

# doj_transformer = Pipeline(steps=[
# 	("imputer", SimpleImputer(strategy="most_frequent")),
# 	("extractor", DatetimeFeatures(features_to_extract=["month", "week", "day_of_week", "day_of_month"], format="mixed")),
# 	("scaler", StandardScaler())
# ])

# time_transformer = Pipeline(steps=[
# 	("imputer", SimpleImputer(strategy="most_frequent")),
# 	("extractor", DatetimeFeatures(features_to_extract=["hour", "minute"], format="mixed")),
# 	("scaler", StandardScaler())
# ])

# preprocessor = ColumnTransformer(transformers=[
# 	("num", num_transformer, num_cols),
# 	("cat", cat_transformer, cat_cols),
# 	("doj", doj_transformer, ["date_of_journey"]),
# 	("time", time_transformer, ["dep_time", "arrival_time"])
# ])

# preprocessor.fit_transform(X_train)

# algorithms={
#     "Linear":LinearRegression(),
#     "SVR":SVR(),
#     'Random_forest':RandomForestRegressor(n_estimators=100),
#     "xgb":XGBRegressor(n_estimators=100)
# }
    
# for algorithm_name, algorithm in algorithms.items():     
#     model = Pipeline(steps=[
#         ('Pre', preprocessor),
#         ('Model', algorithm)
#     ])
#     model.fit(X_train, y_train)
#     print(f"The score of the {algorithm_name} is {model.score(X_train, y_train)}")