In [267]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [268]:
df = pd.read_csv('oppe4.csv')

In [269]:
df.shape

(9450, 13)

In [270]:
df['Price'].mean().round(3)

np.float64(9027.896)

In [271]:
df['Month'].value_counts()

Month
5    3092
6    3044
3    2388
4     926
Name: count, dtype: int64

In [272]:
df[(df['WeekDay']=='Saturday') | (df['WeekDay']=='Sunday')]['Price'].mean()

np.float64(9058.016077170418)

In [273]:
df[~((df['WeekDay']=='Saturday') | (df['WeekDay']=='Sunday'))]['Price'].mean()

np.float64(9015.219666215608)

Two of the entries in the 'Additional_Info' column are 'No info' and 'No Info'. Replace all occurrences of 'No Info' with 'No info'. How many flights fall under airline 'IndiGo' and have 'No info' as additional information?"

In [274]:
df['Additional_Info'].value_counts()

Additional_Info
No info                         7375
In-flight meal not included     1750
No check-in baggage included     298
1 Long layover                    13
Change airports                    5
No Info                            3
2 Long layover                     3
1 Short layover                    2
Business class                     1
Name: count, dtype: int64

In [275]:
df = df.replace({'No Info': 'No info'})

In [276]:
df['Additional_Info'].value_counts()

Additional_Info
No info                         7378
In-flight meal not included     1750
No check-in baggage included     298
1 Long layover                    13
Change airports                    5
2 Long layover                     3
1 Short layover                    2
Business class                     1
Name: count, dtype: int64

In [277]:
df[(df['Airline']=='IndiGo') & (df['Additional_Info']=='No info')].shape[0]

1650

Convert the values of 'Duration' into seconds. Enter the average duration (in seconds) of a flight. Enter your answer correct to two decimal places.

In [278]:
df['Duration'] = df['Duration'].str.replace('h', '*3600')
df['Duration'] = df['Duration'].str.replace('m', '*60')
df['Duration'] = df['Duration'].str.replace(' ', '+')
df['Duration'] = df['Duration'].apply(pd.eval)
df['Duration'].mean()


np.float64(38957.93650793651)

Apply the following functions to the columns Dep_Time and Arrival_Time:
Transform the values in the 'dep_time' and 'arrival_time' columns to represent the hour component. For instance, if an entry is 10:05 June 13 or 10:05, the corresponding value should be 10.

Then convert the time into four categories as follows:

5 <= hour < 12 = Morning
12 <= hour < 17 = Afternoon
17 <= hour < 20 = Evening
20 <= hour < 5 = Night

In [279]:
def time_interval(x):
    if 5<=x<12:
        return 'Morning'
    elif 12<=x<17:
        return 'Afternoon'
    elif 17<=x<20:
        return 'Evening'
    elif 20<=x<5:
        return 'Night'

In [280]:
df[['Arrival_Time', 'Dep_Time', 'Duration']].head()

Unnamed: 0,Arrival_Time,Dep_Time,Duration
0,04:25 10 Jun,20:00,95700
1,19:00 10 Jun,16:00,97200
2,21:05,19:35,5400
3,01:30 16 Jun,18:55,54600
4,17:55,17:10,30000


In [281]:
df['Dep_Time'] = df['Dep_Time'].apply(lambda x: int(str(x).split(':')[0]))
df['Arrival_Time'] = df['Arrival_Time'].apply(lambda x: int(str(x).split(':')[0].split(' ')[0]))

df['Dep_Time'] = df['Dep_Time'].apply(time_interval)
df['Arrival_Time'] = df['Arrival_Time'].apply(time_interval)


In [282]:
df[(df['Dep_Time']=='Morning') & (df['Arrival_Time']=='Evening')].shape[0]

922

In [283]:
df['WeekDay'] = df['WeekDay'].replace({'Saturday': 1, 'Sunday': 1,
                                          'Monday': 0, 'Tuesday': 0, 'Wednesday': 0, 'Thursday': 0, 'Friday': 0})
df['WeekDay'].head()

  df['WeekDay'] = df['WeekDay'].replace({'Saturday': 1, 'Sunday': 1,


0    0
1    1
2    0
3    0
4    0
Name: WeekDay, dtype: int64

In [284]:
df['WeekDay'].value_counts()

WeekDay
0    6651
1    2799
Name: count, dtype: int64

In [285]:
from sklearn.model_selection import train_test_split

X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

How many rows are in the feature matrix of the test dataset?

In [286]:
X_test.shape[0]

1890

Apply preprocessing on features of train and test datasets.
Drop the column Route.

Impute the missing values in the column Total_Stops with the MOST FREQUENT value.

Use OrdinalEncoder to encode the following categorical features:

Airline : Use default categories
Source : Use default categories
Destination : Use default categories
Additional_Info: Use default categories
Total_Stops : Use the following order for encoding:
'non-stop', '1 stop', '2 stops', '3 stops', '4 stops'
Dep_Time : Use the following order for encoding:
'Morning', 'Afternoon','Evening', 'Night'
Arrival_Time: Use the following order for encoding:
'Morning', 'Afternoon','Evening', 'Night'
If any unkonwn values occur in test dataset during tranforming it, handle unknown values by replacing with  −1 .
Use MinMaxScaler to scale Duration.

In [287]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [288]:
X_train = X_train.drop('Route', axis=1)
X_test = X_test.drop('Route', axis=1)


In [289]:
# Impute the missing values in Total_Stops
impute_tf = [('Mode Imputer', SimpleImputer(strategy='most_frequent'), ['Total_Stops'])]
impute_ct = ColumnTransformer(transformers=impute_tf, remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

# Ordinal Encoding
encode_tf = [('Ordinal Encoding', OrdinalEncoder(), ['Airline','Source','Destination','Additional_Info']),
             ('Encoding Total Stops', OrdinalEncoder(categories=[['non-stop', '1 stop', '2 stops', '3 stops', '4 stops']], handle_unknown='use_encoded_value', unknown_value=-1), ['Total_Stops']),
             ('Encoding Dep_Time', OrdinalEncoder(categories=[['Morning', 'Afternoon','Evening', 'Night']], handle_unknown='use_encoded_value', unknown_value=-1), ['Dep_Time']),
             ('Encoding Arrival_Time', OrdinalEncoder(categories=[['Morning', 'Afternoon','Evening', 'Night']], handle_unknown='use_encoded_value', unknown_value=-1), ['Arrival_Time'])]

encode_ct = ColumnTransformer(transformers=encode_tf, remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

# MinMax Scaling the Duration feature
scale_tf = [('MinMax Scaling', MinMaxScaler(), ['Duration'])]
scale_ct = ColumnTransformer(transformers=scale_tf, remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

# Pipeline
full_pipe = Pipeline(steps=[('Impute', impute_ct), ('Encode', encode_ct), ('Scale', scale_ct)]).set_output(transform='pandas')
full_pipe

0,1,2
,steps,"[('Impute', ...), ('Encode', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Mode Imputer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('Ordinal Encoding', ...), ('Encoding Total Stops', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['non-stop', '1 stop', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Morning', 'Afternoon', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Morning', 'Afternoon', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,transformers,"[('MinMax Scaling', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [290]:
X_train_tf = full_pipe.fit_transform(X_train)
X_test_tf = full_pipe.transform(X_test)

ValueError: Found unknown categories ['Business class'] in column 3 during transform

Were any unseen values encountered in the test dataset during the transformation process?

In [None]:
# Checking for unseen values
for col in ['Airline','Source','Destination','Additional_Info','Arrival_Time','Dep_Time','Total_Stops']:
  if -1 in X_test_tf[col].values:
    print('yes')

yes
yes
yes


In [None]:
X_train_tf.max().max()

np.float64(27.0)

In [None]:
X_test_tf.mean().mean()

np.float64(3.0495425726502132)

In [None]:
X['Duration'].max()

np.int64(171600)

In [None]:
df1 = pd.read_csv('modelBuilding1.csv')

In [None]:
df1.shape

(9450, 12)

Split the dataset into train dataset and test dataset in the following manner

Use train_test_split to split the dataset into train and test dataset with test size equal to  0.2(20%)  and random_state equal to 42.
columns except the last column should be the feature matrix (X_train or X_test)
last column will be the label vector

In [None]:
from sklearn.model_selection import train_test_split
X = df1.drop('11', axis=1)
y = df1['11']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

Train the ridge model on the training data with the following parameters:

In [None]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, SGDRegressor
from sklearn.model_selection import GridSearchCV
ridge = Ridge(alpha = 10,
 solver = 'saga',
 tol = 1e-4,
 random_state = 42)
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)

0.4144406794263126

In [None]:
param_grid_sgdr = {
    'penalty': ['l1','l2'],
    'alpha': [1e-5,1e-4,1e-3,1e-2,1e-1],
    'tol': [1e-4,1e-3,1e-2,1e-1]
}

grid_sgdr = GridSearchCV(SGDRegressor(random_state=42), param_grid_sgdr, cv=5, scoring='neg_mean_absolute_error')
grid_sgdr.fit(X_train, y_train)

0,1,2
,estimator,SGDRegressor(random_state=42)
,param_grid,"{'alpha': [1e-05, 0.0001, ...], 'penalty': ['l1', 'l2'], 'tol': [0.0001, 0.001, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'squared_error'
,penalty,'l1'
,alpha,0.1
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.0001
,shuffle,True
,verbose,0
,epsilon,0.1


In [None]:
grid_sgdr.best_params_

{'alpha': 0.1, 'penalty': 'l1', 'tol': 0.0001}

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
abs(grid_sgdr.score(X_test, y_test))

2830.1822633351044

In [None]:
y_pred = grid_sgdr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(round(mae,2))

2830.18


create a pipeline of the PCA() as transformer and Lasso as an estimator.

Use GridSearchCV for tuning the hyperparameters of the created pipeline on training dataset.

values of n_components for PCA to be  [0.9,0.95] 
lasso alpha value to be taken as : [10, 1, 0.01, 0.001]
scoring : neg_mean_absolute_error.
cv = 5
n_jobs = -1 (negative one) [it helps in using all the computational power to run this job]

In [None]:
from sklearn.decomposition import PCA
pipe = Pipeline([
    ('pca', PCA()),
    ('lasso', Lasso())
])
param_grid = {
    'pca__n_components': [0.9, 0.95],
    'lasso__alpha': [10,1,0.01,0.001] 
}

grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_absolute_error')
grid.fit(X_train, y_train)

0,1,2
,estimator,"Pipeline(step...o', Lasso())])"
,param_grid,"{'lasso__alpha': [10, 1, ...], 'pca__n_components': [0.9, 0.95]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,alpha,0.001
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


If we fit the pipeline on the training dataset, what will be the  R2  score on the test dataset? Enter your answer correct to three decimal places.

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, grid.predict(X_test))

0.010585851198622875

How much variance is explained by the first principle component?

In [None]:
grid.best_estimator_[0].explained_variance_

array([70.31267244,  5.07715564,  2.7325721 ,  1.54953723,  1.31167272])

create a pipeline of the PolynomialFeatures() as transformer and Lasso as an estimator with the following parameters:
For PolynomialFeatures()
interaction_only = False
degree = 4
For lasso()
alpha = 100
warm_start = True
Fit the pipeline on the training dataset and find the  R2  score on the test dataset.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
pipe=Pipeline([('poly', PolynomialFeatures(interaction_only=False, degree=4)),
                ('lasso', Lasso(alpha=100, warm_start=True))])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

  model = cd_fast.enet_coordinate_descent(


0.5240465678550417