In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the flight-price dataset; `price` is the regression target.
flights = pd.read_csv('data/flight_prices.csv')

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible. `drop` already returns a new frame, so `flights` itself is
# left untouched without needing an explicit copy.
X_train, X_test, y_train, y_test = train_test_split(
    flights.drop(columns='price'),
    flights['price'],
    test_size=0.2,
    random_state=43,
)


In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
flights.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


# Exercise
Using the code below as a starting point, see if you can beat the RMSE it reports. Consider enhancements such as:

* feature engineering
* regularization
* additional pre-processing.

In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


# Feature columns, grouped by the preprocessing they receive.
num_attribs = ['duration', 'days_left']
cat_attribs = [
    'airline', 'flight', 'source_city', 'departure_time',
    'stops', 'arrival_time', 'destination_city', 'class',
]

# Numeric features: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Numeric columns go through num_pipeline; categorical columns are one-hot
# encoded, and categories not seen during fit are ignored at predict time.
feature_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# Preprocessing and ordinary least squares chained into one estimator.
regression = Pipeline([
    ("preprocessor", feature_pipeline),
    ("regression", LinearRegression()),
])

# Fit on the training split and report RMSE on the held-out split.
regression.fit(X_train, y_train)
predictions = regression.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'RMSE: {np.sqrt(mse)}')


RMSE: 6189.550221198623


In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge

# Categorical columns that get one-hot encoded.
cat_attribs = ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

# Encode a copy rather than `flights` itself: overwriting the raw string columns
# in place changed the data seen by every other cell and made this cell
# non-idempotent.
flights_encoded = flights.copy()

# Label-encode every text column. Columns listed in cat_attribs are one-hot
# encoded right after, so this step only matters for text columns outside that list.
label_encoder = LabelEncoder()
for col in flights_encoded.columns:
    if flights_encoded[col].dtype == 'object':
        flights_encoded[col] = label_encoder.fit_transform(flights_encoded[col])

# One-hot encode the categorical columns.
one_hot_encoded = pd.get_dummies(flights_encoded, columns=cat_attribs)

# Separate the features from the target.
X = one_hot_encoded.drop(['price'], axis=1)
y = one_hot_encoded['price']

# Use the same split parameters as the baseline (test_size=0.2, random_state=43)
# so the scores below are comparable with the baseline RMSE, and so the shared
# names y_train / y_test are not replaced by a split of a different size.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

# Scale features to [0, 1]. The scaler is fitted on the training split only and
# then applied unchanged to the test split; refitting it on the test data would
# scale the two splits inconsistently and leak test-set information.
scaler = MinMaxScaler(feature_range=(0, 1))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Models to compare: plain least squares vs. L2-regularised (Ridge) regression.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge()
}

# One row of evaluation metrics per model.
results = []

for model_name, model in models.items():
    # Fit on the training split.
    model.fit(x_train, y_train)

    # Predict on the held-out split.
    y_pred = model.predict(x_test)

    # Regression metrics. RMSE is derived from MSE directly because the
    # `squared=False` argument of mean_squared_error is deprecated and has been
    # removed in recent scikit-learn releases.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    # Print the evaluation metrics.
    print(f"Model: {model_name}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R2 Score: {r2}")
    print("-------------------------------------")

    # Store the results.
    results.append({
        "Model Name": model_name,
        "Mean Absolute Error (MAE)": mae,
        "Mean Squared Error (MSE)": mse,
        "Root Mean Squared Error (RMSE)": rmse,
        "R2 Score": r2
    })

# Collect the per-model metrics into a single table.
results_df = pd.DataFrame(results)
print(results_df)


: 