# In-depth Analysis and Prediction of Hotel Demand
Aknur Kassym
Wei Kuo


## 0 - Import, clean, and basic information

In [2]:
# libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# %pip install plotly
import plotly.express as px

# %pip install folium
import folium
from folium.plugins import HeatMap

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# importing the data
# filepath = "hotel_bookings.csv"
# filepath = "/content/sample_data/hotel_bookings.csv"
filepath = "hotel_bookings.csv"

df = pd.read_csv(filepath)

df.head(10)
# df.info()
# df.describe()
# df.isnull().sum()

# show the columns with null values and the sum of null values
null_list = df.columns[df.isnull().any()].tolist()
for i in null_list:
    print(i, df[i].isnull().sum())

children 4
country 488
agent 16340
company 112593


In [5]:
# cleaning the data
# drop the agent, and company columns
df = df.drop(['agent', 'company'], axis=1)

# replace null values in country with 'other country'
df['country'] = df['country'].fillna('other country')

# drop the rows which childern is null
df = df.dropna(subset=['children'])

# check the null values again
null_list = df.columns[df.isnull().any()].tolist()
for i in null_list:
    print(i, df[i].isnull().sum())

In [6]:
# basic information
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119386 entries, 0 to 119389
Data columns (total 30 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119386 non-null  object 
 1   is_canceled                     119386 non-null  int64  
 2   lead_time                       119386 non-null  int64  
 3   arrival_date_year               119386 non-null  int64  
 4   arrival_date_month              119386 non-null  object 
 5   arrival_date_week_number        119386 non-null  int64  
 6   arrival_date_day_of_month       119386 non-null  int64  
 7   stays_in_weekend_nights         119386 non-null  int64  
 8   stays_in_week_nights            119386 non-null  int64  
 9   adults                          119386 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119386 non-null  int64  
 12  meal            

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,C,0,No Deposit,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,C,0,No Deposit,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,A,0,No Deposit,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,D,0,No Deposit,0,Transient,105.5,0,0,Canceled,2015-04-22


## 1 - Exploratory data analysis

In [7]:
# Where do the guests come from?
# filter out the rows where 'is_canceled' is 0
df_country = df[df['is_canceled'] == 0]
df_country.pivot_table(index=['country'], aggfunc='size').sort_values(ascending=False)

# create a pie chart to show the top 10 countries along with the legend
fig = px.pie(
    df_country,
    values=df_country['country'].value_counts().head(10).values,
    names=df_country['country'].value_counts().head(10).index,
    title='Top 10 Countries',
    template='seaborn',
    width=600,
    height=600
)

fig.show()

# df_country['country'].value_counts().head(10).plot.pie(autopct='%1.1f%%', figsize=(10, 10))
# plt.title('Top 10 Countries')
# plt.legend()
# plt.show()




In [8]:
# How much do guests pay for a room per night?

df_guest = df[df['is_canceled'] == 0]
# create a bar graph where y axis is the average adr and x axis is the reserved
# room type
fig = px.bar(
    df_guest,
    y=df_guest.groupby(['reserved_room_type'])['adr'].mean().values,
    x=df_guest.groupby(['reserved_room_type'])['adr'].mean().index,
    title='Average Price of Room Types',
    template='seaborn',
    width=600,
    height=600
)

fig.update_xaxes(title_text='Room Type')
fig.update_yaxes(title_text='Average Price ($)')

fig.show()


In [9]:
# How does the price per night vary over the year?
df_price = df[df['is_canceled'] == 0]

# Sort the months in order
ordered_months = ["January", "February", "March", "April", "May", "June",
                  "July", "August", "September", "October", "November", "December"]
df_price['arrival_date_month'] = pd.Categorical(df_price['arrival_date_month'], categories=ordered_months, ordered=True)

# Group by month, room type, and calculate average ADR
grouped_data = df_price.groupby(['arrival_date_month'])['adr'].mean().reset_index()
# grouped_data = df_price.groupby(['arrival_date_month', 'reserved_room_type'])['adr'].mean().reset_index()


fig = px.line(
    grouped_data,
    x='arrival_date_month',
    y='adr',
    hover_data=['adr'],
    # color='reserved_room_type',
    title='Average Price Per Night Over the Year',
    labels={'adr': 'Average Price', 'arrival_date_month': 'Month'},
    template='seaborn',
    width=600,
    height=400
)

fig.update_xaxes(title_text='Month')
fig.update_yaxes(title_text='Average Price')

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
# Bookings by market segment
df_booking_segments = df[df['is_canceled'] == 0]
df_booking_segments.pivot_table(index=['market_segment'], aggfunc='size').sort_values(ascending=False)


# create a pie chart to show the bookings by market segment along with the legend
fig = px.pie(
    df_country,
    values=df_country['market_segment'].value_counts().values,
    names=df_country['market_segment'].value_counts().index,
    title='Bookings by Market Segment',
    template='seaborn',
    width=600,
    height=600
)

fig.show()

In [11]:
# How long do people stay at the hotels?

df_duration = df[df['is_canceled'] == 0]

# Sort the months in order
df_duration['arrival_date_month'] = pd.Categorical(df_price['arrival_date_month'], categories=ordered_months, ordered=True)
df_duration['stays_total'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']

grouped_duration = df_duration.groupby(by=['arrival_date_month'])['stays_total'].mean().reset_index()

fig = px.line(
    grouped_duration,
    x='arrival_date_month',
    y='stays_total',
    hover_data=['stays_total'],
    title='Average stays booked for each month',
    labels={'stays_total': 'Average stays booked', 'arrival_date_month': 'Month'},
    template='seaborn',
    width=600,
    height=600,
)

fig.update_xaxes(title_text='Month')
fig.update_yaxes(title_text='Average Stays')

fig.show()





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [14]:
# How many bookings were canceled?
df_cancelled = df[df['is_canceled'] == 1]

df['arrival_date_month'] = pd.Categorical(df['arrival_date_month'], categories=ordered_months, ordered=True)

# Separate canceled bookings for different types of hotels
resort_cancelled = df_cancelled[df_cancelled['hotel'] == 'Resort Hotel']
city_hotel_cancelled = df_cancelled[df_cancelled['hotel'] == 'City Hotel']

resort_cancelled_counts = resort_cancelled['arrival_date_month'].value_counts().sort_index()
city_hotel_cancelled_counts = city_hotel_cancelled['arrival_date_month'].value_counts().sort_index()

# Combine the data into a single DataFrame for plotting
combined_cancellation_data = pd.DataFrame({
    'Month': resort_cancelled_counts.index,
    'Resort Hotel': resort_cancelled_counts.values,
    'City Hotel': city_hotel_cancelled_counts.values
})

melted_data = combined_cancellation_data.melt(id_vars='Month', var_name='Hotel', value_name='Cancellations')

fig = px.line(
    melted_data,
    x='Month',
    y='Cancellations',
    color='Hotel',
    title='Cancellations for Resort and City Hotels each month',
    labels={'Cancellations': 'Cancellations', 'Month': 'Month'},
    template='seaborn',
    width=800,
    height=600,
)

fig.update_xaxes(title_text='Month')
fig.update_yaxes(title_text='Cancellations')

fig.show()

In [13]:
# Which month has the highest number of guests?

df_guests = df[df['is_canceled'] == 0]
df_guests['guest_total'] = df_guests['adults'] + df_guests['children'] + df_guests['babies']

df['arrival_date_month'] = pd.Categorical(df['arrival_date_month'], categories=ordered_months, ordered=True)

resort_guests = df_guests[df_guests['hotel'] == 'Resort Hotel']
city_hotel_guests = df_guests[df_guests['hotel'] == 'City Hotel']

resort_guests_per_month = resort_guests.groupby('arrival_date_month')['guest_total'].sum()
city_hotel_guests_per_month = city_hotel_guests.groupby('arrival_date_month')['guest_total'].sum()

# Combine the data into a single DataFrame for plotting
combined_guests_data = pd.DataFrame({
    'Month': resort_guests_per_month.index,
    'Resort Hotel': resort_guests_per_month.values,
    'City Hotel': city_hotel_guests_per_month.values
})

melted_data = combined_guests_data.melt(id_vars='Month', var_name='Hotel', value_name='Guests')

# Plotting the line chart for guest volume per month for both hotels
fig = px.line(
    melted_data,
    x='Month',
    y='Guests',
    color='Hotel',
    title='Volume of guests for Resort and City Hotels each month',
    labels={'Guests': 'Guests', 'Month': 'Month'},
    template='seaborn',
    width=800,
    height=600,
)

fig.update_xaxes(title_text='Month')
fig.update_yaxes(title_text='Guests')

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## 2 - Prediction

Converting Categorical Columns to Numerical Values

When converting categorical columns to numerical values, it's essential to preserve the categorical information without introducing false ordinal relationships or losing the inherent meaning of the categories. Two common approaches are Label Encoding and One-Hot Encoding, each with its own considerations:

1. **Label Encoding:**
   - **What it does:** Assigns a unique numerical label to each category in a column.
   - **Usage:** Works well for ordinal categorical data (where there's an inherent order among categories).
   - **Consideration:** Might imply false ordinal relationships where none exist. For non-ordinal data, it could lead to incorrect assumptions by models.  
   

2. **One-Hot Encoding:**
   - **What it does:** Creates binary columns for each category, where each column represents a category with a 1 or 0.
   - **Usage:** Suitable for nominal categorical data (where categories have no intrinsic order).
   - **Consideration:** Increases dimensionality, potentially causing issues with computational resources if there are many unique categories.

### Part 0. Data Processing

In [15]:
# libraries
# for ML:
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [16]:
# encoding the catergorical data to numerical data for performing correlation
# analysis

# find the numerical columns
df_numerical = df.select_dtypes(include=np.number)
# numerical_columns = df_numerical.columns

# find the categorical columns
df_categorical = df.select_dtypes(exclude=np.number)
# categorical_columns = df_categorical.columns

# encode the categorical columns
encoder = LabelEncoder()
df_categorical = df_categorical.apply(encoder.fit_transform)

# combine the numerical and categorical columns
df_encoded = pd.concat([df_numerical, df_categorical], axis=1)
df_encoded.head(5)

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,0,342,2015,27,1,0,0,2,0.0,0,...,0,135,3,1,2,2,0,2,1,121
1,0,737,2015,27,1,0,0,2,0.0,0,...,0,135,3,1,2,2,0,2,1,121
2,0,7,2015,27,1,0,1,1,0.0,0,...,0,59,3,1,0,2,0,2,1,122
3,0,13,2015,27,1,0,1,1,0.0,0,...,0,59,2,0,0,0,0,2,1,122
4,0,14,2015,27,1,0,2,2,0.0,0,...,0,59,6,3,0,0,0,2,1,123


### Part 1. Cancelation

In [17]:
# find the correlations

corr = df_numerical.corr()["is_canceled"]
corr.abs().sort_values(ascending=False)[:]

is_canceled                       1.000000
lead_time                         0.293177
total_of_special_requests         0.234706
required_car_parking_spaces       0.195492
booking_changes                   0.144371
previous_cancellations            0.110140
is_repeated_guest                 0.084788
adults                            0.059990
previous_bookings_not_canceled    0.057355
days_in_waiting_list              0.054193
adr                               0.047622
babies                            0.032488
stays_in_week_nights              0.024771
arrival_date_year                 0.016732
arrival_date_week_number          0.008132
arrival_date_day_of_month         0.006084
children                          0.005048
stays_in_weekend_nights           0.001783
Name: is_canceled, dtype: float64

In [18]:
# # find the correlation between the columns
# plt.figure(figsize = (24, 12))

# corr = df_encoded.corr()
# sns.heatmap(corr, annot = True, linewidths = 1)
# plt.show()

fig = px.imshow(
    df_encoded.corr(),
    title='Correlation Heatmap',
    color_continuous_scale='RdBu',
    # on each cell, show the correlation value
    zmin=-0.5, zmax=1,
    template='seaborn',
    width=1000,
    height=1000
)
fig.show()


In [35]:
cancel_corr = df_encoded.corr()['is_canceled']
cancel_corr.abs().sort_values(ascending = False)[:13]

is_canceled                    1.000000
reservation_status             0.917191
deposit_type                   0.468665
lead_time                      0.293177
country                        0.264194
total_of_special_requests      0.234706
required_car_parking_spaces    0.195492
assigned_room_type             0.176025
distribution_channel           0.167544
reservation_status_date        0.162077
booking_changes                0.144371
hotel                          0.136505
previous_cancellations         0.110140
Name: is_canceled, dtype: float64

In [43]:
from sklearn.preprocessing import StandardScaler

# preserve the features that are correlated with cancelation
# select 10 features with the highest correlation apart from is_cancelled and reservation_status
features = cancel_corr.abs().sort_values(ascending = False)[3:13]


# create a new dataframe with the selected features
df_selected = df_encoded[features.index]

# normalizing the data
scaler = StandardScaler()
df_selected_normalized = scaler.fit_transform(df_selected)

# convert df_selected to np array
df_selected_normalized = scaler.fit_transform(df_selected)

In [44]:
# split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    df_selected_normalized,
    df_encoded['is_canceled'],
    test_size=0.3,
    shuffle=True,
    random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape
model_scores = {}

#### a. Logistic Regression

In [45]:
# 1. Logistic Regression

logistic_model = LogisticRegression(max_iter=2000)
logistic_model.fit(X_train, y_train)
logistic_prediction = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_test, logistic_prediction)
print('Logistic Regression Accuracy: ', logistic_accuracy)

model_scores['Logistic Regression'] = logistic_accuracy


Logistic Regression Accuracy:  0.7556678579405852


#### b. Decision Tree

In [46]:
# 2. Decision Tree
decision_tree_model = DecisionTreeClassifier(max_depth=20, random_state=42)
decision_tree_model.fit(X_train, y_train)
decision_tree_prediction = decision_tree_model.predict(X_test)
decision_tree_accuracy = accuracy_score(y_test, decision_tree_prediction)
print('Decision Tree Accuracy: ', decision_tree_accuracy)

model_scores['Decision Tree'] = decision_tree_accuracy

Decision Tree Accuracy:  0.8509883850792942


#### c. k-Nearest Neighbors

In [47]:
# 3. k-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
knn_prediction = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_prediction)
print('k-Nearest Neighbors Accuracy: ', knn_accuracy)

model_scores['k-Nearest Neighbors'] = knn_accuracy

k-Nearest Neighbors Accuracy:  0.8500111681929864


#### d. Neural Networks

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


In [26]:
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc_model = nn.Sequential(
            nn.Linear(10, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, 2),
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        # make sure input tensor is flattened
        x = x.view(x.shape[0], -1)

        # # adding dropout layer
        # x = F.dropout(x, p=0.2)
        x = self.fc_model(x)

        return x


In [27]:
def train(train_loader, test_loader, epochs, learning_rate):
    # create the model
    model = Classifier()
    criterion = torch.nn.CrossEntropyLoss()
    # define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # initialize variables to monitor training and test loss
    train_losses = []
    test_losses = []
    test_accuracy = []


    for epoch in range(epochs):

        # initialize variables to monitor training and test loss
        train_loss = 0.0
        test_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0

        # train the model
        model.train()
        for data, target in train_loader:
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update training loss
            train_loss += loss.item() * data.size(0)

        # validate the model
        model.eval()
        for data, target in test_loader:
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # update average validation loss
            test_loss += loss.item() * data.size(0)

            # calculate the accuracy
            _, predicted = torch.max(output.data, 1)
            total_predictions += target.size(0)
            correct_predictions += (predicted == target).sum().item()


        # calculate average losses
        train_loss = train_loss / len(train_loader.dataset)
        test_loss = test_loss / len(test_loader.dataset)

        # append training and test loss
        train_losses.append(train_loss)
        test_losses.append(test_loss)

        # calculate the accuracy
        accuracy = 100 * correct_predictions / total_predictions
        test_accuracy.append(accuracy)

        # print training/validation and accuracy statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f} \tAccuracy: {:.6f}'.format(
            epoch + 1,
            train_loss,
            test_loss,
            accuracy
        ))
    print('\n')
    return train_losses, test_losses, test_accuracy

In [28]:
# 4. Neural Network
# normalizing the data
scaler = StandardScaler()
df_selected_normalized = scaler.fit_transform(df_selected)

# split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    df_selected_normalized,
    df_encoded['is_canceled'],
    test_size=0.3,
    shuffle=True,
    random_state=42
)

train_data = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train.values).long())
test_data = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test.values).long())

# parameters
batch_size = [16, 64]
learning_rates = [0.1, 0.5]
epochs = 10

# track losses
train_losses_history = {}
test_losses_history = {}
test_accuracy_history = {}

for batch in batch_size:
    for learning_rate in learning_rates:
        print('batch size: ', batch, 'learning rate: ', learning_rate, 'epochs: ', epochs)
        train_loader = DataLoader(train_data, batch_size=batch, shuffle=True)
        test_loader = DataLoader(test_data, batch_size=batch)

        train_losses, test_losses, test_accuracy = train(train_loader, test_loader, epochs, learning_rate)

        train_losses_history['batch size: ' + str(batch) + ' learning rate: ' + str(learning_rate)] = train_losses
        test_losses_history['batch size: ' + str(batch) + ' learning rate: ' + str(learning_rate)] = test_losses
        test_accuracy_history['batch size: ' + str(batch) + ' learning rate: ' + str(learning_rate)] = test_accuracy

batch size:  16 learning rate:  0.1 epochs:  10
Epoch: 1 	Training Loss: 0.661788 	Test Loss: 0.630320 	Accuracy: 62.787581
Epoch: 2 	Training Loss: 0.685978 	Test Loss: 0.660656 	Accuracy: 62.787581
Epoch: 3 	Training Loss: 0.663993 	Test Loss: 0.664276 	Accuracy: 62.787581
Epoch: 4 	Training Loss: 0.664693 	Test Loss: 0.660725 	Accuracy: 62.787581
Epoch: 5 	Training Loss: 0.664346 	Test Loss: 0.660488 	Accuracy: 62.787581
Epoch: 6 	Training Loss: 0.664525 	Test Loss: 0.662799 	Accuracy: 62.787581
Epoch: 7 	Training Loss: 0.664143 	Test Loss: 0.680539 	Accuracy: 62.787581
Epoch: 8 	Training Loss: 0.665089 	Test Loss: 0.675236 	Accuracy: 62.787581
Epoch: 9 	Training Loss: 0.664404 	Test Loss: 0.660077 	Accuracy: 62.787581
Epoch: 10 	Training Loss: 0.664436 	Test Loss: 0.661598 	Accuracy: 62.787581


batch size:  16 learning rate:  0.5 epochs:  10
Epoch: 1 	Training Loss: 1.532313 	Test Loss: 0.714179 	Accuracy: 37.212419
Epoch: 2 	Training Loss: 0.682157 	Test Loss: 0.687956 	Accuracy:

In [29]:
# compare the accuracy of each model
model_scores_df = pd.DataFrame(model_scores, index=['Accuracy']).transpose()


fig = px.bar(
    model_scores_df,
    y='Accuracy',
    x=model_scores_df.index,
    title='Accuracy of Each Model',
    template='seaborn',
    width=600,
    height=400
)

# change the scale of y axis to 0.5 to 1
fig.update_yaxes(range=[0.5, 1])
fig.update_xaxes(title_text='Model')

fig.show()

In [30]:
def plot(train_loss, test_loss, test_accuracy):
    # Create a DataFrame combining the data for Plotly Express
    data = pd.DataFrame({
        'Epoch': range(1, len(train_loss) + 1),
        'Training Loss': train_loss,
        'Test Loss': test_loss,
        # 'Test Accuracy': test_accuracy
    })

    # Melt the DataFrame to have a variable column for plotly express
    melted_data = data.melt(id_vars='Epoch', var_name='Metric', value_name='Value')

    # Create the plot using Plotly Express
    fig = px.line(
        melted_data,
        x='Epoch',
        y='Value',
        color='Metric',
        labels={'Epoch': 'Epoch', 'Value': 'Value', 'Metric': 'Metric'},
        title='Training Loss, Test Lost, Accuracy',
        template='plotly_dark',
        width=800,
        height=400
    )

    # Center the title
    fig.update_layout(title_x=0.5)

    fig.show()



In [31]:
# graph the train loss and test loss
for batch in batch_size:
    for learning_rate in learning_rates:
        train_loss = train_losses_history['batch size: ' + str(batch) + ' learning rate: ' + str(learning_rate)]
        test_loss = test_losses_history['batch size: ' + str(batch) + ' learning rate: ' + str(learning_rate)]
        test_accuracy = test_accuracy_history['batch size: ' + str(batch) + ' learning rate: ' + str(learning_rate)]

        fail_epoch = []

        for i in range(len(train_loss)):
            if train_loss[i] > 1:
                fail_epoch.append(i)

        # drop the fail epoch from train_loss, test_loss, test_accuracy
        train_loss = np.delete(train_loss, fail_epoch)
        test_loss = np.delete(test_loss, fail_epoch)
        test_accuracy = np.delete(test_accuracy, fail_epoch)

        plot(train_loss, test_loss, test_accuracy)