In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the raw sales table.
sales_data = pd.read_csv(r'../Raw_Data/SalesForCourse_quizz_table.csv')

# Drop the row labelled 34866.
# NOTE(review): presumably this is the incomplete row that the null check
# `sales_data[sales_data.isnull().any(axis=1)]` surfaced -- confirm against the CSV.
# That check was a discarded expression with no effect, so it is not repeated here.
sales_data = sales_data.drop(index=34866)

# Parse "Date" into datetimes. It stays a regular column in this section
# (it is not set as the index).
sales_data['Date'] = pd.to_datetime(sales_data['Date'])

# Keep only the German rows. The explicit copy makes the result an independent frame,
# so column assignments made on it afterwards do not trigger SettingWithCopyWarning.
sales_data = sales_data.loc[sales_data['Country'] == 'Germany'].copy()

In [3]:
# Encode the non-numeric columns as numbers.

# Month name -> month number.
month_dict = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
              'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}

# Country name -> integer code.
country_dict = {'United States': 1, 'United Kingdom': 2, 'Germany': 3, 'France': 4}

# Series.map looks each value up in the dict and returns the mapped values directly.
# The dict form of Series.replace relied on implicit object -> integer downcasting,
# which pandas has deprecated. Values absent from a mapping become NaN.
# assign() builds a new frame rather than writing into the filtered frame in place.
sales_data = sales_data.assign(
    Month=sales_data['Month'].map(month_dict),
    # Each timestamp becomes its integer ordinal via Timestamp.toordinal().
    Date=sales_data['Date'].apply(lambda ts: ts.toordinal()),
    Country=sales_data['Country'].map(country_dict),
)

In [5]:
# Regression target: the "Revenue" column.
y = sales_data['Revenue']

# Predictors, with the constant (intercept) column added by statsmodels.
feature_columns = ['Unit Price', 'Unit Cost', 'Year', 'Month', 'Date']
X = sm.add_constant(sales_data[feature_columns])

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit ordinary least squares on the training rows only.
ols_spec = sm.OLS(y_train, X_train)
model = ols_spec.fit()

# Show the fitted model's summary table.
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                Revenue   R-squared:                       0.720
Model:                            OLS   Adj. R-squared:                  0.720
Method:                 Least Squares   F-statistic:                     1873.
Date:                Tue, 28 Feb 2023   Prob (F-statistic):               0.00
Time:                        23:11:58   Log-Likelihood:                -27595.
No. Observations:                3640   AIC:                         5.520e+04
Df Residuals:                    3634   BIC:                         5.524e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.074e+05   5.67e+04      1.895      0.0

## Holt's Linear Trend Model

#### Germany's Sales Data: January 1, 2015 - February 8, 2016

In [None]:
# Load the raw sales table.
sales_data = pd.read_csv(r'../Raw_Data/SalesForCourse_quizz_table.csv')

# Drop the row labelled 34866.
# NOTE(review): presumably this is the incomplete row that the null check
# `sales_data[sales_data.isnull().any(axis=1)]` surfaced -- confirm against the CSV.
# That check was a discarded expression with no effect, so it is not repeated here.
sales_data = sales_data.drop(index=34866)

# Parse "Date", make it the index, then keep only the German rows.
sales_data['Date'] = pd.to_datetime(sales_data['Date'])
sales_data = sales_data.set_index('Date')
sales_data = sales_data.loc[sales_data['Country'] == 'Germany']

# Sort chronologically in case the CSV rows are not in date order: slicing a
# DatetimeIndex by date strings is only well-defined on a monotonic index (pandas
# warns or raises otherwise). mergesort is stable, so rows sharing a date keep
# their original relative order.
sales_data = sales_data.sort_index(kind='mergesort')

# Training window: total revenue per calendar day, 2015-01-01 through 2016-02-08.
sales_sum = sales_data.loc['2015-01-01':'2016-02-08']
daily_revenue = sales_sum['Revenue'].resample('D').sum()
daily_revenue

In [None]:
# Line chart of the daily revenue series, labelled through the returned Axes.
ax = daily_revenue.plot(figsize=(12, 6))
ax.set_title("Germany's Daily Revenue for January 1, 2015 to February 8, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
plt.show()

##### Germany's Forecasted Revenue for February 15 - February 22, 2016

In [None]:
# Fit Holt's linear-trend model to the daily revenue series (fit() is called
# with no arguments).
model = Holt(daily_revenue)
model_fit = model.fit()

# Predict daily revenue for 2016-02-15 through 2016-02-22.
forecast_window = {'start': '2016-02-15', 'end': '2016-02-22'}
forecast = model_fit.predict(**forecast_window)

In [None]:
# Overlay the forecast on the daily revenue series.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(daily_revenue, label='Actual')
ax.plot(forecast, label='Forecast')
ax.set_title("Germany's Forecasted Daily Revenue in Feb 15 - Feb 22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()

In [None]:
# Actual daily revenue totals for 2016-02-15 through 2016-02-22.
# Sorting first keeps the date-label slice valid even if sales_data's DatetimeIndex is
# not in chronological order; mergesort is stable, so same-day rows keep their order.
# NOTE: this rebinds daily_revenue, replacing whatever series it held before this cell.
sales_feb = sales_data.sort_index(kind='mergesort').loc['2016-02-15':'2016-02-22']
daily_revenue = sales_feb['Revenue'].resample('D').sum()

# Round the forecast values to whole units in one vectorized step and keep them as a
# NumPy array for printing (np.asarray accepts a pandas Series or an ndarray).
forecast_rounded = np.asarray(forecast).round(0)

# Print Forecasted 'Revenue' for February 15 - February 22, 2016
print('Forecasted Revenue for February 15 - February 22, 2016 is:', forecast_rounded, '\n')

# Print Actual 'Revenue' for February 15 - February 22, 2016
print('Actual Revenue for February 15 - February 22, 2016 is:', daily_revenue.values)

In [None]:
# Compare the forecast with the observed daily revenue totals.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(forecast, label='Forecast')
ax.plot(daily_revenue, label='Actual')
# The title names the window the actuals were sliced to (Feb 15-22); it previously
# read "February 15-28".
ax.set_title("Germany's Forecasted and Actual Revenue in February 15-22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()

In [None]:
# Score the forecast against the observed daily revenue.
mae = mean_absolute_error(daily_revenue, forecast)
mse = mean_squared_error(daily_revenue, forecast)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(daily_revenue, forecast)

# Report each metric on its own line.
for metric_label, metric_value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
    ('R2:', r2),
):
    print(metric_label, metric_value)

Holt's Linear Trend Model was used to predict daily revenue for February 15 - February 22, 2016. The predicted revenue ranged from \$64,828.75 to \$65,918.40. However, the model's performance was not very good, as indicated by the large Mean Absolute Error (MAE) of \$10,506 and the large Mean Squared Error (MSE) of 256,043,002.27. The Root Mean Squared Error (RMSE) was also large, suggesting that the model is making large errors in its predictions. The R2 value was small, meaning the model explains little of the variation in actual revenue; it may need to be improved or replaced with a different model that is better suited to this dataset.

#### Germany's Sales Data: February 1, 2015 - February 28, 2015 

In [None]:
# Load the raw sales table.
sales_data = pd.read_csv(r'../Raw_Data/SalesForCourse_quizz_table.csv')

# Drop the row labelled 34866.
# NOTE(review): presumably this is the incomplete row that the null check
# `sales_data[sales_data.isnull().any(axis=1)]` surfaced -- confirm against the CSV.
# That check was a discarded expression with no effect, so it is not repeated here.
sales_data = sales_data.drop(index=34866)

# Parse "Date", make it the index, then keep only the German rows.
sales_data['Date'] = pd.to_datetime(sales_data['Date'])
sales_data = sales_data.set_index('Date')
sales_data = sales_data.loc[sales_data['Country'] == 'Germany']

# Sort chronologically in case the CSV rows are not in date order: slicing a
# DatetimeIndex by date strings is only well-defined on a monotonic index (pandas
# warns or raises otherwise). mergesort is stable, so rows sharing a date keep
# their original relative order.
sales_data = sales_data.sort_index(kind='mergesort')

# Training window: total revenue per calendar day, 2015-02-01 through 2015-02-28.
sales_sum = sales_data.loc['2015-02-01':'2015-02-28']
daily_revenue = sales_sum['Revenue'].resample('D').sum()
daily_revenue

In [None]:
# Line chart of the daily revenue series, labelled through the returned Axes.
ax = daily_revenue.plot(figsize=(12, 6))
ax.set_title("Germany's Daily Revenue for February 1, 2015 to February 28, 2015")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
plt.show()

##### Germany's Forecasted Revenue for February 15 - February 22, 2016

In [None]:
# Fit Holt's linear-trend model to the daily revenue series (fit() is called
# with no arguments).
model = Holt(daily_revenue)
model_fit = model.fit()

# Predict daily revenue for 2016-02-15 through 2016-02-22.
forecast_window = {'start': '2016-02-15', 'end': '2016-02-22'}
forecast = model_fit.predict(**forecast_window)

In [None]:
# Overlay the forecast on the daily revenue series.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(daily_revenue, label='Actual')
ax.plot(forecast, label='Forecast')
ax.set_title("Germany's Forecasted Daily Revenue in Feb 15 - Feb 22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()

In [None]:
# Actual daily revenue totals for 2016-02-15 through 2016-02-22.
# Sorting first keeps the date-label slice valid even if sales_data's DatetimeIndex is
# not in chronological order; mergesort is stable, so same-day rows keep their order.
# NOTE: this rebinds daily_revenue, replacing whatever series it held before this cell.
sales_feb = sales_data.sort_index(kind='mergesort').loc['2016-02-15':'2016-02-22']
daily_revenue = sales_feb['Revenue'].resample('D').sum()

# Round the forecast values to whole units in one vectorized step and keep them as a
# NumPy array for printing (np.asarray accepts a pandas Series or an ndarray).
forecast_rounded = np.asarray(forecast).round(0)

# Print Forecasted 'Revenue' for February 15 - February 22, 2016
print('Forecasted Revenue for February 15 - February 22, 2016 is:', forecast_rounded, '\n')

# Print Actual 'Revenue' for February 15 - February 22, 2016
print('Actual Revenue for February 15 - February 22, 2016 is:', daily_revenue.values)

In [None]:
# Compare the forecast with the observed daily revenue totals.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(forecast, label='Forecast')
ax.plot(daily_revenue, label='Actual')
# The title names the window the actuals were sliced to (Feb 15-22); it previously
# read "February 15-28".
ax.set_title("Germany's Forecasted and Actual Revenue in February 15-22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()


In [None]:
# Score the forecast against the observed daily revenue.
mae = mean_absolute_error(daily_revenue, forecast)
mse = mean_squared_error(daily_revenue, forecast)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(daily_revenue, forecast)

# Report each metric on its own line.
for metric_label, metric_value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
    ('R2:', r2),
):
    print(metric_label, metric_value)

Holt's Linear Trend Model, this time fitted only on the February 2015 data, was used to predict revenue for February 15 - February 22, 2016. The model's performance was poor: it produced a large Mean Absolute Error (MAE) of \$62,643, a Mean Squared Error (MSE) of 4.18 billion, and a Root Mean Squared Error (RMSE) of \$64,675.57. The R2 value for the model was -15.19; a negative R2 means the forecast fits the actual values worse than simply predicting their mean, so this model is performing worse than before.

## Simple Exponential Smoothing

#### Germany's Sales Data: January 1, 2015 - February 8, 2016

In [None]:
# Load the raw sales table.
sales_data = pd.read_csv(r'../Raw_Data/SalesForCourse_quizz_table.csv')

# Drop the row labelled 34866.
# NOTE(review): presumably this is the incomplete row that the null check
# `sales_data[sales_data.isnull().any(axis=1)]` surfaced -- confirm against the CSV.
# That check was a discarded expression with no effect, so it is not repeated here.
sales_data = sales_data.drop(index=34866)

# Parse "Date", make it the index, then keep only the German rows.
sales_data['Date'] = pd.to_datetime(sales_data['Date'])
sales_data = sales_data.set_index('Date')
sales_data = sales_data.loc[sales_data['Country'] == 'Germany']

# Sort chronologically in case the CSV rows are not in date order: slicing a
# DatetimeIndex by date strings is only well-defined on a monotonic index (pandas
# warns or raises otherwise). mergesort is stable, so rows sharing a date keep
# their original relative order.
sales_data = sales_data.sort_index(kind='mergesort')

# Training window: total revenue per calendar day, 2015-01-01 through 2016-02-08.
sales_sum = sales_data.loc['2015-01-01':'2016-02-08']
daily_revenue = sales_sum['Revenue'].resample('D').sum()
daily_revenue

In [None]:
# Line chart of the daily revenue series, labelled through the returned Axes.
ax = daily_revenue.plot(figsize=(12, 6))
ax.set_title("Germany's Daily Revenue for January 1, 2015 to February 8, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
plt.show()

##### Germany's Forecasted Revenue for February 15 - February 22, 2016

In [None]:
# Fit simple exponential smoothing to the daily revenue series (fit() is called
# with no arguments).
model = SimpleExpSmoothing(daily_revenue)
model_fit = model.fit()

# Predict 2016-02-15 through 2016-02-22, the same window the actual revenue is sliced
# to for the comparison plot, the printed values and the error metrics in this section.
# This was previously 2016-02-09..2016-02-16, which dated the forecast six days
# earlier than the actuals it is compared with.
forecast = model_fit.predict(start='2016-02-15', end='2016-02-22')

In [None]:
# Overlay the forecast on the daily revenue series.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(daily_revenue, label='Actual')
ax.plot(forecast, label='Forecast')
ax.set_title("Germany's Forecasted Daily Revenue in Feb 15 - Feb 22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()

In [None]:
# Actual daily revenue totals for 2016-02-15 through 2016-02-22.
# Sorting first keeps the date-label slice valid even if sales_data's DatetimeIndex is
# not in chronological order; mergesort is stable, so same-day rows keep their order.
# NOTE: this rebinds daily_revenue, replacing whatever series it held before this cell.
sales_feb = sales_data.sort_index(kind='mergesort').loc['2016-02-15':'2016-02-22']
daily_revenue = sales_feb['Revenue'].resample('D').sum()

# Round the forecast values to whole units in one vectorized step and keep them as a
# NumPy array for printing (np.asarray accepts a pandas Series or an ndarray).
forecast_rounded = np.asarray(forecast).round(0)

# Print Forecasted 'Revenue' for February 15 - February 22, 2016
print('Forecasted Revenue for February 15 - February 22, 2016 is:', forecast_rounded, '\n')

# Print Actual 'Revenue' for February 15 - February 22, 2016
print('Actual Revenue for February 15 - February 22, 2016 is:', daily_revenue.values)

In [None]:
# Compare the forecast with the observed daily revenue totals.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(forecast, label='Forecast')
ax.plot(daily_revenue, label='Actual')
# The title names the window the actuals were sliced to (Feb 15-22); it previously
# read "February 15-28".
ax.set_title("Germany's Forecasted and Actual Revenue in February 15-22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()

In [None]:
# Score the forecast against the observed daily revenue.
mae = mean_absolute_error(daily_revenue, forecast)
mse = mean_squared_error(daily_revenue, forecast)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(daily_revenue, forecast)

# Report each metric on its own line.
for metric_label, metric_value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
    ('R2:', r2),
):
    print(metric_label, metric_value)

#### Germany's Sales Data: February 1, 2015 - February 28, 2015 

In [None]:
# Load the raw sales table.
sales_data = pd.read_csv(r'../Raw_Data/SalesForCourse_quizz_table.csv')

# Drop the row labelled 34866.
# NOTE(review): presumably this is the incomplete row that the null check
# `sales_data[sales_data.isnull().any(axis=1)]` surfaced -- confirm against the CSV.
# That check was a discarded expression with no effect, so it is not repeated here.
sales_data = sales_data.drop(index=34866)

# Parse "Date", make it the index, then keep only the German rows.
sales_data['Date'] = pd.to_datetime(sales_data['Date'])
sales_data = sales_data.set_index('Date')
sales_data = sales_data.loc[sales_data['Country'] == 'Germany']

# Sort chronologically in case the CSV rows are not in date order: slicing a
# DatetimeIndex by date strings is only well-defined on a monotonic index (pandas
# warns or raises otherwise). mergesort is stable, so rows sharing a date keep
# their original relative order.
sales_data = sales_data.sort_index(kind='mergesort')

# Training window: total revenue per calendar day, 2015-02-01 through 2015-02-28.
sales_sum = sales_data.loc['2015-02-01':'2015-02-28']
daily_revenue = sales_sum['Revenue'].resample('D').sum()
daily_revenue

In [None]:
# Fit simple exponential smoothing to the daily revenue series (fit() is called
# with no arguments).
model = SimpleExpSmoothing(daily_revenue)
model_fit = model.fit()

# Predict daily revenue for 2016-02-15 through 2016-02-22.
forecast_window = {'start': '2016-02-15', 'end': '2016-02-22'}
forecast = model_fit.predict(**forecast_window)

In [None]:
# Overlay the forecast on the daily revenue series.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(daily_revenue, label='Actual')
ax.plot(forecast, label='Forecast')
ax.set_title("Germany's Forecasted Daily Revenue in Feb 15 - Feb 22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()

In [None]:
# Actual daily revenue totals for 2016-02-15 through 2016-02-22.
# Sorting first keeps the date-label slice valid even if sales_data's DatetimeIndex is
# not in chronological order; mergesort is stable, so same-day rows keep their order.
# NOTE: this rebinds daily_revenue, replacing whatever series it held before this cell.
sales_feb = sales_data.sort_index(kind='mergesort').loc['2016-02-15':'2016-02-22']
daily_revenue = sales_feb['Revenue'].resample('D').sum()

# Round the forecast values to whole units in one vectorized step and keep them as a
# NumPy array for printing (np.asarray accepts a pandas Series or an ndarray).
forecast_rounded = np.asarray(forecast).round(0)

# Print Forecasted 'Revenue' for February 15 - February 22, 2016
print('Forecasted Revenue for February 15 - February 22, 2016 is:', forecast_rounded, '\n')

# Print Actual 'Revenue' for February 15 - February 22, 2016
print('Actual Revenue for February 15 - February 22, 2016 is:', daily_revenue.values)

In [None]:
# Compare the forecast with the observed daily revenue totals.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(forecast, label='Forecast')
ax.plot(daily_revenue, label='Actual')
# The title names the window the actuals were sliced to (Feb 15-22); it previously
# read "February 15-28".
ax.set_title("Germany's Forecasted and Actual Revenue in February 15-22, 2016")
ax.set_xlabel('Date')
ax.set_ylabel('Revenue')
ax.legend()
plt.show()

In [None]:
# Score the forecast against the observed daily revenue.
mae = mean_absolute_error(daily_revenue, forecast)
mse = mean_squared_error(daily_revenue, forecast)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(daily_revenue, forecast)

# Report each metric on its own line.
for metric_label, metric_value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
    ('R2:', r2),
):
    print(metric_label, metric_value)