In [702]:
import pickle
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from scipy.signal import detrend
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

# Data Processing 

In [783]:
sales_data = pd.read_csv('Data/sales dataset.csv')
sales_data['salesDate'] = pd.to_datetime(sales_data['salesDate'], infer_datetime_format=True)
selected_columns1 =['salesAmount', 'salesDate']
sales_data = sales_data[selected_columns1]
##########################################################################
expenses_data = pd.read_csv('Data/expenses dataset.csv')
expenses_data['expenseDate'] = pd.to_datetime(expenses_data['expenseDate'], infer_datetime_format=True)
selected_columns =['expenseAmount', 'expenseDate']
expenses_data = expenses_data[selected_columns]

In [784]:
# The function groups either the expense or sales data into the daily series
# But the first column must be the amount and the second column must be the dates in datetime format
def groupDay(df):
    df['Year'] = df.iloc[:,1].dt.year
    df['Month'] = df.iloc[:,1].dt.month
    df['Day'] = df.iloc[:,1].dt.day
    df['Amount'] = df.iloc[:,0]
    daily = df.groupby(['Year','Month','Day'])['Amount'].sum().reset_index()
    return daily

def dailyToSingleDate(df,Year, Month, Day):
    days = []
    df['date'] = pd.to_datetime(df[['Year','Month','Day']])
    df = df.drop(['Year','Month','Day'], axis=1)
    
    for date in df['date']:
        day_name = date.day_name()
    
        if day_name == 'Sunday':
            days.append('Monday')
        else:
            days.append(day_name)
    df['day_of_week'] = days
#     df = df.set_index('date')   
    return df

In [785]:
sd = groupDay(sales_data)
daily_sales = dailyToSingleDate(sd, sd['Year'], sd['Month'], sd['Day'])
ed = groupDay(expenses_data)
daily_expenses = dailyToSingleDate(ed, ed['Year'], ed['Month'], ed['Day'])

In [786]:
dailySales = daily_sales[700:]    #705 for the removal of irrelevant data points
dailyExpenses = daily_expenses[800:]  #890 for the removal of irrelevant data points
# monthlySales = monthly_sales
# monthlyExpenses = monthly_expenses

In [787]:
category1 = dailySales
category = category1
category

Unnamed: 0,Amount,date,day_of_week
700,2164368,2023-02-01,Wednesday
701,1361768,2023-02-02,Thursday
702,1687995,2023-02-03,Friday
703,1236633,2023-02-04,Saturday
704,2183193,2023-02-06,Monday
...,...,...,...
936,1396202,2023-11-06,Monday
937,1441488,2023-11-07,Tuesday
938,1867192,2023-11-08,Wednesday
939,1401098,2023-11-09,Thursday


# Remove Outliers

In [788]:
def remove_outliers_zscore(data, threshold=3):
    z_scores = np.abs((data - np.mean(data)) / np.std(data))
    filtered_data = data[(z_scores < threshold)]
    return filtered_data

filteredCategory = remove_outliers_zscore(category['Amount'])
category['Amount'] = filteredCategory
category = category.tail(35)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category['Amount'] = filteredCategory


In [789]:
category

Unnamed: 0,Amount,date,day_of_week
906,1061494.0,2023-10-02,Monday
907,1585790.0,2023-10-03,Tuesday
908,1784118.0,2023-10-04,Wednesday
909,1380207.0,2023-10-05,Thursday
910,1232128.0,2023-10-06,Friday
911,1224978.0,2023-10-07,Saturday
912,1158278.0,2023-10-09,Monday
913,1168275.0,2023-10-10,Tuesday
914,1802021.0,2023-10-11,Wednesday
915,1120984.0,2023-10-12,Thursday


# Encoding 

In [790]:
day_mapping = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6
}

def onehot(df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    df = pd.get_dummies(df, columns=['day_of_week'], prefix='day').dropna()
    return df

def ordinal(df):
    df['day_of_the_week_encoded'] = df['day_of_week'].map(day_mapping)
    selected_columns = ['Amount','day_of_the_week_encoded']
    df = df[selected_columns].dropna()
    return df

In [791]:
category = onehot(category)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])


In [792]:
category

Unnamed: 0_level_0,Amount,day_Friday,day_Monday,day_Saturday,day_Thursday,day_Tuesday,day_Wednesday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-10-02,1061494.0,0,1,0,0,0,0
2023-10-03,1585790.0,0,0,0,0,1,0
2023-10-04,1784118.0,0,0,0,0,0,1
2023-10-05,1380207.0,0,0,0,1,0,0
2023-10-06,1232128.0,1,0,0,0,0,0
2023-10-07,1224978.0,0,0,1,0,0,0
2023-10-09,1158278.0,0,1,0,0,0,0
2023-10-10,1168275.0,0,0,0,0,1,0
2023-10-11,1802021.0,0,0,0,0,0,1
2023-10-12,1120984.0,0,0,0,1,0,0


# Adding Lagged Variables

In [793]:
lags = [1,2,3,4]
for lag in lags:
    category[f'lag{lag}'] = category['Amount'].shift(lag)
category.dropna(inplace=True)
selected_columns = ['Amount', 'lag1', 'lag2', 'lag3','lag4','day_Friday','day_Monday','day_Saturday','day_Thursday','day_Tuesday','day_Wednesday']
# selected_columns = ['Amount', 'lag1', 'lag2', 'lag3','day_of_the_week_encoded']
category = category[selected_columns]

In [839]:
category

Unnamed: 0_level_0,Amount,lag1,lag2,lag3,lag4,day_Friday,day_Monday,day_Saturday,day_Thursday,day_Tuesday,day_Wednesday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-10-06,1232128.0,1380207.0,1784118.0,1585790.0,1061494.0,1,0,0,0,0,0
2023-10-07,1224978.0,1232128.0,1380207.0,1784118.0,1585790.0,0,0,1,0,0,0
2023-10-09,1158278.0,1224978.0,1232128.0,1380207.0,1784118.0,0,1,0,0,0,0
2023-10-10,1168275.0,1158278.0,1224978.0,1232128.0,1380207.0,0,0,0,0,1,0
2023-10-11,1802021.0,1168275.0,1158278.0,1224978.0,1232128.0,0,0,0,0,0,1
2023-10-12,1120984.0,1802021.0,1168275.0,1158278.0,1224978.0,0,0,0,1,0,0
2023-10-13,961789.0,1120984.0,1802021.0,1168275.0,1158278.0,1,0,0,0,0,0
2023-10-14,1102110.0,961789.0,1120984.0,1802021.0,1168275.0,0,0,1,0,0,0
2023-10-16,734380.0,1102110.0,961789.0,1120984.0,1802021.0,0,1,0,0,0,0
2023-10-17,1471970.0,734380.0,1102110.0,961789.0,1120984.0,0,0,0,0,1,0


In [795]:
scaler = StandardScaler()
X = scaler.fit_transform(category.drop(category.columns[0], axis=1))
y = scaler.fit_transform(np.array(category.iloc[:,0]).reshape(-1, 1))

# PCA

In [796]:
# n_components = 10
# pca = PCA(n_components=n_components)
# X = pca.fit_transform(X)

# Train, Test, Split

In [797]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fit the model

In [841]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=250, random_state=42, criterion='absolute_error')
rf_regressor.fit(X_train, y_train)

  rf_regressor.fit(X_train, y_train)


In [833]:
X_test

array([[-1.41534307,  1.21071085,  0.58183133,  0.19344977, -0.5       ,
        -0.4472136 ,  2.23606798, -0.39223227, -0.4472136 , -0.4472136 ],
       [-1.59192633, -1.4199884 ,  1.24794136,  0.60569596, -0.5       ,
         2.23606798, -0.4472136 , -0.39223227, -0.4472136 , -0.4472136 ],
       [ 0.20356346, -1.59405276, -1.42339623,  1.27053872, -0.5       ,
        -0.4472136 , -0.4472136 , -0.39223227,  2.23606798, -0.4472136 ],
       [ 0.2929276 ,  0.17582494, -1.60014949, -1.39571668, -0.5       ,
        -0.4472136 , -0.4472136 , -0.39223227, -0.4472136 ,  2.23606798],
       [ 1.13298126,  0.26391433,  0.19706881, -1.57213367, -0.5       ,
        -0.4472136 , -0.4472136 ,  2.54950976, -0.4472136 , -0.4472136 ],
       [ 0.21322488,  1.09198489,  0.28651897,  0.22166544,  2.        ,
        -0.4472136 , -0.4472136 , -0.39223227, -0.4472136 , -0.4472136 ]])

In [834]:
y_pred = rf_regressor.predict(X_test)

In [835]:
y_pred

array([-0.54747263, -0.68465881, -0.31604928,  1.36472884,  1.41735722,
       -0.30178093])

# Evaluate Model

In [836]:
mse = mean_squared_error(y_test, y_pred)
rmse =  np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error : {mse}')
print(f'Mean Absolute Error : {mae}')

Mean Squared Error : 0.7981195615119177
Mean Absolute Error : 0.8333642137054401


# Cross Validation

In [837]:
from sklearn.model_selection import cross_val_score

rf_regressor = RandomForestRegressor(n_estimators=250, random_state=42)
scores = cross_val_score(rf_regressor, X_train, y_train, cv=3, scoring='neg_mean_squared_error')

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [838]:
print("Mean squared error:", -scores.mean())

Mean squared error: 1.3768443593643578


In [845]:
# data = {
#     'lag1': [1120984.0],
#     'lag2': [1802021.0],
#     'lag3': [1168275.0],
#     'lag4': [1158278.0],
#     'day_Friday': [0],
#     'day_Monday': [0],
#     'day_Saturday': [1],
#     'day_Thursday': [0],
#     'day_Tuesday': [0],
#     'day_Wednesday': [0]
# }

data = {[1120984.0] ,[1802021.0] ,[1168275.0] ,[1158278.0] ,[0], [1], [0], [0], [0]}

df = pd.DataFrame(data)
predict = rf_regressor.predict(df)
predict = predict.reshape(-1, 1)
predict = scaler.inverse_transform(predict)
predict

TypeError: unhashable type: 'list'