In [228]:
import pickle
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from scipy.signal import detrend
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

# Data Processing 

In [229]:
sales_data = pd.read_csv('Data/sales dataset.csv')
sales_data['salesDate'] = pd.to_datetime(sales_data['salesDate'], infer_datetime_format=True)
selected_columns1 =['salesAmount', 'salesDate']
sales_data = sales_data[selected_columns1]
##########################################################################
expenses_data = pd.read_csv('Data/expenses dataset.csv')
expenses_data['expenseDate'] = pd.to_datetime(expenses_data['expenseDate'], infer_datetime_format=True)
selected_columns =['expenseAmount', 'expenseDate']
expenses_data = expenses_data[selected_columns]

In [230]:
month_map = {
    1:'January', 2:'Febuary', 3:'March', 4:'April',
    5:'May', 6:'June', 7:'July', 8:'August',
    9:'September', 10:'October', 11:'November', 12:'December' 
}

def groupMonth(df):
    df['Year'] = df.iloc[:,1].dt.year
    df['Month'] = df.iloc[:,1].dt.month
    df['Amount'] = df.iloc[:,0]
    monthly = df.groupby(['Year','Month'])['Amount'].sum().reset_index()
    return monthly

def monthlyToSingleDate(df):
    df['date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month'].astype(str))
    df['Month'] = df['Month'].map(month_map)
    
    return df

In [231]:
em = groupMonth(expenses_data)
sm = groupMonth(sales_data)
monthly_expenses = monthlyToSingleDate(em)
monthly_sales = monthlyToSingleDate(sm)

In [232]:
monthlySales = monthly_sales
monthlyExpenses = monthly_expenses

In [233]:
category1 = monthlyExpenses
category = category1
category

Unnamed: 0,Year,Month,Amount,date
0,2020,May,462850.0,2020-05-01
1,2020,June,958734.0,2020-06-01
2,2020,July,767630.0,2020-07-01
3,2020,August,1233400.0,2020-08-01
4,2020,September,2379730.0,2020-09-01
5,2020,October,2018480.0,2020-10-01
6,2020,November,2115684.0,2020-11-01
7,2020,December,1776590.0,2020-12-01
8,2021,January,2402525.0,2021-01-01
9,2021,Febuary,2576390.0,2021-02-01


# Remove Outliers

In [234]:
def remove_outliers_zscore(data, threshold=3):
    z_scores = np.abs((data - np.mean(data)) / np.std(data))
    filtered_data = data[(z_scores < threshold)]
    return filtered_data

filteredCategory = remove_outliers_zscore(category['Amount'])
category['Amount'] = filteredCategory
category = category.tail(12)

In [235]:
category

Unnamed: 0,Year,Month,Amount,date
31,2022,December,2462150.0,2022-12-01
32,2023,January,4652250.0,2023-01-01
33,2023,Febuary,1050750.0,2023-02-01
34,2023,March,546350.0,2023-03-01
35,2023,April,1654105.0,2023-04-01
36,2023,May,2195350.0,2023-05-01
37,2023,June,1529350.0,2023-06-01
38,2023,July,1549450.0,2023-07-01
39,2023,August,2304875.0,2023-08-01
40,2023,September,3404500.0,2023-09-01


# Encoding 

In [236]:
def onehot(df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    df = pd.get_dummies(df, columns=['Month'], prefix='Month').dropna()
    return df

def ordinal(df):
    df['day_of_the_week_encoded'] = df['day_of_week'].map(day_mapping)
    selected_columns = ['Amount','day_of_the_week_encoded']
    df = df[selected_columns].dropna()
    return df

In [237]:
category = onehot(category)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])


# Adding Lagged Variables

In [238]:
lags = [1,2,3,4]
for lag in lags:
    category[f'lag{lag}'] = category['Amount'].shift(lag)
category.dropna(inplace=True)
selected_columns = ['Amount', 'lag1', 'lag2', 'lag3','lag4','Month_January','Month_Febuary','Month_March','Month_April','Month_May','Month_June','Month_July','Month_August','Month_September','Month_October','Month_November','Month_December']
# selected_columns = ['Amount', 'lag1', 'lag2', 'lag3','day_of_the_week_encoded']
category = category[selected_columns]

In [239]:
len(category)

8

In [240]:
scaler = StandardScaler()
X = scaler.fit_transform(category.drop(category.columns[0], axis=1))
y = scaler.fit_transform(np.array(category.iloc[:,0]).reshape(-1, 1))

# PCA

In [241]:
# n_components = 10
# pca = PCA(n_components=n_components)
# X = pca.fit_transform(X)

# Train, Test, Split

In [242]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fit the model

In [243]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=250, random_state=42)
rf_regressor.fit(X_train, y_train)

  rf_regressor.fit(X_train, y_train)


In [244]:
X_test

array([[ 1.8075175 ,  0.64734527, -0.33379364, -0.36583624,  0.        ,
         0.        ,  0.        , -0.37796447, -0.37796447, -0.37796447,
        -0.37796447, -0.37796447, -0.37796447,  2.64575131, -0.37796447,
         0.        ],
       [ 0.7372479 ,  2.00184828,  0.31969742, -0.34855951,  0.        ,
         0.        ,  0.        , -0.37796447, -0.37796447, -0.37796447,
        -0.37796447, -0.37796447, -0.37796447, -0.37796447,  2.64575131,
         0.        ]])

In [245]:
y_pred = rf_regressor.predict(X_test)

In [246]:
y_pred

array([0.53271945, 0.52856743])

# Evaluate Model

In [247]:
mse = mean_squared_error(y_test, y_pred)
rmse =  np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error : {mse}')
print(f'Mean Absolute Error : {mae}')

Mean Squared Error : 2.811763835107986
Mean Absolute Error : 1.281608694343027


# Cross Validation

In [248]:
from sklearn.model_selection import cross_val_score

rf_regressor = RandomForestRegressor(n_estimators=250, random_state=42)
scores = cross_val_score(rf_regressor, X_train, y_train, cv=4, scoring='neg_mean_squared_error')

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [249]:
print("Mean squared error:", -scores.mean())

Mean squared error: 1.5057927050455429


In [845]:
# data = {
#     'lag1': [1120984.0],
#     'lag2': [1802021.0],
#     'lag3': [1168275.0],
#     'lag4': [1158278.0],
#     'day_Friday': [0],
#     'day_Monday': [0],
#     'day_Saturday': [1],
#     'day_Thursday': [0],
#     'day_Tuesday': [0],
#     'day_Wednesday': [0]
# }

data = {[1120984.0] ,[1802021.0] ,[1168275.0] ,[1158278.0] ,[0], [1], [0], [0], [0]}

df = pd.DataFrame(data)
predict = rf_regressor.predict(df)
predict = predict.reshape(-1, 1)
predict = scaler.inverse_transform(predict)
predict

TypeError: unhashable type: 'list'