# Train-Test data

In [1]:
import pandas as pd
import numpy as np
# Read the workbook "Data.xlsx" (path relative to the working directory) into a DataFrame.
data = pd.read_excel("Data.xlsx")

In [2]:
# (rows, columns) of the loaded frame.
data.shape

(1707, 52)

In [3]:
# Turn +/-infinity into NaN so the next step can treat them as missing values.
data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [4]:
# Replace every missing value with 0, modifying `data` in place.
data.fillna(0, inplace=True)
# Write the frame back through np.nan_to_num (NaN -> 0, +/-inf -> large finite numbers).
# NOTE(review): after the fillna above no NaN should remain, so this line looks
# redundant unless infinities are still present — confirm before removing.
data[:] = np.nan_to_num(data)

In [5]:
# Per-column count of missing values; every entry should be 0 after the clean-up.
data.isna().sum()

Day                            0
Before vacation                0
Yarane                         0
Transaction weight (Alborz)    0
Month_Aban                     0
Month_Azar                     0
Month_Bahman                   0
Month_Dey                      0
Month_Esfand                   0
Month_Farvardin                0
Month_Khordad                  0
Month_Mehr                     0
Month_Mordad                   0
Month_Ordibehesht              0
Month_Shahrivar                0
Month_Tir                      0
Week-day_Friday                0
Week-day_Monday                0
Week-day_Saturday              0
Week-day_Sunday                0
Week-day_Thursday              0
Week-day_Tuesday               0
Week-day_Wednesday             0
Week-number_Final              0
Week-number_First              0
Week-number_Fourth             0
Week-number_Second             0
Week-number_Third              0
Label1_12 Farvardin            0
Label1_13 bedar                0
Label1_14 

# Regression

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

from collections import Counter
from IPython.core.display import display, HTML
# Use seaborn's 'darkgrid' style for plots drawn after this point.
sns.set_style('darkgrid')

In [7]:
# Predictors: every column of `data` except the target column.
X = data.drop(columns='Transaction weight (Alborz)')

# Target: the transaction-weight column, selected as a single Series.
y = data['Transaction weight (Alborz)']

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

In [9]:
# Report the dimensions of each piece of the split, one line per piece.
for label, part in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test", y_test),
):
    print(label, part.shape)

Shape of X_train:  (1365, 51)
Shape of X_test:  (342, 51)
Shape of y_train:  (1365,)
Shape of y_test (342,)


## Random Forest

In [10]:
# Fitting the Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
# Random forest of 300 bootstrapped trees; a node is split only if it holds at
# least 8 samples, and random_state pins the randomness for repeatable fits.
# max_features=1.0 makes every split consider all features. That is what the
# former 'auto' alias meant for regressors; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error.
regressor_rf = RandomForestRegressor(
    n_estimators=300,
    bootstrap=True,
    max_features=1.0,
    min_samples_split=8,
    random_state=0,
)
regressor_rf.fit(X_train, y_train)

RandomForestRegressor(min_samples_split=8, n_estimators=300, random_state=0)

In [11]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


# R^2 on the rows the forest was fitted on.
y_pred_rf_train = regressor_rf.predict(X_train)
r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

# R^2 on the held-out rows.
y_pred_rf_test = regressor_rf.predict(X_test)
r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

# Test-set error: MSE first, then RMSE as its square root.
mse_rf = mean_squared_error(y_test, y_pred_rf_test)
rmse_rf = np.sqrt(mse_rf)

print('R2_score (train): ', r2_score_rf_train)
print('R2_score (test): ', r2_score_rf_test)
print("RMSE: ", rmse_rf)
print("MSE: ", mse_rf)

R2_score (train):  0.9000254126533759
R2_score (test):  0.7546482918463253
RMSE:  0.0020745345824842113
MSE:  4.303693733922941e-06


## Prediction for Azar to Esfand

In [12]:
# Read the workbook "Data-Validation.xlsx" (relative path) into a DataFrame
# and show its first rows.
new_data = pd.read_excel("Data-Validation.xlsx")
new_data.head()

Unnamed: 0,Day,Before vacation,Yarane,Month_Aban,Month_Azar,Month_Bahman,Month_Dey,Month_Esfand,Month_Farvardin,Month_Khordad,...,Label1_Tasua,Label1_Vafat-Emam1,Label1_Vafat-Emam11,Label1_Vafat-Emam2,Label1_Vafat-Emam6,Label1_Vafat-Emam8,Label1_Vafat-Zahra,Label1_Veladat-Emam1,Label1_Veladat-Emam12,Label1_Veladat-Emam6
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# (rows, columns) of the validation frame.
new_data.shape

(119, 51)

In [14]:
# Per-column count of missing values in the validation frame.
new_data.isna().sum()

Day                      0
Before vacation          0
Yarane                   0
Month_Aban               0
Month_Azar               0
Month_Bahman             0
Month_Dey                0
Month_Esfand             0
Month_Farvardin          0
Month_Khordad            0
Month_Mehr               0
Month_Mordad             0
Month_Ordibehesht        0
Month_Shahrivar          0
Month_Tir                0
Week-day_Friday          0
Week-day_Monday          0
Week-day_Saturday        0
Week-day_Sunday          0
Week-day_Thursday        0
Week-day_Tuesday         0
Week-day_Wednesday       0
Week-number_Final        0
Week-number_First        0
Week-number_Fourth       0
Week-number_Second       0
Week-number_Third        0
Label1_12 Farvardin      0
Label1_13 bedar          0
Label1_14 Khordad        0
Label1_15 Khordad        0
Label1_22 Bahman         0
Label1_Arbain            0
Label1_Ashura            0
Label1_Fetr              0
Label1_Fetr2             0
Label1_Ghadir            0
L

In [15]:
# Turn +/-infinity into NaN so the next step can treat them as missing values.
new_data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [16]:
# Replace every missing value with 0, modifying `new_data` in place.
new_data.fillna(0, inplace=True)
# Write the frame back through np.nan_to_num (NaN -> 0, +/-inf -> large finite numbers).
# NOTE(review): after the fillna above no NaN should remain, so this line looks
# redundant unless infinities are still present — confirm before removing.
new_data[:] = np.nan_to_num(new_data)

In [17]:
# Predict the target for the validation rows with the fitted random forest.
# Columns are selected by the training frame's column labels so the features
# reach the model under the same names and in the same order it was fitted on;
# a column missing from `new_data` raises KeyError here instead of producing
# predictions from misaligned features.
y_pred_rf_new = regressor_rf.predict(new_data[X_train.columns])
y_pred_rf_new

array([0.03330619, 0.03386118, 0.036457  , 0.02706853, 0.03377846,
       0.03316123, 0.03295828, 0.03255076, 0.0331077 , 0.03549668,
       0.02732015, 0.0336193 , 0.03301872, 0.03333255, 0.0331552 ,
       0.03336018, 0.03557587, 0.02662714, 0.0335877 , 0.03367775,
       0.03389331, 0.03374901, 0.03402247, 0.03550885, 0.02877986,
       0.03439737, 0.03657259, 0.03682142, 0.03707518, 0.03727138,
       0.03675365, 0.02765809, 0.03510386, 0.03389669, 0.03494938,
       0.02916871, 0.03414707, 0.03672847, 0.02775582, 0.03462225,
       0.03454825, 0.03430472, 0.03381798, 0.03424699, 0.03599595,
       0.02781711, 0.0339988 , 0.03315626, 0.0336512 , 0.03397585,
       0.03485151, 0.03622611, 0.02809668, 0.03432324, 0.03324004,
       0.03345727, 0.03350716, 0.03366634, 0.03577235, 0.02920632,
       0.03443119, 0.03532519, 0.03523165, 0.03427964, 0.03440435,
       0.03526591, 0.02814858, 0.03416203, 0.03353572, 0.03361663,
       0.0341154 , 0.03421407, 0.03607293, 0.02776772, 0.03197

In [18]:
# Wrap the prediction array in a DataFrame (single column, default label 0)
# and preview the first rows.
df = pd.DataFrame(y_pred_rf_new)
df.head()

Unnamed: 0,0
0,0.033306
1,0.033861
2,0.036457
3,0.027069
4,0.033778


In [19]:
# Save the predictions to "Data-Prediction.xlsx" without writing the row index.
df.to_excel("Data-Prediction.xlsx", index=False)

## OLS regression

In [20]:
import pandas as pd
# Re-read "Data.xlsx", rebinding `data` to a fresh frame.
# NOTE(review): the inf/NaN replacement applied to the earlier frame is not
# repeated on this one in the cells that follow — confirm that is intended.
data=pd.read_excel("Data.xlsx")

In [21]:
# Preview the first rows of the re-loaded frame.
data.head()

Unnamed: 0,Day,Before vacation,Yarane,Transaction weight (Alborz),Month_Aban,Month_Azar,Month_Bahman,Month_Dey,Month_Esfand,Month_Farvardin,...,Label1_Tasua,Label1_Vafat-Emam1,Label1_Vafat-Emam11,Label1_Vafat-Emam2,Label1_Vafat-Emam6,Label1_Vafat-Emam8,Label1_Vafat-Zahra,Label1_Veladat-Emam1,Label1_Veladat-Emam12,Label1_Veladat-Emam6
0,1,1,0,0.02339,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0.023792,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0.022789,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0.024508,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0.026897,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [22]:
import numpy as np
from numpy import log, log1p

import pandas as pd

import scipy.stats as stats
from scipy.stats import shapiro,boxcox,yeojohnson
from scipy.stats import boxcox

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress all warnings from this point on.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence
# warnings from the libraries used below — consider narrowing it by category.
warnings.filterwarnings("ignore")

In [23]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split,ShuffleSplit,GridSearchCV,cross_val_score,cross_val_predict

In [24]:
# NOTE(review): X and y here are still the objects built from the earlier
# frame; `data` has since been re-read, and the next cell rebuilds X and y and
# repeats this same call, overwriting the result of this cell.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [25]:
# Rebuild the response and the predictors from the re-loaded frame.
y = data["Transaction weight (Alborz)"]
X = data.drop(columns=["Transaction weight (Alborz)"])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares on the training rows. X_train is passed as-is,
# with no explicit constant column added.
lm = sm.OLS(endog=y_train, exog=X_train)
model = lm.fit()
model.summary()

0,1,2,3
Dep. Variable:,Transaction weight (Alborz),R-squared:,0.703
Model:,OLS,Adj. R-squared:,0.692
Method:,Least Squares,F-statistic:,64.78
Date:,"Fri, 27 Jan 2023",Prob (F-statistic):,3.07e-308
Time:,12:50:38,Log-Likelihood:,6479.4
No. Observations:,1365,AIC:,-12860.0
Df Residuals:,1316,BIC:,-12610.0
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Day,9.886e-05,3.08e-05,3.210,0.001,3.84e-05,0.000
Before vacation,0.0007,0.000,2.806,0.005,0.000,0.001
Yarane,-0.0002,0.000,-0.516,0.606,-0.001,0.001
Month_Aban,0.0065,0.000,29.188,0.000,0.006,0.007
Month_Azar,0.0063,0.000,26.079,0.000,0.006,0.007
Month_Bahman,0.0065,0.000,27.750,0.000,0.006,0.007
Month_Dey,0.0063,0.000,27.256,0.000,0.006,0.007
Month_Esfand,0.0072,0.000,29.433,0.000,0.007,0.008
Month_Farvardin,0.0070,0.000,29.981,0.000,0.007,0.007

0,1,2,3
Omnibus:,169.964,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1519.167
Skew:,0.198,Prob(JB):,0.0
Kurtosis:,8.153,Cond. No.,1e+16


In [26]:
# Fitted OLS coefficients, one per predictor column, indexed by column name.
Feature_weight=model.params
Feature_weight

Day                      0.000099
Before vacation          0.000730
Yarane                  -0.000180
Month_Aban               0.006481
Month_Azar               0.006278
Month_Bahman             0.006480
Month_Dey                0.006287
Month_Esfand             0.007199
Month_Farvardin          0.007029
Month_Khordad            0.005728
Month_Mehr               0.006703
Month_Mordad             0.005685
Month_Ordibehesht        0.005432
Month_Shahrivar          0.005948
Month_Tir                0.005309
Week-day_Friday          0.005258
Week-day_Monday          0.010916
Week-day_Saturday        0.011815
Week-day_Sunday          0.011024
Week-day_Thursday        0.012897
Week-day_Tuesday         0.011000
Week-day_Wednesday       0.011649
Week-number_Final        0.014822
Week-number_First        0.015987
Week-number_Fourth       0.014654
Week-number_Second       0.014857
Week-number_Third        0.014238
Label1_12 Farvardin     -0.004074
Label1_13 bedar         -0.018276
Label1_14 Khor

In [27]:
# Coefficients ordered from largest to smallest value (signed, not by magnitude).
Sorted_Feature_weight=Feature_weight.sort_values(ascending=False)
Sorted_Feature_weight

Week-number_First        0.015987
Week-number_Second       0.014857
Week-number_Final        0.014822
Week-number_Fourth       0.014654
Week-number_Third        0.014238
Week-day_Thursday        0.012897
Week-day_Saturday        0.011815
Week-day_Wednesday       0.011649
Week-day_Sunday          0.011024
Week-day_Tuesday         0.011000
Week-day_Monday          0.010916
Label1_Naft              0.010320
Month_Esfand             0.007199
Month_Farvardin          0.007029
Month_Mehr               0.006703
Month_Aban               0.006481
Month_Bahman             0.006480
Month_Dey                0.006287
Month_Azar               0.006278
Month_Shahrivar          0.005948
Month_Khordad            0.005728
Month_Mordad             0.005685
Month_Ordibehesht        0.005432
Month_Tir                0.005309
Week-day_Friday          0.005258
Before vacation          0.000730
Day                      0.000099
Yarane                  -0.000180
Label1_Veladat-Emam6    -0.002464
Label1_Vafat-E