### **MLR Regression**

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go 
import seaborn as sns
import matplotlib.pyplot as plt  

In [2]:
table = pd.read_csv('https://raw.githubusercontent.com/morn12/tb/main/Tb1.csv',parse_dates=['Date'])
table.drop(['Unnamed: 0'] , axis=1 ,inplace=True)

In [3]:
regression = table.copy()
regression.drop(['Date', 'Hour'], axis=1, inplace=True)

In [4]:
regression = regression[['Temp',	'Is_Holiday',	'Is_Vac',	'Is_Weekend',	'Is_Cold','Is_HW','SMP']]
regression

Unnamed: 0,Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP
0,24.47,0,1,0,0,0,146.47
1,24.30,0,1,0,0,0,171.22
2,24.17,0,1,0,0,0,128.82
3,23.99,0,1,0,0,0,149.85
4,23.93,0,1,0,0,0,146.53
...,...,...,...,...,...,...,...
30708,12.26,0,0,0,0,0,115.59
30709,12.27,0,0,0,0,0,115.59
30710,12.06,0,0,0,0,0,115.59
30711,12.06,0,0,0,0,0,103.20


# נרמול עמודת הטמפרטורה

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler_TMP = MinMaxScaler()
scaled_Temp = pd.DataFrame(scaler_TMP.fit_transform(regression[['Temp']]))
regression_for_scale = regression.drop('Temp',axis=1)
scaled_df = pd.merge(left= scaled_Temp, right= regression_for_scale, left_index=True, right_index=True)

scaled_df.rename(columns={ 0: "Temp"},inplace = True)

scaled_df.head()

Unnamed: 0,Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP
0,0.606796,0,1,0,0,0,146.47
1,0.601638,0,1,0,0,0,171.22
2,0.597694,0,1,0,0,0,128.82
3,0.592233,0,1,0,0,0,149.85
4,0.590413,0,1,0,0,0,146.53


In [6]:
from pandas.core.accessor import register_dataframe_accessor
y = scaled_df.SMP
X = scaled_df.drop('SMP',axis=1)
X

Unnamed: 0,Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW
0,0.606796,0,1,0,0,0
1,0.601638,0,1,0,0,0
2,0.597694,0,1,0,0,0
3,0.592233,0,1,0,0,0
4,0.590413,0,1,0,0,0
...,...,...,...,...,...,...
30708,0.236347,0,0,0,0,0
30709,0.236650,0,0,0,0,0
30710,0.230279,0,0,0,0,0
30711,0.230279,0,0,0,0,0


# מפת חום וקורלציה

In [7]:
corr_df = regression.corr()
corr_df

Unnamed: 0,Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP
Temp,1.0,0.005164,0.50441,-0.033853,-0.343624,0.441425,0.249662
Is_Holiday,0.005164,1.0,-0.175301,-0.008804,-0.041865,-0.001942,0.009321
Is_Vac,0.50441,-0.175301,1.0,0.005302,-0.104231,0.240261,0.163111
Is_Weekend,-0.033853,-0.008804,0.005302,1.0,0.069529,-0.018164,-0.009389
Is_Cold,-0.343624,-0.041865,-0.104231,0.069529,1.0,-0.070105,0.065054
Is_HW,0.441425,-0.001942,0.240261,-0.018164,-0.070105,1.0,0.189516
SMP,0.249662,0.009321,0.163111,-0.009389,0.065054,0.189516,1.0


In [8]:
fig = go.Figure()
fig.add_traces(go.Heatmap(
    z=corr_df, 
    x=corr_df.columns, 
    y=corr_df.columns,
    zmax=1, 
    zmin=-1
))
fig.update_layout({
    'title':"Features Correlation Heatmap"
})


In [9]:
from plotly.subplots import make_subplots
rows = 1
cols = 1
c = ['blue', 'green', 'brown', 'red','pink','yellow']
fig = make_subplots(rows=rows, cols=cols, shared_yaxes=True, )
count=0
for i in range(rows):
  for j in range(cols):
    if count < 2:
      fig.add_trace(go.Scatter(
        y=y, 
        x=X.iloc[:,count],
        mode='markers',
        marker_color=c[count],
        marker_size=2.5,
        name=X.columns[count]
        ),row=i+1, col=j+1
      )
      fig.update_xaxes(title_text=X.columns[count], row=i+1, col=j+1)
    count+=1

fig.update_layout(height=600, width=600,
                  title_text="SMP by Temp")
fig.show()

In [10]:
# חלוקת הנתונים לסט אימון ולסט  מבחן
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
train_df = pd.merge(left=X_train, right=y_train, left_index=True, right_index=True)
test_df = pd.merge(left=X_test, right=y_test, left_index=True, right_index=True)
test_df.head(10)


Unnamed: 0,Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP
29440,0.339806,0,0,1,0,0,101.56
26386,0.311286,0,0,0,0,0,158.9
29001,0.273665,0,0,0,0,0,110.55
11833,0.394114,0,0,0,0,0,109.41
18029,0.651092,0,1,1,0,0,120.26
10497,0.243932,0,0,0,0,0,133.88
10982,0.304308,0,0,1,0,0,154.24
17570,0.475425,0,1,0,0,0,117.5
18202,0.678398,0,1,0,0,0,115.39
21176,0.649879,0,0,0,0,1,115.85


In [11]:
correlated = regression.columns.tolist() # מערך של עמודות הטבלה
correlated = correlated[:-1] # פיצרים ללא עמודת המטרה

# מודל רגרסיה לינארית רבת משתנים

In [12]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()  # הגדרת המודל
lm.fit(X_train,pd.DataFrame(y_train))              # אימון המודל
# מציאת המקדמים של הפיצרים
coeff = lm.coef_[0]
intercept = lm.intercept_[0]
print('Coefficients: \n', "coeff =", coeff , ",  Intercept=",intercept,"\n" )
print("The regression equation is:" ,"SMP = " ,intercept," + ",coeff[0]," * ",correlated[0]," + ",coeff[1]," * ",correlated[1]," + ",coeff[2]," * ",correlated[2], " + ",coeff[3]," * ",correlated[3]," + ",coeff[4]," * ",correlated[4]," + ",coeff[5]," * ",correlated[5])

# בעזרת הטטות ניתן לראות שפיצר הטמפרטורה הכי משפיע ומשמעותי

Coefficients: 
 coeff = [37.99628724  1.98340796  3.28929559 -0.61609614 25.1885799   7.6447824 ] ,  Intercept= 119.10933510841639 

The regression equation is: SMP =  119.10933510841639  +  37.996287242264295  *  Temp  +  1.983407958950991  *  Is_Holiday  +  3.2892955873802565  *  Is_Vac  +  -0.6160961403343066  *  Is_Weekend  +  25.18857989565859  *  Is_Cold  +  7.6447823999892535  *  Is_HW


In [13]:
fitted_SMP = lm.predict(X_train) # קבלת חיזויי האימון

predicted_train_SMP = round(pd.Series(fitted_SMP[:,0], index=y_train.index, name='Predicted_train_SMP'),ndigits=2)


In [14]:
train_df = pd.merge(left=train_df, right=predicted_train_SMP , left_index=True, right_index=True)
train_df.head(10)

Unnamed: 0,Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_train_SMP
24194,0.293083,0,0,0,0,0,113.66,130.25
1421,0.834345,0,1,0,0,1,198.68,161.75
10418,0.164745,0,0,0,0,0,139.48,125.37
22698,0.504551,0,0,1,0,0,158.0,137.66
15266,0.533677,0,0,0,0,0,117.81,139.39
9131,0.233617,0,0,0,0,0,153.83,127.99
16247,0.590716,0,0,0,0,0,118.41,141.55
8661,0.233313,1,0,1,0,0,138.95,129.34
10727,0.098301,0,0,0,1,0,187.61,148.03
11408,0.231189,0,0,0,0,0,186.03,127.89


In [15]:
fitted_SMP = lm.predict(X_test) # בדיקת סט המבחן
predicted_test_SMP = round(pd.Series(fitted_SMP[:,0], index=y_test.index, name='Predicted_test_SMP'),ndigits=2)
test_df = pd.merge(left=test_df, right=predicted_test_SMP, left_index=True, right_index=True)
test_df.head()

Unnamed: 0,Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_test_SMP
29440,0.339806,0,0,1,0,0,101.56,131.4
26386,0.311286,0,0,0,0,0,158.9,130.94
29001,0.273665,0,0,0,0,0,110.55,129.51
11833,0.394114,0,0,0,0,0,109.41,134.08
18029,0.651092,0,1,1,0,0,120.26,146.52


## הערכת המודל

In [16]:
train_df['residuals'] = train_df.Predicted_train_SMP - train_df.SMP
test_df['residuals'] = test_df.Predicted_test_SMP - test_df.SMP


In [17]:
#גרף הטעויות
fig= go.Figure()
fig.add_trace(
    go.Scatter(
        x=train_df.Predicted_train_SMP,
        y=train_df.residuals,
        mode='markers',
        name='train residuals',
        marker_color='blue',
        marker_size=1.5,
        marker_line_width=0,
    )
)
fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals,
      mode='markers',
      name='test residuals',
      marker_color='red',
      marker_size=1.5,
      marker_line_width=0,
  )
)

fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals*0,
      mode='lines',
      name='zero line',
      marker_color='black',
      marker_size=1.5,
      marker_line_width=0,

  )
)
fig.update_layout(
    title="Residuals of Predicted SMP",
    xaxis_title="Predicted SMP",
    yaxis_title="Residuals",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)
fig.show()

In [18]:
from sklearn import metrics

print("------ TRAIN DATA ------")
print("MSE:",metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP)))
print("MAE:",metrics.mean_absolute_error(train_df.SMP, train_df.Predicted_train_SMP))
print("Train STD:",train_df['SMP'].std())
# שגיאות ממוצעות של המודל
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TRAIN DATA ------
MSE: 813.8769861086562
RMSE: 28.52852933658965
MAE: 23.736631006093305
Train STD: 29.99146412782013


In [19]:
std = train_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט האימון יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
    # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [20]:
print("------ TEST DATA ------")
print("MSE:",metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP)))
print("MAE:",metrics.mean_absolute_error(test_df.SMP, test_df.Predicted_test_SMP))
print("Test STD:",test_df['SMP'].std())
# הערכים זהים בסט האימון ובסט המבחן מה שמראה על מודל טוב
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TEST DATA ------
MSE: 816.6370233828956
RMSE: 28.57686167833857
MAE: 23.895392880399392
Test STD: 30.021775985783947


In [21]:
std = test_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט המבחן יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
    # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [22]:
from sklearn.metrics import r2_score
r2_score(train_df.SMP, train_df.Predicted_train_SMP)

0.09513532555942061

In [23]:

r2_score(test_df.SMP, test_df.Predicted_test_SMP)


0.09384301596097488

#מודל קיץ

In [24]:

summer = table.loc[((table['Date'] >= '2019-04-01') & (table['Date'] < '2019-10-01')) | ((table['Date'] >= '2020-04-01') & (table['Date'] < '2020-10-01'))]
summer_index = summer['Date']



In [25]:
summer.drop(['Date', 'Hour'], axis=1, inplace=True)

scaler = MinMaxScaler()
scaler_TMP = MinMaxScaler()
scaled_Temp = pd.DataFrame(scaler_TMP.fit_transform(regression[['Temp']]))
regression_for_scale = regression.drop('Temp',axis=1)
scaled_df = pd.merge(left= scaled_Temp, right= summer, left_index=True, right_index=True)

scaled_df.rename(columns={0: "Scaled_Temp"},inplace = True)
scaled_df.drop('Temp',axis=1,inplace=True)

scaled_df.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Scaled_Temp,SMP,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW
0,0.606796,146.47,0,1,0,0,0
1,0.601638,171.22,0,1,0,0,0
2,0.597694,128.82,0,1,0,0,0
3,0.592233,149.85,0,1,0,0,0
4,0.590413,146.53,0,1,0,0,0


In [26]:
y = scaled_df.SMP
X = scaled_df.drop('SMP',axis=1)
X

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW
0,0.606796,0,1,0,0,0
1,0.601638,0,1,0,0,0
2,0.597694,0,1,0,0,0
3,0.592233,0,1,0,0,0
4,0.590413,0,1,0,0,0
...,...,...,...,...,...,...
21972,0.607706,0,0,0,0,1
21973,0.605886,0,0,0,0,1
21974,0.598604,0,0,0,0,1
21975,0.592233,0,0,0,0,1


In [27]:
# חלוקת הנתונים לסט אימון ולסט  מבחן
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
train_df = pd.merge(left=X_train, right=y_train, left_index=True, right_index=True)
test_df = pd.merge(left=X_test, right=y_test, left_index=True, right_index=True)
test_df.head(10)

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP
21504,0.583131,0,0,0,0,0,114.99
1575,0.622573,0,1,1,0,0,194.61
18070,0.729369,0,1,1,0,0,120.26
1057,0.589199,0,1,0,0,1,196.81
14974,0.381068,0,0,1,0,0,118.74
21738,0.567961,0,0,1,0,0,118.23
15302,0.603762,0,0,0,0,0,118.82
367,0.74818,0,1,0,0,0,199.78
19955,0.590716,0,1,0,0,0,166.37
15652,0.615595,0,0,1,0,0,149.44


In [28]:
correlated = scaled_df.columns.tolist() # מערך של עמודות הטבלה
correlated = correlated[:-1] # פיצרים ללא עמודת המטרה

## הרצת המודל

In [29]:
lm = LinearRegression()  # הגדרת מודל
lm.fit(X_train,pd.DataFrame(y_train))              # אימון מודל
# מציאת מקדמים
coeff = lm.coef_[0]
intercept = lm.intercept_[0]
print('Coefficients: \n', "coeff =", coeff , ",  Intercept=",intercept,"\n" )
print("The regression equation is:" ,"SMP = " ,intercept," + ",coeff[0]," * ",correlated[0]," + ",coeff[1]," * ",correlated[1]," + ",coeff[2]," * ",correlated[2], " + ",coeff[3]," * ",correlated[3]," + ",coeff[4]," * ",correlated[4]," + ",coeff[5]," * ",correlated[5])
# פיצר הטמפרטורה הכי משמעותי לפי המקדם

Coefficients: 
 coeff = [78.17930831  2.0726804   2.4979562   0.22425374  0.          6.2522566 ] ,  Intercept= 93.46216670248303 

The regression equation is: SMP =  93.46216670248303  +  78.17930830934417  *  Scaled_Temp  +  2.0726804028517187  *  SMP  +  2.4979562034138154  *  Is_Holiday  +  0.224253742131228  *  Is_Vac  +  0.0  *  Is_Weekend  +  6.252256596416721  *  Is_Cold


In [30]:
fitted_SMP = lm.predict(X_train) # בדיקת חיזוי האימון

predicted_train_SMP = round(pd.Series(fitted_SMP[:,0], index=y_train.index, name='Predicted_train_SMP'),ndigits=2)

In [31]:
train_df = pd.merge(left=train_df, right=predicted_train_SMP , left_index=True, right_index=True)
train_df.head(10)

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_train_SMP
2960,0.682646,0,1,1,0,1,212.7,155.81
18899,0.649879,0,1,0,0,1,163.44,153.02
3392,0.633192,0,0,0,0,0,200.43,142.96
14768,0.385922,0,0,0,0,0,122.62,123.63
17867,0.674757,0,1,0,0,0,119.52,148.71
17531,0.608617,0,1,0,0,0,119.53,143.54
4110,0.590413,0,0,0,0,0,138.89,139.62
15465,0.856189,0,0,0,0,1,151.31,166.65
17660,0.547027,0,1,1,0,0,122.99,138.95
915,0.550061,0,1,1,0,0,196.81,139.19


In [32]:
fitted_SMP = lm.predict(X_test) 
predicted_test_SMP = round(pd.Series(fitted_SMP[:,0], index=y_test.index, name='Predicted_test_SMP'),ndigits=2)
test_df = pd.merge(left=test_df, right=predicted_test_SMP, left_index=True, right_index=True)
test_df.head()

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_test_SMP
21504,0.583131,0,0,0,0,0,114.99,139.05
1575,0.622573,0,1,1,0,0,194.61,144.86
18070,0.729369,0,1,1,0,0,120.26,153.21
1057,0.589199,0,1,0,0,1,196.81,148.28
14974,0.381068,0,0,1,0,0,118.74,123.48


In [33]:
train_df['residuals'] = train_df.Predicted_train_SMP - train_df.SMP
test_df['residuals'] = test_df.Predicted_test_SMP - test_df.SMP


In [34]:
#גרף הטעויות
fig= go.Figure()
fig.add_trace(
    go.Scatter(
        x=train_df.Predicted_train_SMP,
        y=train_df.residuals,
        mode='markers',
        name='train residuals',
        marker_color='blue',
        marker_size=1.5,
        marker_line_width=0,
    )
)
fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals,
      mode='markers',
      name='test residuals',
      marker_color='red',
      marker_size=1.5,
      marker_line_width=0,
  )
)

fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals*0,
      mode='lines',
      name='zero line',
      marker_color='black',
      marker_size=1.5,
      marker_line_width=0,

  )
)
fig.update_layout(
    title="Residuals of Predicted SMP",
    xaxis_title="Predicted SMP",
    yaxis_title="Residuals",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)
fig.show()

## הערכת המודל

In [35]:
print("------ TRAIN DATA ------")
print("MSE:",metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP)))
print("MAE:",metrics.mean_absolute_error(train_df.SMP, train_df.Predicted_train_SMP))
print("Train STD:",train_df['SMP'].std())
# שגיאות ממוצעות של המודל
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TRAIN DATA ------
MSE: 890.6952969409854
RMSE: 29.84451870848289
MAE: 23.611907958852193
Train STD: 32.347550009026264


In [36]:
std = train_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט האימון יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
    # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [37]:
print("------ TEST DATA ------")
print("MSE:",metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP)))
print("MAE:",metrics.mean_absolute_error(test_df.SMP, test_df.Predicted_test_SMP))
print("Test STD:",test_df['SMP'].std())
# הערכים זהים בסט האימון ובסט המבחן מה שמראה על מודל טוב
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TEST DATA ------
MSE: 847.6177279055079
RMSE: 29.113875178435244
MAE: 22.979183931278424
Test STD: 31.477186841220988


In [38]:
std = test_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט המבחן יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
  # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [39]:

r2_score(train_df.SMP, train_df.Predicted_train_SMP)

0.14867888962756837

In [40]:
r2_score(test_df.SMP, test_df.Predicted_test_SMP)

0.14430706330685683

#מודל חורף


In [41]:
winter = table.loc[~table.Date.isin(summer_index)]

In [42]:
winter.drop(['Date', 'Hour'], axis=1, inplace=True)

scaler = MinMaxScaler()
scaler_TMP = MinMaxScaler()
scaled_Temp = pd.DataFrame(scaler_TMP.fit_transform(regression[['Temp']]))
scaled_SMP = pd.DataFrame(scaler.fit_transform(regression[['SMP']]))
regression_for_scale = regression.drop('Temp',axis=1)
scaled_df = pd.merge(left= scaled_Temp, right= winter, left_index=True, right_index=True)

scaled_df.rename(columns={0: "Scaled_Temp"},inplace = True)
scaled_df.drop('Temp',axis=1,inplace=True)
scaled_df.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Scaled_Temp,SMP,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW
4409,0.573726,153.0,1,0,0,0,0
4410,0.567658,146.47,1,0,0,0,0
4411,0.560983,171.22,1,0,0,0,0
4412,0.556129,128.82,1,0,0,0,0
4413,0.553095,149.85,1,0,0,0,0


In [43]:
y = scaled_df.SMP
X = scaled_df.drop('SMP',axis=1)
X

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW
4409,0.573726,1,0,0,0,0
4410,0.567658,1,0,0,0,0
4411,0.560983,1,0,0,0,0
4412,0.556129,1,0,0,0,0
4413,0.553095,1,0,0,0,0
...,...,...,...,...,...,...
30708,0.236347,0,0,0,0,0
30709,0.236650,0,0,0,0,0
30710,0.230279,0,0,0,0,0
30711,0.230279,0,0,0,0,0


In [44]:
# חלוקת הנתונים לסט אימון ולסט  מבחן
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
train_df = pd.merge(left=X_train, right=y_train, left_index=True, right_index=True)
test_df = pd.merge(left=X_test, right=y_test, left_index=True, right_index=True)
test_df.head(10)

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP
4829,0.594357,1,0,0,0,0,209.22
23808,0.393204,0,0,0,0,0,98.27
26836,0.277306,0,0,0,0,0,108.93
6489,0.522755,0,0,0,0,0,146.63
11794,0.221177,0,0,0,0,0,164.99
28011,0.469053,0,0,0,0,0,152.85
7415,0.403519,0,0,0,0,0,151.68
24421,0.31068,0,0,1,0,0,147.1
12769,0.129551,0,0,0,0,0,114.52
25625,0.285498,1,0,0,0,0,100.0


In [45]:
correlated = scaled_df.columns.tolist() # מערך של עמודות הטבלה
correlated = correlated[:-1] # פיצרים ללא עמודת המטרה

## הרצת המודל

In [46]:
lm = LinearRegression()  # הגדרת המודל
lm.fit(X_train,pd.DataFrame(y_train))              # אימון המודל
# בדיקת המקדמים
coeff = lm.coef_[0]
intercept = lm.intercept_[0]
print('Coefficients: \n', "coeff =", coeff , ",  Intercept=",intercept,"\n" )
print("The regression equation is:" ,"SMP = " ,intercept," + ",coeff[0]," * ",correlated[0]," + ",coeff[1]," * ",correlated[1]," + ",coeff[2]," * ",correlated[2], " + ",coeff[3]," * ",correlated[3]," + ",coeff[4]," * ",correlated[4]," + ",coeff[5]," * ",correlated[5])

Coefficients: 
 coeff = [ 2.92296675e+01  6.94275884e+00  3.55271368e-15 -1.69503381e+00
  2.19107403e+01 -1.10985958e+01] ,  Intercept= 123.61708516489165 

The regression equation is: SMP =  123.61708516489165  +  29.229667483165812  *  Scaled_Temp  +  6.94275884190002  *  SMP  +  3.552713678800501e-15  *  Is_Holiday  +  -1.6950338065162631  *  Is_Vac  +  21.91074027691975  *  Is_Weekend  +  -11.098595837960685  *  Is_Cold


In [47]:
fitted_SMP = lm.predict(X_train) # returns ndarray
predicted_train_SMP = round(pd.Series(fitted_SMP[:,0], index=y_train.index, name='Predicted_train_SMP'),ndigits=2)


In [48]:
train_df = pd.merge(left=train_df, right=predicted_train_SMP , left_index=True, right_index=True)
train_df.head(10)

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_train_SMP
29384,0.102245,0,0,0,1,0,107.76,148.52
5465,0.523968,0,0,0,0,0,196.81,138.93
6406,0.726335,0,0,0,0,0,135.99,144.85
30674,0.182646,0,0,0,0,0,95.0,128.96
25536,0.526092,1,0,0,0,0,111.27,145.94
7896,0.293386,0,0,0,0,0,138.6,132.19
22148,0.681432,1,0,0,0,0,118.33,150.48
28830,0.166566,0,0,1,1,0,152.76,148.7
9877,0.108313,0,0,0,1,0,165.01,148.69
27136,0.308556,0,0,1,0,0,106.04,130.94


In [49]:
fitted_SMP = lm.predict(X_test) # בדיקת חיזויי אימון
predicted_test_SMP = round(pd.Series(fitted_SMP[:,0], index=y_test.index, name='Predicted_test_SMP'),ndigits=2)
test_df = pd.merge(left=test_df, right=predicted_test_SMP, left_index=True, right_index=True)
test_df.head()

Unnamed: 0,Scaled_Temp,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_test_SMP
4829,0.594357,1,0,0,0,0,209.22,147.93
23808,0.393204,0,0,0,0,0,98.27,135.11
26836,0.277306,0,0,0,0,0,108.93,131.72
6489,0.522755,0,0,0,0,0,146.63,138.9
11794,0.221177,0,0,0,0,0,164.99,130.08


In [50]:
train_df['residuals'] = train_df.Predicted_train_SMP - train_df.SMP
test_df['residuals'] = test_df.Predicted_test_SMP - test_df.SMP


In [51]:
#plot the residuals graph
fig= go.Figure()
fig.add_trace(
    go.Scatter(
        x=train_df.Predicted_train_SMP,
        y=train_df.residuals,
        mode='markers',
        name='train residuals',
        marker_color='blue',
        marker_size=1.5,
        marker_line_width=0,
    )
)
fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals,
      mode='markers',
      name='test residuals',
      marker_color='red',
      marker_size=1.5,
      marker_line_width=0,
  )
)

fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals*0,
      mode='lines',
      name='zero line',
      marker_color='black',
      marker_size=1.5,
      marker_line_width=0,

  )
)
fig.update_layout(
    title="Residuals of Predicted SMP",
    xaxis_title="Predicted SMP",
    yaxis_title="Residuals",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)
fig.show()

## הערכת המודל

In [52]:
print("------ TRAIN DATA ------")
print("MSE:",metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP)))
print("MAE:",metrics.mean_absolute_error(train_df.SMP, train_df.Predicted_train_SMP))
print("Train STD:",train_df['SMP'].std())
# שגיאות ממוצעות של המודל
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TRAIN DATA ------
MSE: 741.1976170988257
RMSE: 27.22494475841642
MAE: 22.99727413568167
Train STD: 27.952716790234003


In [53]:
std = train_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט האימון יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
    # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [54]:
print("------ TEST DATA ------")
print("MSE:",metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP)))
print("MAE:",metrics.mean_absolute_error(test_df.SMP, test_df.Predicted_test_SMP))
print("Test STD:",test_df['SMP'].std())
# הערכים זהים בסט האימון ובסט המבחן מה שמראה על מודל טוב
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TEST DATA ------
MSE: 728.8162794330289
RMSE: 26.996597552895974
MAE: 23.055313926940638
Test STD: 27.565834927418027


In [55]:
std = test_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט המבחן יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')

RMSE < STD(y)


In [56]:
r2_score(train_df.SMP, train_df.Predicted_train_SMP)

0.05131642991256924

In [57]:
r2_score(test_df.SMP, test_df.Predicted_test_SMP)

0.040691265489292694

#מודל ללא טמפרטורה

In [58]:
regressionotemp = table.copy()
regressionotemp.drop(['Date', 'Hour','Temp'], axis=1, inplace=True)

In [59]:
y = regressionotemp.SMP
X = regressionotemp.drop('SMP',axis=1)
X

Unnamed: 0,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
30708,0,0,0,0,0
30709,0,0,0,0,0
30710,0,0,0,0,0
30711,0,0,0,0,0


In [60]:
# חלוקת הנתונים לסט אימון ולסט  מבחן
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
train_df = pd.merge(left=X_train, right=y_train, left_index=True, right_index=True)
test_df = pd.merge(left=X_test, right=y_test, left_index=True, right_index=True)
test_df.head(10)

Unnamed: 0,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP
29440,0,0,1,0,0,101.56
26386,0,0,0,0,0,158.9
29001,0,0,0,0,0,110.55
11833,0,0,0,0,0,109.41
18029,0,1,1,0,0,120.26
10497,0,0,0,0,0,133.88
10982,0,0,1,0,0,154.24
17570,0,1,0,0,0,117.5
18202,0,1,0,0,0,115.39
21176,0,0,0,0,1,115.85


In [61]:
correlated = regressionotemp.columns.tolist() # מערך של עמודות הטבלה
correlated = correlated[:-1] # פיצרים ללא עמודת המטרה

## הרצת המודל

In [62]:
lm = LinearRegression()  # הגדרת מודל
lm.fit(X_train,pd.DataFrame(y_train))              # אימון מודל
# בדיקת מקדמים
coeff = lm.coef_[0]
intercept = lm.intercept_[0]
print('Coefficients: \n', "coeff =", coeff , ",  Intercept=",intercept,"\n" )
print("The regression equation is:" ,"SMP = " ,intercept," + ",coeff[0]," * ",correlated[0]," + ",coeff[1]," * ",correlated[1]," + ",coeff[2]," * ",correlated[2], " + ",coeff[3]," * ",correlated[3]," + ",coeff[4]," * ",correlated[4])

Coefficients: 
 coeff = [ 3.59357453 10.80020825 -0.78324569 14.59071124 15.44210015] ,  Intercept= 134.5451791297932 

The regression equation is: SMP =  134.5451791297932  +  3.593574526080246  *  SMP  +  10.800208249911531  *  Is_Holiday  +  -0.7832456906961036  *  Is_Vac  +  14.590711240706295  *  Is_Weekend  +  15.442100150223226  *  Is_Cold


In [63]:
fitted_SMP = lm.predict(X_train) # returns ndarray
predicted_train_SMP = round(pd.Series(fitted_SMP[:,0], index=y_train.index, name='Predicted_train_SMP'),ndigits=2)


In [64]:
train_df = pd.merge(left=train_df, right=predicted_train_SMP , left_index=True, right_index=True)
train_df.head(10)

Unnamed: 0,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_train_SMP
24194,0,0,0,0,0,113.66,134.55
1421,0,1,0,0,1,198.68,160.79
10418,0,0,0,0,0,139.48,134.55
22698,0,0,1,0,0,158.0,133.76
15266,0,0,0,0,0,117.81,134.55
9131,0,0,0,0,0,153.83,134.55
16247,0,0,0,0,0,118.41,134.55
8661,1,0,1,0,0,138.95,137.36
10727,0,0,0,1,0,187.61,149.14
11408,0,0,0,0,0,186.03,134.55


In [65]:
fitted_SMP = lm.predict(X_test) # בדיקת חיזויי אימון
predicted_test_SMP = round(pd.Series(fitted_SMP[:,0], index=y_test.index, name='Predicted_test_SMP'),ndigits=2)
test_df = pd.merge(left=test_df, right=predicted_test_SMP, left_index=True, right_index=True)
test_df.head()

Unnamed: 0,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,Is_HW,SMP,Predicted_test_SMP
29440,0,0,1,0,0,101.56,133.76
26386,0,0,0,0,0,158.9,134.55
29001,0,0,0,0,0,110.55,134.55
11833,0,0,0,0,0,109.41,134.55
18029,0,1,1,0,0,120.26,144.56


In [66]:
train_df['residuals'] = train_df.Predicted_train_SMP - train_df.SMP
test_df['residuals'] = test_df.Predicted_test_SMP - test_df.SMP


In [67]:
#גרף טעויות
fig= go.Figure()
fig.add_trace(
    go.Scatter(
        x=train_df.Predicted_train_SMP,
        y=train_df.residuals,
        mode='markers',
        name='train residuals',
        marker_color='blue',
        marker_size=1.5,
        marker_line_width=0,
    )
)
fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals,
      mode='markers',
      name='test residuals',
      marker_color='red',
      marker_size=1.5,
      marker_line_width=0,
  )
)

fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals*0,
      mode='lines',
      name='zero line',
      marker_color='black',
      marker_size=1.5,
      marker_line_width=0,

  )
)
fig.update_layout(
    title="Residuals of Predicted SMP",
    xaxis_title="Predicted SMP",
    yaxis_title="Residuals",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)
fig.show()

## הערכת המודל

In [68]:
print("------ TRAIN DATA ------")
print("MSE:",metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP)))
print("MAE:",metrics.mean_absolute_error(train_df.SMP, train_df.Predicted_train_SMP))
print("Train STD:",train_df['SMP'].std())
# שגיאות ממוצעות של המודל
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TRAIN DATA ------
MSE: 844.8987894948602
RMSE: 29.06714278175377
MAE: 24.09261640076283
Train STD: 29.99146412782013


In [69]:
std = train_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט האימון יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
    # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [70]:
print("------ TEST DATA ------")
print("MSE:",metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP)))
print("MAE:",metrics.mean_absolute_error(test_df.SMP, test_df.Predicted_test_SMP))
print("Test STD:",test_df['SMP'].std())
# הערכים זהים בסט האימון ובסט המבחן מה שמראה על מודל טוב
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TEST DATA ------
MSE: 848.1625291675712
RMSE: 29.123230060684737
MAE: 24.227032776210116
Test STD: 30.021775985783947


In [71]:
std = test_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט המבחן יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
  # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [72]:
r2_score(train_df.SMP, train_df.Predicted_train_SMP)

0.06064542782213567

In [73]:
r2_score(test_df.SMP, test_df.Predicted_test_SMP)

0.05886167611942783

#מודל ללא פיצר גל חום

In [74]:
regressionohw = table.copy()
regressionohw.drop(['Date', 'Hour','Is_HW'], axis=1, inplace=True)

In [75]:

scaler = MinMaxScaler()
scaler_TMP = MinMaxScaler()
scaled_Temp = pd.DataFrame(scaler_TMP.fit_transform(regression[['Temp']]))
regression_for_scale = regressionohw.drop('Temp',axis=1)
scaled_df = pd.merge(left= scaled_Temp, right=regressionohw, left_index=True, right_index=True)

scaled_df.rename(columns={ 0: "Temperature"},inplace = True)
scaled_df.drop('Temp',axis=1,inplace=True)

scaled_df.head()

Unnamed: 0,Temperature,SMP,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold
0,0.606796,146.47,0,1,0,0
1,0.601638,171.22,0,1,0,0
2,0.597694,128.82,0,1,0,0
3,0.592233,149.85,0,1,0,0
4,0.590413,146.53,0,1,0,0


In [76]:
y = scaled_df.SMP
X = scaled_df.drop('SMP',axis=1)
X

Unnamed: 0,Temperature,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold
0,0.606796,0,1,0,0
1,0.601638,0,1,0,0
2,0.597694,0,1,0,0
3,0.592233,0,1,0,0
4,0.590413,0,1,0,0
...,...,...,...,...,...
30708,0.236347,0,0,0,0
30709,0.236650,0,0,0,0
30710,0.230279,0,0,0,0
30711,0.230279,0,0,0,0


In [77]:
# חלוקת הנתונים לסט אימון ולסט  מבחן
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
train_df = pd.merge(left=X_train, right=y_train, left_index=True, right_index=True)
test_df = pd.merge(left=X_test, right=y_test, left_index=True, right_index=True)
test_df.head(10)

Unnamed: 0,Temperature,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,SMP
29440,0.339806,0,0,1,0,101.56
26386,0.311286,0,0,0,0,158.9
29001,0.273665,0,0,0,0,110.55
11833,0.394114,0,0,0,0,109.41
18029,0.651092,0,1,1,0,120.26
10497,0.243932,0,0,0,0,133.88
10982,0.304308,0,0,1,0,154.24
17570,0.475425,0,1,0,0,117.5
18202,0.678398,0,1,0,0,115.39
21176,0.649879,0,0,0,0,115.85


In [78]:
correlated = regressionohw.columns.tolist() # מערך של עמודות הטבלה
correlated = correlated[:-1] # פיצרים ללא עמודת המטרה

## הרצת המודל

In [79]:
lm = LinearRegression()  # הגדרת מודל
lm.fit(X_train,pd.DataFrame(y_train))              # אימון מודל
# בדיקת מקדמים
coeff = lm.coef_[0]
intercept = lm.intercept_[0]
print('Coefficients: \n', "coeff =", coeff , ",  Intercept=",intercept,"\n" )
print("The regression equation is:" ,"SMP = " ,intercept," + ",coeff[0]," * ",correlated[0]," + ",coeff[1]," * ",correlated[1]," + ",coeff[2]," * ",correlated[2], " + ",coeff[3]," * ",correlated[3]," + ",coeff[4]," * ",correlated[4])

Coefficients: 
 coeff = [43.58502986  1.97524616  3.31921958 -0.669463   26.30353209] ,  Intercept= 117.30983455656619 

The regression equation is: SMP =  117.30983455656619  +  43.585029859185994  *  Temp  +  1.9752461595560518  *  SMP  +  3.3192195847146833  *  Is_Holiday  +  -0.6694629962285973  *  Is_Vac  +  26.303532094848705  *  Is_Weekend


In [80]:
fitted_SMP = lm.predict(X_train) # בדיקת חיזויי אימון
predicted_train_SMP = round(pd.Series(fitted_SMP[:,0], index=y_train.index, name='Predicted_train_SMP'),ndigits=2)


In [81]:
train_df = pd.merge(left=train_df, right=predicted_train_SMP , left_index=True, right_index=True)
train_df.head(10)

Unnamed: 0,Temperature,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,SMP,Predicted_train_SMP
24194,0.293083,0,0,0,0,113.66,130.08
1421,0.834345,0,1,0,0,198.68,156.99
10418,0.164745,0,0,0,0,139.48,124.49
22698,0.504551,0,0,1,0,158.0,138.63
15266,0.533677,0,0,0,0,117.81,140.57
9131,0.233617,0,0,0,0,153.83,127.49
16247,0.590716,0,0,0,0,118.41,143.06
8661,0.233313,1,0,1,0,138.95,128.78
10727,0.098301,0,0,0,1,187.61,147.9
11408,0.231189,0,0,0,0,186.03,127.39


In [82]:
fitted_SMP = lm.predict(X_test) # בדיקת סט מבחן
predicted_test_SMP = round(pd.Series(fitted_SMP[:,0], index=y_test.index, name='Predicted_test_SMP'),ndigits=2)
test_df = pd.merge(left=test_df, right=predicted_test_SMP, left_index=True, right_index=True)
test_df.head()

Unnamed: 0,Temperature,Is_Holiday,Is_Vac,Is_Weekend,Is_Cold,SMP,Predicted_test_SMP
29440,0.339806,0,0,1,0,101.56,131.45
26386,0.311286,0,0,0,0,158.9,130.88
29001,0.273665,0,0,0,0,110.55,129.24
11833,0.394114,0,0,0,0,109.41,134.49
18029,0.651092,0,1,1,0,120.26,148.34


In [83]:
train_df['residuals'] = train_df.Predicted_train_SMP - train_df.SMP
test_df['residuals'] = test_df.Predicted_test_SMP - test_df.SMP


In [84]:
#גרף טעויות
fig= go.Figure()
fig.add_trace(
    go.Scatter(
        x=train_df.Predicted_train_SMP,
        y=train_df.residuals,
        mode='markers',
        name='train residuals',
        marker_color='blue',
        marker_size=1.5,
        marker_line_width=0,
    )
)
fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals,
      mode='markers',
      name='test residuals',
      marker_color='red',
      marker_size=1.5,
      marker_line_width=0,
  )
)

fig.add_trace(
  go.Scatter(
      x=test_df.Predicted_test_SMP,
      y=test_df.residuals*0,
      mode='lines',
      name='zero line',
      marker_color='black',
      marker_size=1.5,
      marker_line_width=0,

  )
)
fig.update_layout(
    title="Residuals of Predicted SMP",
    xaxis_title="Predicted SMP",
    yaxis_title="Residuals",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)
fig.show()

## הערכת המודל

In [85]:
print("------ TRAIN DATA ------")
print("MSE:",metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP)))
print("MAE:",metrics.mean_absolute_error(train_df.SMP, train_df.Predicted_train_SMP))
print("Train STD:",train_df['SMP'].std())
# שגיאות ממוצעות של המודל
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TRAIN DATA ------
MSE: 818.3525678938555
RMSE: 28.606862251806916
MAE: 23.92861109819061
Train STD: 29.99146412782013


In [86]:
std = train_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט האימון יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(train_df.SMP, train_df.Predicted_train_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
  # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [87]:
print("------ TEST DATA ------")
print("MSE:",metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
print("RMSE:",np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP)))
print("MAE:",metrics.mean_absolute_error(test_df.SMP, test_df.Predicted_test_SMP))
print("Test STD:",test_df['SMP'].std())
# הערכים זהים בסט האימון ובסט המבחן מה שמראה על מודל טוב
# MAE - נשתמש רק ב
# כי המודל שלנו מכיל ערכי קיצון

------ TEST DATA ------
MSE: 821.9660322498372
RMSE: 28.669949986873664
MAE: 24.0864369437812
Test STD: 30.021775985783947


In [88]:
std = test_df['SMP'].std() # פונקציה שבודקת האם הסטייה של סט המבחן יותר גדולה או קטנה מסטיית הטקן של פונקציית המטרה 
rmse = np.sqrt(metrics.mean_squared_error(test_df.SMP, test_df.Predicted_test_SMP))
if rmse < std: 
  print('RMSE < STD(y)')
else:
  print('RMSE > STD(y)')
  # מאחר ו RMSE
  # קטן מסטיית התקן אנו יודעים שהמודל הינו טוב

RMSE < STD(y)


In [89]:
r2_score(train_df.SMP, train_df.Predicted_train_SMP)

0.09015939440014353

In [90]:
r2_score(test_df.SMP, test_df.Predicted_test_SMP)

0.08792983977067503