In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
# Use plotly as the backend for pandas' own .plot API.
pd.set_option("plotting.backend", "plotly")

# Load the fish measurements and keep only the rows whose species is Perch.
df = pd.read_csv("data/Fish.csv")
df_perch = df.loc[df["Species"] == "Perch"]

# Scatter of perch Height (x axis) against Weight (y axis).
fig = px.scatter(data_frame=df_perch, x="Height", y="Weight")


In [3]:
# Render the perch Height-vs-Weight scatter.
fig.show()

In [4]:
# Average perch weight, used below as the baseline (mean-only) prediction.
mean_val = df_perch["Weight"].mean()

# Draw that average as a horizontal reference line on the scatter.
fig = fig.add_hline(y=mean_val, row=None, col=None)

In [5]:
# Re-render the figure, which now also carries the horizontal line at the mean weight.
fig.show()

In [6]:
# Connect every observation to the mean line with a dashed red vertical
# segment, i.e. draw each point's deviation from the mean weight.
for obs_height, obs_weight in zip(df_perch["Height"], df_perch["Weight"]):
    fig.add_shape(
        type="line",
        x0=obs_height,
        y0=obs_weight,
        x1=obs_height,
        y1=mean_val,
        line={"width": 1, "dash": "dash", "color": "red"},
    )
    

In [7]:
# Re-render with the dashed red segments joining each point to the mean line.
fig.show()

In [8]:
# Squared deviation of each perch weight from the mean weight; the cell's
# displayed value is their total.
deviation_from_mean = df_perch["Weight"] - mean_val
residuals_squared = deviation_from_mean ** 2
residuals_squared.sum()

6646094.253571428

In [1]:
def LS(x, y):
    """Fit the line ``y = alpha + beta * x`` by ordinary least squares.

    Uses the closed-form estimates::

        beta_hat  = sum((x - x_bar) * (y - y_bar)) / sum((x - x_bar) ** 2)
        alpha_hat = y_bar - beta_hat * x_bar

    Parameters
    ----------
    x : array-like
        Predictor values.
    y : array-like
        Response values; must have the same shape as ``x``.

    Returns
    -------
    dict
        ``{'alpha_hat': intercept, 'beta_hat': slope}``.

    Raises
    ------
    ValueError
        If either input is empty, if the shapes differ, or if every value
        of ``x`` is identical (including a single observation), because the
        slope is then undefined.
    """
    # The array views are used for validation only; the arithmetic below
    # still runs on the inputs exactly as they were passed in.
    x_values = np.asarray(x)
    y_values = np.asarray(y)

    if x_values.size == 0 or y_values.size == 0:
        raise ValueError("LS needs at least one (x, y) observation")
    if x_values.shape != y_values.shape:
        # Without this check numpy broadcasting could silently pair every
        # x with every y and return a meaningless fit.
        raise ValueError(
            "x and y must have the same shape, got "
            + str(x_values.shape) + " and " + str(y_values.shape)
        )
    if np.all(x_values == x_values.flat[0]):
        # Compare the raw values rather than testing the denominator for
        # zero: rounding in the mean can leave a tiny non-zero denominator
        # and produce a garbage slope instead of an obvious nan.
        raise ValueError("x has no spread; the least-squares slope is undefined")

    x_bar = np.average(x)
    y_bar = np.average(y)

    # Slope: co-variation of x and y divided by the variation of x.
    numerator = np.sum((x - x_bar) * (y - y_bar))
    denominator = np.sum((x - x_bar) ** 2)
    beta_hat = numerator / denominator

    # Intercept: forces the fitted line through the point (x_bar, y_bar).
    alpha_hat = y_bar - beta_hat * x_bar

    best_model = {'alpha_hat': alpha_hat, 'beta_hat': beta_hat}

    return best_model

In [10]:
# Predictor (Height) and response (Weight) columns of the perch subset.
x, y = df_perch["Height"], df_perch["Weight"]

# Closed-form least-squares estimates of the intercept and slope.
best_model = LS(x=x, y=y)

In [11]:
# Inspect the fitted intercept (alpha_hat) and slope (beta_hat).
best_model

{'alpha_hat': -537.3275192931233, 'beta_hat': 116.96540985551397}

In [12]:
# Fitted weights: the estimated line evaluated at every observed height.
intercept, slope = best_model['alpha_hat'], best_model['beta_hat']
y_hat = intercept + slope * x

# Overlay the fitted values on the existing figure as a green trace.
fig.add_scatter(x=x, y=y_hat, name='LS Fit', line={'color': 'green'})

In [13]:
# Dashed red vertical segment from each observed weight to its fitted value
# on the least-squares line.
for obs_height, obs_weight, fitted_weight in zip(
    df_perch["Height"], df_perch["Weight"], y_hat
):
    fig.add_shape(
        type="line",
        x0=obs_height,
        y0=obs_weight,
        x1=obs_height,
        y1=fitted_weight,
        line={"width": 1, "dash": "dash", "color": "red"},
    )

fig.show()

In [14]:
# Squared deviation of each observed weight from its fitted value.
fit_error = df_perch["Weight"] - y_hat
ls_fit_residuals = fit_error ** 2

# First line printed: the sum of those squared errors.
# Second line printed: that sum divided by the number of observations.
print(ls_fit_residuals.sum(), ls_fit_residuals.sum() / len(x), sep="\n")

412872.84783624834
7372.7294256472915


In [15]:
from sklearn.metrics import mean_squared_error
# Cross-check the hand-computed mean squared error against scikit-learn.
mean_squared_error(y_true=y, y_pred=y_hat)

7372.7294256472915

In [16]:
# R-squared by hand: one minus the ratio of the summed squared errors around
# the LS fit to the summed squared errors around the mean weight.
1 - (ls_fit_residuals.sum() / residuals_squared.sum())

0.9378773709665068

In [17]:
from sklearn.metrics import r2_score
# Cross-check the hand-computed R-squared against scikit-learn.
r2_score(y_true=y, y_pred=y_hat)

0.9378773709665068

In [37]:
import plotly.graph_objects as go
# Residual plot: observed minus fitted weight (y axis) against the fitted
# weight (x axis), drawn as markers only.
residual_trace = go.Scatter(x=y_hat, y=y - y_hat, mode='markers')
res_plot = go.Figure(data=[residual_trace])

res_plot.show()

In [30]:
# F statistic comparing the fitted line against the mean-only baseline.
RSS = ls_fit_residuals.sum()   # squared error remaining around the LS fit
TSS = residuals_squared.sum()  # squared error around the mean weight
p, n = 2, len(x)               # p: intercept + slope; n: number of observations

explained_mean_square = (TSS - RSS) / (p - 1)
residual_mean_square = RSS / (n - p)
F_statistic = explained_mean_square / residual_mean_square
print(F_statistic)

815.2484661408347


In [20]:
from sklearn.linear_model import LinearRegression

# Same regression through scikit-learn; the heights are reshaped into a
# single-column 2-D array before fitting.
height_column = x.values.reshape(-1, 1)
model = LinearRegression()
model = model.fit(height_column, y)

intercept_text = "Intercept: " + str(model.intercept_)
coefficient_text = "Coefficient: " + str(model.coef_[0])
print(intercept_text, coefficient_text)


Intercept: -537.3275192931234 Coefficient: 116.96540985551398


In [26]:
import statsmodels.api as sm
# Build the statsmodels design matrix (constant column plus Height) under
# its own name. Rebinding `x` here would replace the Height Series with a
# 2-D array, so this cell could not be run twice and every earlier cell
# that uses `x` would misbehave when re-executed.
x_design = sm.add_constant(x.values.reshape(-1, 1))

# Ordinary least squares of Weight on [const, Height].
model_sm = sm.OLS(y, x_design).fit()

In [45]:
# p-value attached to the F statistic of the fitted statsmodels OLS model.
model_sm.f_pvalue

2.9167064949774088e-34

In [27]:
# Full statsmodels regression report: coefficients, standard errors and fit statistics.
model_sm.summary()

0,1,2,3
Dep. Variable:,Weight,R-squared:,0.938
Model:,OLS,Adj. R-squared:,0.937
Method:,Least Squares,F-statistic:,815.2
Date:,"Fri, 18 Mar 2022",Prob (F-statistic):,2.92e-34
Time:,14:21:40,Log-Likelihood:,-328.82
No. Observations:,56,AIC:,661.6
Df Residuals:,54,BIC:,665.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-537.3275,34.260,-15.684,0.000,-606.015,-468.640
x1,116.9654,4.096,28.553,0.000,108.752,125.178

0,1,2,3
Omnibus:,11.275,Durbin-Watson:,0.678
Prob(Omnibus):,0.004,Jarque-Bera (JB):,11.319
Skew:,0.954,Prob(JB):,0.00349
Kurtosis:,4.099,Cond. No.,24.8
