# Example Linear Regression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
p_df = sns.load_dataset('penguins')

In [3]:
p_df = p_df.dropna()

In [4]:
p_df = p_df.reset_index()

In [5]:
p_df.head()

Unnamed: 0,index,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
4,5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


### We want to predict the weight of the penguins based on their flipper_length!

## X-y split

In [49]:
X = p_df['flipper_length_mm']
y = p_df['body_mass_g']

## Train-Test split

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [52]:
X_train

22     187.0
284    221.0
294    212.0
56     185.0
175    205.0
       ...  
188    196.0
71     184.0
106    193.0
270    220.0
102    181.0
Name: flipper_length_mm, Length: 233, dtype: float64

In [16]:
X_test

25     178.0
309    222.0
73     195.0
195    198.0
57     192.0
       ...  
15     174.0
6      195.0
209    207.0
93     192.0
30     196.0
Name: flipper_length_mm, Length: 100, dtype: float64

In [18]:
y_train

array([[3200.],
       [5100.],
       [4725.],
       [3600.],
       [4550.],
       [3700.],
       [5550.],
       [5850.],
       [3500.],
       [3750.],
       [3000.],
       [4625.],
       [3825.],
       [3700.],
       [4300.],
       [3700.],
       [3325.],
       [4200.],
       [3550.],
       [3400.],
       [3950.],
       [3325.],
       [4900.],
       [3650.],
       [3800.],
       [4925.],
       [2900.],
       [3550.],
       [4250.],
       [3650.],
       [3075.],
       [4250.],
       [3600.],
       [3700.],
       [3150.],
       [3100.],
       [6050.],
       [4150.],
       [5000.],
       [3700.],
       [3500.],
       [5200.],
       [3400.],
       [4750.],
       [3250.],
       [5000.],
       [3650.],
       [5950.],
       [5100.],
       [4400.],
       [3775.],
       [4550.],
       [3900.],
       [4400.],
       [5050.],
       [4050.],
       [4400.],
       [3350.],
       [3250.],
       [3250.],
       [3875.],
       [5500.],
       [

In [19]:
y_test

25     3250.0
309    4875.0
73     4000.0
195    3675.0
57     4050.0
        ...  
15     3400.0
6      4675.0
209    4000.0
93     4100.0
30     4150.0
Name: body_mass_g, Length: 100, dtype: float64

## Model Generation

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
X_train = np.array(X_train).reshape(-1,1)
y_train = np.array(y_train).reshape(-1,1)

In [13]:
lm = LinearRegression()
model = lm.fit(X_train,y_train)

In [20]:
print(model.coef_)
print(model.intercept_)

[[51.32743793]]
[-6118.66754338]


In [23]:
X_test = np.array(X_test).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)

In [24]:
y_pred = model.predict(X_test)

In [25]:
y_test - y_pred

array([[ 232.38359244],
       [-401.02367633],
       [ 109.81714769],
       [-369.16516609],
       [ 313.79946147],
       [ -12.74929707],
       [ 222.64888574],
       [-290.18285231],
       [ -12.52797646],
       [-248.14747987],
       [ 758.48970976],
       [ 355.83483391],
       [  17.56045464],
       [-287.52797646],
       [-260.09442121],
       [  23.09152695],
       [-329.5633489 ],
       [-236.20053853],
       [ 320.4366511 ],
       [-465.40417292],
       [ 133.48970976],
       [  91.45433732],
       [ -83.98830389],
       [-250.80235573],
       [ 453.40127866],
       [-630.89078683],
       [ 285.92326501],
       [-164.07673499],
       [ 528.40127866],
       [ 314.68425818],
       [-579.5633489 ],
       [-136.64317975],
       [ 405.83483391],
       [-699.4749178 ],
       [  13.35682025],
       [  -8.76698329],
       [ -40.18285231],
       [-607.43954536],
       [-183.54566268],
       [-378.23591097],
       [ 128.40127866],
       [-183.545

In [26]:
compare = pd.DataFrame({'y_test':list(y_test), 'y_pred':list(y_pred)})

In [27]:
compare

Unnamed: 0,y_test,y_pred
0,[3250.0],[3017.616407560008]
1,[4875.0],[5276.023676332345]
2,[4000.0],[3890.182852312955]
3,[3675.0],[4044.1651660928874]
4,[4050.0],[3736.200538533025]
...,...,...
95,[3400.0],[2812.306655853432]
96,[4675.0],[3890.182852312955]
97,[4000.0],[4506.112107432684]
98,[4100.0],[3736.200538533025]


#### Model evaluation

In [53]:
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse

In [54]:
mse(y_test,y_pred)

138931.09920252106

In [55]:
mae(y_test,y_pred)

299.39047755852704

In [56]:
import matplotlib.pyplot as plt