<a href="https://colab.research.google.com/github/lisabroadhead/data_science_machine-learning/blob/main/reading_june_27.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Regression in Python

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import set_config
# Render estimators/pipelines as diagrams when they are displayed in a cell.
set_config(display='diagram')

In [2]:
# Location of the California housing CSV.
# NOTE(review): this is an absolute path under /content/drive, so it presumably
# requires Google Drive to be mounted first; no visible cell does that — confirm.
file = '/content/drive/MyDrive/Colab Notebooks/coding_dojo/Machine Learning/files/cali_housing.csv'

# Read the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [5]:
# Separate the feature matrix from the target column 'MedHouseVal'.
X, y = df.drop(columns='MedHouseVal'), df['MedHouseVal']

# Hold out a test set. No test_size is passed, so scikit-learn's default
# proportion applies; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [6]:
# Scaling step that will be placed in front of the regressor in the pipeline.
scaler = StandardScaler()

In [8]:
# Ordinary linear regression estimator used as the final pipeline step.
reg = LinearRegression()

In [9]:
# Chain the steps so the scaler is always applied before the regressor,
# both when fitting and when predicting.
reg_pipeline = make_pipeline(
    scaler,
    reg,
)

In [10]:
# Fit the whole pipeline (scaler, then regressor) on the training split only.
reg_pipeline.fit(X_train, y_train)

In [24]:
# Predict the target for the held-out test features with the fitted pipeline.
# NOTE(review): "perdictions" is a misspelling of "predictions"; the name is
# left unchanged here because the following cell reads it.
perdictions = reg_pipeline.predict(X_test)

In [26]:
# Build a side-by-side view of the test features, the true target, the model's
# prediction, and the signed error (prediction minus truth). `assign` returns a
# new frame, so X_test itself is left untouched. The column names contain
# spaces, hence the dict unpacking.
prediction_df = X_test.assign(
    **{
        'True Median Price': y_test,
        'Predicted Median Price': perdictions,
        'Error': perdictions - y_test,
    }
)
prediction_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,True Median Price,Predicted Median Price,Error
20046,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01,0.477,0.724128,0.247128
3024,2.5313,30.0,5.039384,1.193493,1565.0,2.679795,35.14,-119.46,0.458,1.766778,1.308778
15663,3.4801,52.0,3.977155,1.185877,1310.0,1.360332,37.8,-122.44,5.00001,2.711516,-2.288494
20484,5.7376,17.0,6.163636,1.020202,1705.0,3.444444,34.28,-118.72,2.186,2.836012,0.650012
9814,3.725,34.0,5.492991,1.028037,1063.0,2.483645,36.62,-121.93,2.78,2.603755,-0.176245


# Regression Metrics

In [46]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [50]:
# Load the California housing dataset bundled with scikit-learn as a DataFrame.
california = fetch_california_housing(as_frame=True)
df = california.frame

# Rescale the target by a factor of 100,000.
# NOTE(review): presumably this converts the target from units of $100,000
# to dollars — confirm against the dataset documentation.
df['MedHouseVal'] = df['MedHouseVal'].mul(100_000)

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,452600.0
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,358500.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,352100.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,341300.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,342200.0


In [52]:
# Count fully duplicated rows.
df.duplicated().sum()

0

In [53]:
# Inspect column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [54]:
# Count missing values per column.
df.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [57]:
# Separate the feature matrix from the target column 'MedHouseVal'.
X, y = df.drop(columns='MedHouseVal'), df['MedHouseVal']

# Hold out a test set. No test_size is passed, so scikit-learn's default
# proportion applies; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [60]:
# Pipeline steps: scale the features, then fit a linear regression.
scaler, reg = StandardScaler(), LinearRegression()

# `fit` returns the pipeline itself, so construction and fitting can be chained.
# Only the training split is used for fitting.
lin_reg_pipe = make_pipeline(scaler, reg).fit(X_train, y_train)

# Display the fitted pipeline as the cell output.
lin_reg_pipe

In [80]:
# Predictions on both splits, so every metric can be compared train vs. test.
train_pred, test_pred = (
    lin_reg_pipe.predict(features) for features in (X_train, X_test)
)

### Mean Absolute Error (MAE)

In [71]:
# Manual MAE with NumPy (left commented out; the scikit-learn call is used instead):
# train_mae = np.mean(np.abs(train_pred - y_train))
# test_mae = np.mean(np.abs(test_pred - y_test))

# print(train_mae)
# print(test_mae)

In [88]:
# scikit-learn
# Mean absolute error on each split. Arguments follow scikit-learn's
# (y_true, y_pred) order for both calls. MAE is symmetric, so the printed
# values do not depend on the order, but keeping the documented order keeps
# these calls consistent with the other metrics.
train_mae = mean_absolute_error(y_train, train_pred)
test_mae = mean_absolute_error(y_test, test_pred)
print(train_mae)
print(test_mae)

52951.54304301853
52969.64012919461


### MSE

In [92]:
# Mean squared error on each split. Arguments follow scikit-learn's
# (y_true, y_pred) order for both calls. MSE is symmetric, so the printed
# values do not depend on the order, but keeping the documented order keeps
# these calls consistent with the other metrics.
train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

print(train_mse)
print(test_mse)

5205522163.645129
5411287478.470688


### RMSE

In [98]:
def _rmse(predicted, actual):
    """Return the root of the mean squared difference between two sequences."""
    # The absolute value has no effect before squaring; it is kept so the
    # computation stays exactly as originally written.
    return np.sqrt(np.mean(np.abs(predicted - actual) ** 2))


# NOTE(review): "rsme" is a transposition of "RMSE"; the variable names are
# kept as-is in case other cells reference them.
train_rsme = _rmse(train_pred, y_train)
test_rsme = _rmse(test_pred, y_test)

print(train_rsme, test_rsme, sep='\n')

72149.30466501482
73561.45375446769


### R² (Coefficient of Determination)

In [100]:
# R^2 on each split, passing (y_true, y_pred) in that order. Argument order
# matters for this metric, unlike the error metrics.
train_r2, test_r2 = (
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)

print(train_r2, test_r2, sep='\n')

0.609873031052925
0.5910509795491351
