## Linear Regression

In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib as plt

# Importing Linear Reg model from Scikit-Learn
from sklearn.linear_model import LinearRegression

In [4]:
# Importing dataset
data = pd.read_csv(r'C:\Users\abhirav\Downloads\california_housing_train.csv')
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [9]:
# Checking shape, info & Missing/Nan Values
print('Shape of the dataset: ',data.shape)
print('----------------------------------------------------')
print(data.info())

Shape of the dataset:  (17000, 9)
----------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB
None


**There is no missing data in our Dataset**

In [10]:
# Creating x and y
# Here house value is our dependent variable and rest are independent.
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']

X.shape, y.shape

((17000, 8), (17000,))

In [12]:
# Splitting data into Training and Test data
# Training data 80% of the total -> 17000*0.8 = 13600
# Test data 20% -> 17000*0.2
X_train = X[:13600]
y_train = y[:13600]
X_test = X[13600:]
y_test = y[13600:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13600, 8), (3400, 8), (13600,), (3400,))

In [13]:
# set the random seed
np.random.seed(42)

# Instantiate our model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_preds = model.predict(X_test)

In [15]:
np.random.seed(42)

# Evaluate the model using score method
model.score(X_test, y_test)

0.6557619923906499

In [16]:
# Evaluate the model using sklearn.metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print('Coefficient of Determination R^2: ', r2_score(y_test, y_preds))
print('Mean absolute error MAE         : ', mean_absolute_error(y_test, y_preds))
print('Mean squared error MSE          : ', mean_squared_error(y_test, y_preds))

Coefficient of Determination R^2:  0.6557619923906499
Mean absolute error MAE         :  56315.258821328796
Mean squared error MSE          :  5608835328.58844


**Since R2_score is low and MAE/MSE being on higher side we need further optimization**