In [None]:
# importing libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

## **Task 01**

In [None]:
# Download dataset
!gdown --id 1bTtIDHMP6a5dQ3a2Aw9AAR8Flprjb3Ei

In [None]:
# Reading data
# NOTE(review): the absolute '/content' path presumably targets the Google Colab working
# directory where the download cell saves the file — confirm before running elsewhere.
data = pd.read_csv('/content/data.csv')

In [None]:
# Printing total number of samples, column names, null values in each column and data types of the columns
# Zeros are treated as missing values so that `info()` reports them as nulls.
# `np.nan` is used directly: the `pd.np` alias was deprecated in pandas 1.0 and removed in pandas 2.0.
data.replace(0, np.nan, inplace = True)
data.info()

In [None]:
# Remove the columns judged unusable (too many null values or a non-numeric data type)
columns_to_drop = ['date', 'waterfront', 'view', 'sqft_basement', 'yr_renovated', 'street', 'city', 'statezip', 'country']
data = data.drop(columns=columns_to_drop)
# Discard every remaining sample that still contains a null value
data = data.dropna()
data.info()

## **Task 02**
**Feature columns**:

bedrooms, bathrooms, sqft_living, sqft_lot, floors, condition, sqft_above, yr_built

**Value to be predicted**:

price

## **Task 03**

In [None]:
# Raw feature matrix (every column except the target) and the target vector
X = data.drop(['price'], axis=1).values
Y = data['price'].values  # output column

# Split BEFORE scaling so the test rows never influence the scaling statistics
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state = 101)

# Min-max scale with the training set's min/max only, then apply that same transform to the test set
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepend a column of 1s so the first parameter acts as the intercept term
X_train = np.insert(X_train, 0, 1.0, axis = 1)
X_test = np.insert(X_test, 0, 1.0, axis = 1)

## **Task 04**

In [None]:
def gradient_descent(X, Y, lr=0.01, epoch=12, tol=0.5):
  """Fit linear-regression parameters with batch gradient descent.

  Args:
    X: 2-D array of shape (samples, features). Include a column of 1s if an
      intercept term is wanted.
    Y: array holding one target value per sample.
    lr: step size applied to the averaged gradient.
    epoch: maximum number of iterations.
    tol: stop early once the MSE changes by at most this much between two
      consecutive iterations (Task 04.a).

  Returns:
    params: (features, 1) array with the final parameters.
    logs: list holding a snapshot of the parameters after every iteration.
    mse: list holding the MSE measured at the start of every iteration,
      i.e. before that iteration's update is applied.
  """
  X = np.asarray(X, dtype=float)
  samples, features = X.shape
  Y = np.asarray(Y, dtype=float).reshape(samples, 1)
  params = np.zeros((features, 1))
  logs, mse = [], []

  for i in range(epoch):
    hypothesis = np.dot(X, params)
    loss = hypothesis - Y
    params -= (lr / samples) * np.dot(X.transpose(), loss)

    # `params` is updated in place, so a copy is stored; appending the array itself
    # would make every log entry point at the final parameters.
    logs.append(params.copy())
    # Mean of the squared residuals already held in `loss`
    mse.append(float(np.mean(loss ** 2)))

    # Task 04.a: stop when the error has stopped changing meaningfully
    if i != 0 and abs(mse[i] - mse[i - 1]) <= tol:
      break
  return params, logs, mse

In [None]:
# Train on the training split with the default step size, allowing at most 200 iterations
params, logs, mse = gradient_descent(X_train, Y_train, epoch = 200)

### **Task 04.b**

In [None]:
def plot_mse(mse_values, labels):
  """Draw one MSE-vs-iteration curve per (series, label) pair and show the figure.

  Args:
    mse_values: iterable of MSE sequences, one per curve.
    labels: iterable of legend names, paired with `mse_values` by position.
  """
  figure = go.Figure()
  for series, series_name in zip(mse_values, labels):
    # The x axis is simply the iteration index of each recorded error
    iterations = list(range(len(series)))
    trace = go.Scatter(
        x=iterations,
        y=series,
        name=series_name,
        mode='lines+markers',
        marker_color='rgba(0, 152, 0, .8)',
    )
    figure.add_trace(trace)

  figure.update_layout(title='MSE vs Iterations', title_x=0.5, xaxis_title="Iterations", yaxis_title="MSE")

  # Same black, mirrored axis line on both axes
  axis_frame = dict(showline=True, linewidth=2, linecolor='black', mirror=True)
  figure.update_xaxes(**axis_frame)
  figure.update_yaxes(**axis_frame)

  figure.show()

In [None]:
# Plot the `mse` history from the gradient-descent run as a single curve labelled 'MSE'
plot_mse([mse], ['MSE'])

### **Task 04.c**

In [None]:
# Compare training and testing error for several step sizes (20 iterations each).
# The step sizes are listed explicitly rather than produced by repeated float
# multiplication, and the results use their own names so the `params`, `logs` and `mse`
# of the earlier Task 04 run are not overwritten.
for step_size in (0.001, 0.01, 0.1):
  step_params, step_logs, step_mse = gradient_descent(X_train, Y_train, lr=step_size, epoch = 20)
  print("Step Size: " + str(step_size))
  print("Training Error: " + str(mean_squared_error(Y_train, np.dot(X_train, step_params))))
  print("Testing Error: " + str(mean_squared_error(Y_test, np.dot(X_test, step_params))))

### **Task 05.a**

In [None]:
from sklearn.linear_model import LinearRegression

# Reload the CSV so the columns dropped during Task 01 are available again
data = pd.read_csv('/content/data.csv')

# Features are every column except the target and the listed excluded columns
excluded_columns = ['date', 'price', 'street', 'city', 'statezip', 'country']
X = data.drop(columns=excluded_columns)
Y = data['price']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state = 101)

# Fit an ordinary least-squares model on the training split
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

In [None]:
# Coefficients
# Learned weights of the fitted LinearRegression model
print(regressor.coef_)

In [None]:
# Mean Squared Error
# Evaluated on the held-out test split (test_size=0.2 in the train_test_split call)
Y_pred = regressor.predict(X_test)
mean_squared_error(Y_test, Y_pred)

### **Task 05.b**

In [None]:
# Creating training set
# `assign` returns a new DataFrame holding the training features plus the target, so
# `X_train` itself never gains a 'price' column (plain assignment through an alias would
# write the target into the feature matrix).
training_set = X_train.assign(price=Y_train)

In [None]:
# Pearson Correlation Matrix
# Pairwise correlations between the training-set columns, including 'price'
# (`DataFrame.corr` uses the Pearson method by default)
training_set.corr()

In [None]:
# Hand-picking features
# The same columns are removed from both splits; the training frame also sheds the 'price' target
discarded_features = ["bathrooms", "sqft_living", "sqft_lot", "waterfront", "sqft_above", "sqft_basement", "yr_built"]
X_train = training_set.drop(columns=discarded_features + ["price"])
X_test = X_test.drop(columns=discarded_features)

In [None]:
# Re-running Linear Regression
# Refit the same estimator on the reduced feature set and report the test-split MSE
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)
mean_squared_error(Y_test, Y_pred)