# Assignment 1

- Name: **Arnab Sen**
- Roll: **510519006**
- Date: **Aug 12, 2022**

## (i) Download data

Data downloaded and stored at [drive](https://drive.google.com/drive/folders/1-2d4kuiufZHmaXDhBi2sAYeoYPgBnQqL).

In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import warnings
# Ignore every FutureWarning for the rest of the session; note that this also hides
# deprecation notices emitted by the plotting/ML libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Folder on the mounted Drive that contains the dataset; train.csv is read from here below.
BASE_PATH = '/content/drive/MyDrive/Colab_Notebooks/ML_DRIVE/Assign_1/dataset'

In [None]:
import numpy as np
import pandas as pd # read csv files
from sklearn.model_selection import train_test_split # test and train data split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns # graph plot
import matplotlib.pyplot as plt # scatter plot
from sklearn.preprocessing import PolynomialFeatures # polynomial regression

## (ii) Reading and formatting dataset
Read the dataset into a Pandas data frame. Remove the rows with a missing value. Split `train.csv` into two sets in an 80:20 ratio, used as the train and test sets respectively.

In [None]:
# One shared scaler instance; every model section below fits it again on its own features.
scaler = StandardScaler()
# Load the full training CSV from the Drive folder configured in BASE_PATH.
dataset = pd.read_csv(BASE_PATH + "/train.csv")

In [None]:
# Preview the first rows of the raw dataset to sanity-check the load.
dataset.head()

## (iii) Linear Regression

Use the linear regression method to estimate the slope and intercept for predicting `SalePrice` based on `LotArea`

Drop the rows that have a missing value in either the `SalePrice` or the `LotArea` column (other columns are ignored here).

In [None]:
# Keep just the target and the single predictor, discarding rows where either is missing.
df = dataset[['SalePrice', 'LotArea']].dropna()

Splitting the dataset in 80:20 ratio for fitting the model and then testing.

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and therefore
# every metric reported below, reproducible across Restart & Run All.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Distribution of the target in the training split; the trailing ';' hides the returned object's repr.
sns.displot(train_df['SalePrice']);

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the test split.
test_df.head()

Scatter plot to see the relationship between `SalePrice` and `LotArea`.

In [None]:
# ci=None turns off the confidence band; order=2 makes lmplot overlay a second-order
# polynomial fit (not a straight line) on the LotArea vs. SalePrice scatter.
sns.lmplot(x ="LotArea", y ="SalePrice", data = train_df, ci=None, order=2)

In [None]:
# Single predictor reshaped to a column: as many rows as needed, exactly one column.
X_train = np.array(train_df['LotArea']).reshape(-1, 1)
# Learn the scaling statistics (mean / standard deviation) from the training data only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df['SalePrice']).reshape(-1, 1)

X_test = np.array(test_df['LotArea']).reshape(-1, 1)
# Apply the *training* statistics to the test data. Re-fitting here would put the two
# splits on different scales and leak test-set information into preprocessing.
X_test = scaler.transform(X_test)
y_test = np.array(test_df['SalePrice']).reshape(-1, 1)

In [None]:
# Ordinary least-squares fit of SalePrice on the scaled LotArea column. fit() returns
# the estimator itself, so `model` and `new_model` refer to the same fitted object.
model = LinearRegression()
model.fit(X_train, y_train)
new_model = model

# Predictions for the held-out split, used by the plot and metrics that follow.
y_pred = new_model.predict(X_test)

In [None]:
# Test points (scaled LotArea on the x axis) with the fitted regression line in red.
plt.scatter(X_test, y_test)
plt.plot(X_test, y_pred, color='r')
plt.ylabel("SalePrice")
plt.xlabel("LotArea")

plt.show()

- **Coefficient of Determination:** With simple linear regression (a single predictor), the coefficient of determination is equal to the square of the correlation between the x and y variables.

In [None]:
# Compute the test-split metrics first, then print the whole report in one go.
mse_linear_regression = mean_squared_error(y_test, y_pred)
r_squared = new_model.score(X_test, y_test)

print("Linear Regression\n====================")
print("coefficient of determination (r-squared):", r_squared)
print("intercept:", new_model.intercept_)
print("slope:", new_model.coef_)
print("mean squared error:", mse_linear_regression)

## (iv) Multiple Regression
### Model 1: LotFrontage, LotArea

In [None]:
# Collects the fitted LotArea coefficient of each multiple-regression model; the
# weight-printing cells below append to it, and re-running this cell resets it.
lot_area_weights = [] # required for question (vii)

In [None]:
# Model 1 inputs: the target plus LotArea and LotFrontage, with incomplete rows removed.
df = dataset[['SalePrice', 'LotArea', 'LotFrontage']].dropna()

In [None]:
# 80:20 train/test split; the fixed seed keeps Model 1's metrics reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Preview the first rows of Model 1's training split.
train_df.head()

In [None]:
# Preview the first rows of Model 1's test split.
test_df.head()

In [None]:
# Distribution of SalePrice in Model 1's training split (';' suppresses the repr output).
sns.displot(train_df['SalePrice']);

In [None]:
# Predictor columns for Model 1, in the order their coefficients are reported later.
variables = ['LotArea', 'LotFrontage']
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])

X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Fit Model 1; fit() returns the estimator, so both names point at the same fitted object.
model = LinearRegression()
model.fit(X_train, y_train)
new_model = model

In [None]:
# Training-split metrics for Model 1.
r2_m1_train = new_model.score(X_train, y_train)
mse_m1_train = mean_squared_error(y_train, new_model.predict(X_train))
print("[Model 1 training] mean squared error:", mse_m1_train)
print("[Model 1 training] r2 score:", r2_m1_train)

# Test-split metrics; y_pred is left holding the test predictions.
y_pred = new_model.predict(X_test)
mse_m1_test = mean_squared_error(y_test, y_pred)
r2_m1_test = new_model.score(X_test, y_test)
print("[Model 1 testing] mean squared error:", mse_m1_test)
print("[Model 1 testing] r2 score:", r2_m1_test)

In [None]:
# Print one "feature  coefficient" row per predictor and remember the LotArea
# coefficient for the cross-model comparison plot.
print("Weights/Coefficients:\n")
for feature, coef in zip(variables, new_model.coef_.ravel()):
  print(f"{feature.ljust(25)} {coef}")
  if feature == 'LotArea':
    lot_area_weights.append(coef)

### Model 2: LotFrontage, LotArea, OverallQual, OverallCond

In [None]:
# Model 2 inputs: target plus four predictors, keeping only rows complete in all of them.
df = dataset[['SalePrice', 'LotArea', 'LotFrontage', 'OverallQual', 'OverallCond']].dropna()

In [None]:
# 80:20 train/test split; the fixed seed keeps Model 2's metrics reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Preview the first rows of Model 2's training split.
train_df.head()

In [None]:
# Preview the first rows of Model 2's test split.
test_df.head()

In [None]:
# Distribution of SalePrice in Model 2's training split (';' suppresses the repr output).
sns.displot(train_df['SalePrice']);

In [None]:
# Predictor columns for Model 2, in the order their coefficients are reported later.
variables = ['LotArea', 'LotFrontage', 'OverallQual', 'OverallCond']
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])

X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Fit Model 2; fit() returns the estimator, so both names point at the same fitted object.
model = LinearRegression()
model.fit(X_train, y_train)
new_model = model

In [None]:
# Training-split metrics for Model 2.
r2_m2_train = new_model.score(X_train, y_train)
mse_m2_train = mean_squared_error(y_train, new_model.predict(X_train))
print("[Model 2 training] mean squared error:", mse_m2_train)
print("[Model 2 training] r2 score:", r2_m2_train)

# Test-split metrics; y_pred is left holding the test predictions.
y_pred = new_model.predict(X_test)
mse_m2_test = mean_squared_error(y_test, y_pred)
r2_m2_test = new_model.score(X_test, y_test)
print("[Model 2 testing] mean squared error:", mse_m2_test)
print("[Model 2 testing] r2 score:", r2_m2_test)

In [None]:
# Print one "feature  coefficient" row per predictor and remember the LotArea
# coefficient for the cross-model comparison plot.
print("Weights/Coefficients:\n")
for feature, coef in zip(variables, new_model.coef_.ravel()):
  print(f"{feature.ljust(25)} {coef}")
  if feature == 'LotArea':
    lot_area_weights.append(coef)

### Model 3: LotFrontage, LotArea, OverallQual, OverallCond, 1stFlrSF, GrLivArea

In [None]:
# Model 3 inputs: target plus six predictors, keeping only rows complete in all of them.
df = dataset[['SalePrice', 'LotArea', 'LotFrontage', 'OverallQual', 'OverallCond', '1stFlrSF', 'GrLivArea']].dropna()

In [None]:
# 80:20 train/test split; the fixed seed keeps Model 3's metrics reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Preview the first rows of Model 3's training split.
train_df.head()

In [None]:
# Preview the first rows of Model 3's test split.
test_df.head()

In [None]:
# Distribution of SalePrice in Model 3's training split (';' suppresses the repr output).
sns.displot(train_df['SalePrice']);

In [None]:
# Predictor columns for Model 3, in the order their coefficients are reported later.
variables = ['LotArea', 'LotFrontage', 'OverallQual', 'OverallCond', '1stFlrSF', 'GrLivArea']
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])

X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Fit Model 3; fit() returns the estimator, so both names point at the same fitted object.
model = LinearRegression()
model.fit(X_train, y_train)
new_model = model

In [None]:
# Training-split metrics for Model 3.
y_pred = new_model.predict(X_train)
r2_m3_train = new_model.score(X_train, y_train)
mse_m3_train = mean_squared_error(y_train, y_pred)
print("[Model 3 training] mean squared error:", mse_m3_train)
print("[Model 3 training] r2 score:", r2_m3_train)

# Test-split metrics for Model 3.
y_pred = new_model.predict(X_test)
r2_m3_test = new_model.score(X_test, y_test)
mse_m3_test = mean_squared_error(y_test, y_pred)
# Report the *test* MSE here (the training value was printed under this label before).
print("[Model 3 testing] mean squared error:", mse_m3_test)
print("[Model 3 testing] r2 score:", r2_m3_test)

In [None]:
# Print one "feature  coefficient" row per predictor and remember the LotArea
# coefficient for the cross-model comparison plot.
print("Weights/Coefficients:\n")
for feature, coef in zip(variables, new_model.coef_.ravel()):
  print(f"{feature.ljust(25)} {coef}")
  if feature == 'LotArea':
    lot_area_weights.append(coef)

## (v) Compare the Mean squared Error, R2 score

#### Comparing R2 score

#### Comparing Mean Squared Error

In [None]:
# Grouped bar chart: train vs. test MSE for Models 1-3. `x` and `x_axis` are also
# used by the R2 chart in the next cell.
x = ["Model-1", "Model-2", "Model-3"]
x_axis = np.arange(len(x))

plt.figure(figsize=(10, 8))
mse_by_split = {
    "train": [mse_m1_train, mse_m2_train, mse_m3_train],
    "test": [mse_m1_test, mse_m2_test, mse_m3_test],
}
# Shift each split's bars by +/-0.2 so the pairs sit side by side around the tick.
for shift, (split_name, errors) in zip((-0.2, 0.2), mse_by_split.items()):
    plt.bar(x_axis + shift, errors, width=0.4, label=split_name)

plt.title("Mean Squared Error comparison b/w models")
plt.xlabel("Models of multiple regression")
plt.ylabel("Mean Squared Error")
plt.xticks(x_axis, x)
plt.legend()
plt.show()

In [None]:
# Grouped bar chart: train vs. test R2 for Models 1-3; reuses `x` and `x_axis`
# defined in the MSE comparison cell.
plt.figure(figsize=(10, 8))
r2_by_split = {
    "train": [r2_m1_train, r2_m2_train, r2_m3_train],
    "test": [r2_m1_test, r2_m2_test, r2_m3_test],
}
for shift, (split_name, scores) in zip((-0.2, 0.2), r2_by_split.items()):
    plt.bar(x_axis + shift, scores, width=0.4, label=split_name)

plt.title("R2 Score comparison b/w models")
plt.xlabel("Models of multiple regression")
plt.ylabel("R2 Score")
plt.xticks(x_axis, x)
plt.legend()
plt.show()

## (vi) Multiple Regression (contd.)
### Model 4: LotArea, Street

In [None]:
# Keep only the columns Model 4 uses and drop rows with missing values in them.
df = dataset.loc[:, ['SalePrice', 'LotArea', 'Street']].dropna()
# One-hot encode the categorical Street column (one indicator column per category).
df = pd.get_dummies(df, columns = ['Street'])
# 80:20 split with a fixed seed so the reported results are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
sns.displot(train_df['SalePrice']);

In [None]:
# Preview the first rows of Model 4's test split (including the Street indicator columns).
test_df.head()

In [None]:
# Preview the first rows of Model 4's training split (including the Street indicator columns).
train_df.head()

In [None]:
# Every column except the first one (SalePrice, the target) is a predictor.
variables = train_df.columns.array[1:]
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])
X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Fit Model 4; fit() returns the estimator, so both names point at the same fitted object.
model = LinearRegression()
model.fit(X_train, y_train)
new_model = model

In [None]:
# Evaluate Model 4 on the held-out split and report the metrics (they were previously
# computed but never displayed).
y_pred = new_model.predict(X_test)
r2_score_m4 = new_model.score(X_test, y_test)
mse_m4 = mean_squared_error(y_test, y_pred)
print("[Model 4 testing] mean squared error:", mse_m4)
print("[Model 4 testing] r2 score:", r2_score_m4)

In [None]:
# Print one "feature  coefficient" row per predictor and remember the LotArea
# coefficient for the cross-model comparison plot.
print("Weights/Coefficients:\n")
for feature, coef in zip(variables, new_model.coef_.ravel()):
  print(f"{feature.ljust(25)} {coef}")
  if feature == 'LotArea':
    lot_area_weights.append(coef)

### Model 5: LotArea, OverallCond, Street, Neighborhood

In [None]:
# Keep only the columns Model 5 uses and drop rows with missing values in them.
df = dataset.loc[:, ['SalePrice', 'LotArea', 'OverallCond', 'Street', 'Neighborhood']].dropna()
# One-hot encode the two categorical columns (one indicator column per category).
df = pd.get_dummies(df, columns = ['Street', 'Neighborhood'])
# 80:20 split with a fixed seed so the reported results are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
sns.displot(train_df['SalePrice']);

In [None]:
# Preview the first rows of Model 5's training split (including the indicator columns).
train_df.head()

In [None]:
# Preview the first rows of Model 5's test split (including the indicator columns).
test_df.head()

In [None]:
# Every column except the first one (SalePrice, the target) is a predictor.
variables = train_df.columns.array[1:]
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])

X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Fit Model 5; fit() returns the estimator, so both names point at the same fitted object.
model = LinearRegression()
model.fit(X_train, y_train)
new_model = model

In [None]:
# Evaluate Model 5 on the held-out split and report the metrics (they were previously
# computed but never displayed).
y_pred = new_model.predict(X_test)
r2_score_m5 = r2_score(y_test, y_pred)
mse_m5 = mean_squared_error(y_test, y_pred)
print("[Model 5 testing] mean squared error:", mse_m5)
print("[Model 5 testing] r2 score:", r2_score_m5)

In [None]:
# Print one "feature  coefficient" row per predictor and remember the LotArea
# coefficient for the cross-model comparison plot.
print("Weights/Coefficients:\n")
for feature, coef in zip(variables, new_model.coef_.ravel()):
  print(f"{feature.ljust(25)} {coef}")
  if feature == 'LotArea':
    lot_area_weights.append(coef)

### Model 6: LotArea, OverallCond, Street, 1stFlrSF, Neighborhood, YearBuilt

In [None]:
# Keep only the columns Model 6 uses and drop rows with missing values in them.
df = dataset.loc[:, ['SalePrice', 'LotArea', 'OverallCond', 'Street', '1stFlrSF', 'Neighborhood', 'YearBuilt']].dropna()
# One-hot encode the two categorical columns (one indicator column per category).
df = pd.get_dummies(df, columns = ['Street', 'Neighborhood'])
# 80:20 split with a fixed seed so the reported results are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
sns.displot(train_df['SalePrice']);

In [None]:
# Preview the first rows of Model 6's training split (including the indicator columns).
train_df.head()

In [None]:
# Preview the first rows of Model 6's test split (including the indicator columns).
test_df.head()

In [None]:
# Every column except the first one (SalePrice, the target) is a predictor.
variables = train_df.columns.array[1:]
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])

X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Fit Model 6; fit() returns the estimator, so both names point at the same fitted object.
model = LinearRegression()
model.fit(X_train, y_train)
new_model = model

In [None]:
# Evaluate Model 6 on the held-out split and report the metrics (they were previously
# computed but never displayed).
y_pred = new_model.predict(X_test)
r2_score_m6 = r2_score(y_test, y_pred)
mse_m6 = mean_squared_error(y_test, y_pred)
print("[Model 6 testing] mean squared error:", mse_m6)
print("[Model 6 testing] r2 score:", r2_score_m6)

In [None]:
# Print one "feature  coefficient" row per predictor and remember the LotArea
# coefficient for the cross-model comparison plot.
print("Weights/Coefficients:\n")
for feature, coef in zip(variables, new_model.coef_.ravel()):
  print(f"{feature.ljust(25)} {coef}")
  if feature == 'LotArea':
    lot_area_weights.append(coef)

## (vii) Compare the feature “LotArea” weights/coefficients for all the six trained models and plot a graph using the Matplotlib library.

In [None]:
# Bar chart of the LotArea coefficient collected from each of the six models. The
# labels are fixed at six, so lot_area_weights must hold exactly one entry per model
# (i.e. each weight-printing cell was run once since the list was reset).
x_values = ["Model-" + str(model_no) for model_no in range(1, 7)]
y_values = lot_area_weights

plt.figure(figsize=(10,8))
plt.bar(x_values, y_values)
plt.xlabel('Models')
plt.ylabel('Lot Area Coefficient/Weights')
plt.show()

## (vii) Polynomial regression

### Degree 2

In [None]:
# Target plus the single LotArea predictor, with incomplete rows removed.
df = dataset.loc[:, ['SalePrice', 'LotArea']].dropna()
# 80:20 split with a fixed seed so the degree-2 results are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
sns.displot(train_df['SalePrice']);

In [None]:
# Preview the first rows of the training split used for the degree-2 fit.
train_df.head()

In [None]:
# Preview the first rows of the test split used for the degree-2 fit.
test_df.head()

In [None]:
# Every column except the first one (SalePrice, the target) is a predictor; here that is LotArea.
variables = train_df.columns.array[1:]
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])

X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Expand the scaled LotArea column into degree-2 polynomial features.
poly = PolynomialFeatures(degree = 2)
X_poly = poly.fit_transform(X_train)
# The former `poly.fit(X_poly, y_train)` call was removed: it re-fitted the transformer
# on the already expanded matrix, leaving `poly` configured for the wrong number of
# input columns. Only the regression model is trained on X_poly.
model = LinearRegression()
new_model = model.fit(X_poly, y_train)

In [None]:
# Degree-2 model evaluated on the training split; y_pred is reused by the plot below.
X_train_poly = poly.fit_transform(X_train)
y_pred = new_model.predict(X_train_poly)
mse_poly = mean_squared_error(y_train, y_pred)
r2_score_poly = r2_score(y_train, y_pred)
print("[Poly training] r2 score:", r2_score_poly)
print("[Poly training] mean squared error:", mse_poly)

In [None]:
# Training points with the fitted degree-2 curve in red.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# seaborn >= 0.12 accepts x/y as keyword arguments only; the positional form raises
# TypeError there, so pass them by name.
ax = sns.scatterplot(x=X_train.reshape(-1), y=y_train.reshape(-1))
ax.set_xlabel("LotArea")
ax.set_ylabel("SalePrice")
ax.set_title("(Training Data) Polynomial Regression of Degree-2")
sns.lineplot(x=X_train.reshape(-1), y=y_pred.reshape(-1), color = 'red')

In [None]:
# Coefficients of the fitted degree-2 model, flattened to 1-D (one per column of the
# polynomial feature matrix).
print("(Training Data) Weights/Coefficients:\n", new_model.coef_.reshape(-1))

In [None]:
# Degree-2 model evaluated on the test split; this overwrites y_pred, r2_score_poly
# and mse_poly with the test-split values.
X_test_poly = poly.fit_transform(X_test)
y_pred = new_model.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred)
r2_score_poly = r2_score(y_test, y_pred)
print("[Poly testing] r2 score:", r2_score_poly)
print("[Poly testing] mean squared error:", mse_poly)

In [None]:
# Test points with the degree-2 model's predictions in red.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# seaborn >= 0.12 accepts x/y as keyword arguments only; the positional form raises
# TypeError there, so pass them by name.
ax = sns.scatterplot(x=X_test.reshape(-1), y=y_test.reshape(-1))
ax.set_xlabel("LotArea")
ax.set_ylabel("SalePrice")
ax.set_title("(Testing Data) Polynomial Regression of Degree-2")
sns.lineplot(x=X_test.reshape(-1), y=y_pred.reshape(-1), color = 'red')

In [None]:
# The model is not re-fitted between the training and testing evaluations, so these are
# the same coefficients that were printed for the training data above.
print("(Testing Data) Weights/Coefficients:\n", new_model.coef_.reshape(-1))

### Degree 3

In [None]:
# Target plus the single LotArea predictor, with incomplete rows removed.
df = dataset.loc[:, ['SalePrice', 'LotArea']].dropna()
# 80:20 split with a fixed seed so the degree-3 results are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
sns.displot(train_df['SalePrice']);

In [None]:
# Preview the first rows of the training split used for the degree-3 fit.
train_df.head()

In [None]:
# Preview the first rows of the test split used for the degree-3 fit.
test_df.head()

In [None]:
# Every column except the first one (SalePrice, the target) is a predictor; here that is LotArea.
variables = train_df.columns.array[1:]
X_train = np.array(train_df[variables])
# Fit the scaler on the training features only.
X_train = scaler.fit_transform(X_train)
y_train = np.array(train_df[['SalePrice']])

X_test = np.array(test_df[variables])
# Transform the test features with the training statistics; re-fitting on the test
# split would scale the two sets differently and leak test information.
X_test = scaler.transform(X_test)
y_test = np.array(test_df[['SalePrice']])

In [None]:
# Expand the scaled LotArea column into degree-3 polynomial features.
poly = PolynomialFeatures(degree = 3)
X_poly = poly.fit_transform(X_train)
# The former `poly.fit(X_poly, y_train)` call was removed: it re-fitted the transformer
# on the already expanded matrix, leaving `poly` configured for the wrong number of
# input columns. Only the regression model is trained on X_poly.
model = LinearRegression()
new_model = model.fit(X_poly, y_train)

In [None]:
# Degree-3 model evaluated on the training split; y_pred is reused by the plot below.
X_train_poly = poly.fit_transform(X_train)
y_pred = new_model.predict(X_train_poly)
mse_poly = mean_squared_error(y_train, y_pred)
r2_score_poly = r2_score(y_train, y_pred)
print("[Poly training] r2 score:", r2_score_poly)
print("[Poly training] mean squared error:", mse_poly)

In [None]:
# Training points with the fitted degree-3 curve in red.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# seaborn >= 0.12 accepts x/y as keyword arguments only; the positional form raises
# TypeError there, so pass them by name.
ax = sns.scatterplot(x=X_train.reshape(-1), y=y_train.reshape(-1))
ax.set_xlabel("LotArea")
ax.set_ylabel("SalePrice")
ax.set_title("(Training Data) Polynomial Regression of Degree-3")
sns.lineplot(x=X_train.reshape(-1), y=y_pred.reshape(-1), color = 'red')

In [None]:
# Coefficients of the fitted degree-3 model, flattened to 1-D (one per column of the
# polynomial feature matrix).
print("(Training Data) Weights/Coefficients:\n", new_model.coef_.reshape(-1))

In [None]:
# Degree-3 model evaluated on the test split; this overwrites y_pred, r2_score_poly
# and mse_poly with the test-split values.
X_test_poly = poly.fit_transform(X_test)
y_pred = new_model.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred)
r2_score_poly = r2_score(y_test, y_pred)
print("[Poly testing] r2 score:", r2_score_poly)
print("[Poly testing] mean squared error:", mse_poly)

In [None]:
# Test points with the degree-3 model's predictions in red.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# seaborn >= 0.12 accepts x/y as keyword arguments only; the positional form raises
# TypeError there, so pass them by name.
ax = sns.scatterplot(x=X_test.reshape(-1), y=y_test.reshape(-1))
ax.set_xlabel("LotArea")
ax.set_ylabel("SalePrice")
ax.set_title("(Testing Data) Polynomial Regression of Degree-3")
sns.lineplot(x=X_test.reshape(-1), y=y_pred.reshape(-1), color = 'red')

In [None]:
# The model is not re-fitted between the training and testing evaluations, so these are
# the same coefficients that were printed for the training data above.
print("(Testing Data) Weights/Coefficients:\n", new_model.coef_.reshape(-1))