# Supervised Learning | Regression (Model Selection & Evaluation)

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

==========

## Model Selection & Evaluation

### Model Selection

In [None]:
# Render the scikit-learn "choosing the right estimator" map from the local imgs/ folder.
from IPython.display import Image
Image("imgs/ml_map.png")

Choosing the Right Estimator: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### Importing Dataset

In [None]:
# Load the real-estate dataset from a path relative to the notebook,
# then display the resulting DataFrame as the cell output.
real_estate = pd.read_csv('datasets/real_estate.csv')
real_estate

In [None]:
# Quick structural check: column names, dtypes and non-null counts.
real_estate.info()

In [None]:
# Summary statistics of the numeric columns, rounded to 2 decimals for readability.
real_estate.describe().round(2)

### Splitting Data

In [None]:
# Feature (area) and target (price) as pandas Series taken from the loaded frame.
X, y = real_estate['area'], real_estate['price']

### Exploring Data

In [None]:
# Raw relationship between area and price, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(X, y)
ax.set_xlabel('Area', fontsize=20)
ax.set_ylabel('Price', fontsize=20)
plt.show()

In [None]:
# Scatter plus seaborn's fitted regression line; columns are looked up by name in the frame.
sns.regplot(data=real_estate, x='area', y='price')

### Model Building & Training

### 1] Using Statsmodels (Statistical Approach)

Statsmodels API Modules: https://www.statsmodels.org/dev/api.html#statsmodels-api

Statsmodels OLS: https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html

In [None]:
import statsmodels.api as sm

In [None]:
# statsmodels' OLS does not include an intercept by default, so add an explicit
# constant column to the feature before fitting.
X_stat = sm.add_constant(X)

In [None]:
# Fit ordinary least squares on the full dataset; note statsmodels takes (y, X), target first.
reg_stat = sm.OLS(y,X_stat).fit()

In [None]:
# Estimated coefficients: one for the added constant (intercept) and one for area (slope).
reg_stat.params

In [None]:
# price = 223.178743 * area + 101912.601801

### 2] Using Scikit-Learn (Machine Learning Approach)

Scikit-Learn Linear Models Modules: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model

Scikit-Learn LinearRegression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Reshape the single feature into a 2-D column of shape (n_samples, 1),
# the feature-matrix layout passed to LinearRegression.fit below.
X_ml = X.values.reshape(-1,1)

In [None]:
# Fit scikit-learn's linear regression on the full dataset (no train/test split yet).
reg_ml = LinearRegression().fit(X_ml,y)

In [None]:
# Slope(s) learned by the model: one entry per feature, here only area.
reg_ml.coef_

In [None]:
# Intercept learned by the model (LinearRegression fits one by default).
reg_ml.intercept_

In [None]:
# price = 223.178743 * area + 101912.601801

### Model Prediction & Evaluation

Linear Regression Calculator: https://www.statskingdom.com/linear-regression-calculator.html

### 1] Using Statsmodels (Statistical Approach)

##### Evaluating Results

In [None]:
# Full OLS report for the fitted statsmodels model.
reg_stat.summary()

In [None]:
# Observed data with the fitted OLS line drawn on top.
plt.scatter(X, y)
# Take predictions from the fitted model rather than hand-copied coefficients,
# so the line stays correct if the data or the fit changes.
y_hat = reg_stat.predict(X_stat)
# Plot against the raw feature X, not X_stat: X_stat is the two-column design
# matrix (const + area), which would draw an extra, meaningless series at x=1.
plt.plot(X, y_hat, c='r')
plt.xlabel('Area', fontsize=20)
plt.ylabel('Price', fontsize=20)
plt.show()

### 2] Using Scikit-Learn (Machine Learning Approach)

##### Predicting Values

In [None]:
# Predict the price for a single property with area 750; the input is 2-D: one row, one feature.
reg_ml.predict([[750]])

##### Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

##### Model Re-Building

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fresh, unfitted estimator for the train/test experiment (separate from reg_ml above).
reg = LinearRegression()

In [None]:
# Fit on the training split only; the Series is reshaped to a 2-D (n_samples, 1) array.
reg.fit(X_train.values.reshape(-1,1), y_train)

In [None]:
# Predictions for the held-out test split, used by the metric cells below.
y_pred = reg.predict(X_test.values.reshape(-1,1))

##### Evaluating Results

Scikit-learn Regression Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

Regression Metrics Demo: https://www.geogebra.org/m/yybenxjm

Here are three common evaluation metrics for regression problems:

**Mean Absolute Error** (MAE):
$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

**Mean Squared Error** (MSE):
$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

**Root Mean Squared Error** (RMSE):
$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

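All of these are **loss functions**: lower values mean a better fit, so we want to minimize them. The cells below also report $R^2$ (the coefficient of determination), which is a score rather than a loss — higher is better, and 1.0 is a perfect fit.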

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Mean Absolute Error on the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
print(mae)

In [None]:
# Mean Squared Error on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
print(mse)

In [None]:
# Root Mean Squared Error: square root of the test-split MSE, in the target's own units.
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(rmse)

In [None]:
# Mean actual price in the test split, for putting the error magnitudes above in context.
np.mean(y_test)

In [None]:
# Mean predicted price on the test split, to compare against the mean actual price.
np.mean(y_pred)

In [None]:
# R^2 on the training split, scored with `reg`, the model fitted on X_train only.
# `reg_ml` was fitted on the full dataset, so scoring it here would not describe
# the train/test experiment that MAE/MSE/RMSE above were computed for.
reg.score(X_train.values.reshape(-1,1), y_train)

In [None]:
# R^2 on the held-out test split, scored with `reg` (fitted on the training split only).
# Scoring `reg_ml` here would leak information: it was fitted on the full dataset,
# which includes these test rows.
reg.score(X_test.values.reshape(-1,1), y_test)

In [None]:
# Same test-split R^2 via the metrics API. The signature is r2_score(y_true, y_pred);
# R^2 is not symmetric in its arguments, so the true values must come first.
r2_score(y_test, y_pred)

==========

# THANK YOU!