# Notebook imports


In [2]:
from sklearn.datasets import fetch_openml
# Due to changes in scikit-learn, the bundled Boston house prices example dataset
# is no longer available, but it can be downloaded from OpenML using fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

# Gather Data

[Source: Original research paper](https://deepblue.lib.umich.edu/bitstream/handle/2027.42/22636/0000186.pdf?sequence=1&isAllowed=y)

In [3]:
# Download version 1 of the dataset named 'boston' from OpenML
boston_dataset = fetch_openml(name='boston', version=1)

In [None]:
# Check what kind of object fetch_openml returned
type(boston_dataset)

In [None]:
# Display the whole returned object (output can be long)
boston_dataset


In [None]:
# List the attributes available on the returned object
dir(boston_dataset)

In [None]:
# Print the dataset's DESCR attribute as plain text
print(boston_dataset.DESCR)

### Data points and features

In [None]:
# Check the container type that holds the feature values
type(boston_dataset.data)

In [None]:
# Shape of the feature data
boston_dataset.data.shape #chaining dot notation

In [None]:
# Names of the feature columns
boston_dataset.feature_names

In [None]:
# Target values: actual prices, in thousands of US$
boston_dataset.target

### Data exploration with Pandas Dataframe

In [12]:
# Build the working DataFrame: one column per feature, plus the target as PRICE
data = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
).assign(PRICE=boston_dataset.target)

In [None]:
# Preview the first rows of the DataFrame
data.head()

In [None]:
# Preview the last rows of the DataFrame
data.tail()

In [None]:
data.count() # number of rows for each column

## Cleaning Data - checking for missing values

In [None]:
# Per-column flag: does the column contain at least one missing value?
data.isnull().any()

In [None]:
# Summary of the columns and their dtypes
data.info()

### Due to the origin of this dataset, 2 columns (CHAS and RAD) come as category instead of float and require some transformation

In [None]:
# Cast CHAS and RAD to float64 in a single call, then re-check the dtypes
data = data.astype({'CHAS': 'float64', 'RAD': 'float64'})
data.info()

## Visualising Data - Histograms, Distributions and Bar Charts

In [None]:
# Histogram of PRICE with 50 bins
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(data['PRICE'], bins=50, edgecolor='black', color='#2196F3')
ax.set_xlabel('Price in 000s')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Seaborn histogram of PRICE with a KDE curve overlaid
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10,6))
sns.histplot(data=data['PRICE'], bins=50, color='#2196F3', kde=True, ax=ax)
plt.show()

In [None]:
# Histogram of RM using matplotlib's default binning
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(data['RM'], edgecolor='black', color='#00796b')
ax.set_xlabel('Average Number of Rooms')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Mean of the RM column
data['RM'].mean()

In [None]:
# Histogram of RAD using matplotlib's default binning
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(data['RAD'], edgecolor='black', color='#9C27B0')
ax.set_xlabel('Accessbility to Highway')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Number of rows for each distinct RAD value
data['RAD'].value_counts()

In [None]:
# RAD histogram again, now with 24 bins and bars narrowed to 80% of the bin width
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(data['RAD'], bins=24, edgecolor='black', color='#9C27B0', rwidth=0.8)
ax.set_xlabel('Accessbility to Highways')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Bar chart with one bar per distinct RAD value, height = number of rows
frequency = data['RAD'].value_counts()

fig, ax = plt.subplots(figsize=(10,6))
ax.bar(frequency.index, height=frequency)
ax.set_xlabel('Accessbility to Highways')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Number of rows for each distinct CHAS value
data['CHAS'].value_counts()

### Descriptive Statistics

In [None]:
# Lowest value in the PRICE column
data['PRICE'].min()

In [None]:
# Highest value in the PRICE column
data['PRICE'].max()

In [None]:
# Minimum of every column
data.min()

In [None]:
# Maximum of every column
data.max()

In [None]:
# Mean of every column
data.mean()

In [None]:
# Median of every column
data.median()

In [None]:
# Summary statistics for every column in one table
data.describe()

## Correlation
## $$ \rho _{XY} = corr(X,Y)$$
## $$ -1.0 \leq \rho _{XY} \leq +1.0 $$

In [None]:
# Correlation between price (PRICE) and number of rooms (RM) in our dataset
data['PRICE'].corr(data['RM'])

In [None]:
# Challenge: calculate the correlation between property prices and the pupil-teacher ratio
data['PRICE'].corr(data['PTRATIO'])

In [None]:
# Pairwise correlation of every column against every other column
data.corr() # Pearson Correlation Coefficients

In [38]:
# Mask for the heatmap: start from zeros shaped like the correlation matrix,
# then flag the upper triangle (diagonal included)
mask = np.zeros_like(data.corr())
triangle_indices = np.triu_indices_from(mask)
upper_rows, upper_cols = triangle_indices
mask[upper_rows, upper_cols] = True

In [None]:
# Annotated heatmap of the correlation matrix; cells flagged in `mask` are hidden.
# Set the seaborn style first so it applies to the axes created for this plot.
sns.set_style('white')
# plt.figure (lowercase) registers the figure with pyplot, so the heatmap is
# drawn on it at the requested size; plt.Figure only built a detached object.
plt.figure(figsize=(16,10))
sns.heatmap(data.corr(), mask=mask, annot=True, annot_kws={"size":8}, cmap='bwr')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
# Call show() -- the bare attribute `plt.show` did nothing.
plt.show()

# Challenge: Picture the relationship between pollution and distance in your head
# Then create a scatter plot between DIS and NOX

In [None]:
# Scatter plot of DIS against NOX, with their correlation shown in the title
nox_dis_corr = data['NOX'].corr(data['DIS'])

fig, ax = plt.subplots(figsize=(9,6))
ax.scatter(x=data['DIS'], y=data['NOX'], alpha=0.6, s=80, color='indigo')
ax.set_title(f'DIS vs NOX (Correlation {nox_dis_corr:.3f})', fontsize=14)
ax.set_xlabel('DIS - Distance from employment', fontsize=14)
ax.set_ylabel('NOX - Nitric Oxide Pollution', fontsize=14)
plt.show()

In [None]:
# DIS vs NOX again, this time as a seaborn joint plot
nox_dis_corr = round(data['NOX'].corr(data['DIS']),3)
corr_label = f'Pearsonr = {nox_dis_corr}'

sns.set_style('whitegrid')
sns.jointplot(x=data['DIS'], y=data['NOX'], color='indigo', joint_kws={'alpha':0.5})
# Newer seaborn versions do not print the correlation on the graph, so add it as text
plt.text(x=9, y=0.8, s=corr_label, color='black')

plt.show()

In [None]:
# DIS vs NOX as a seaborn joint plot with kind='hex'
nox_dis_corr = round(data['NOX'].corr(data['DIS']),3)
corr_label = f'Pearsonr = {nox_dis_corr}'

sns.set_style('whitegrid')
sns.jointplot(x=data['DIS'], y=data['NOX'], color='blue', kind='hex')
# Newer seaborn versions do not print the correlation on the graph, so add it as text
plt.text(x=9, y=0.8, s=corr_label, color='black')

plt.show()

In [None]:
# TAX vs RAD as a seaborn joint plot
tax_rad_corr = round(data['TAX'].corr(data['RAD']),3)
corr_label = f'Pearsonr = {tax_rad_corr}'

sns.set_style('whitegrid')
sns.jointplot(x=data['TAX'], y=data['RAD'], color='darkred')
# Newer seaborn versions do not print the correlation on the graph, so add it as text
plt.text(x=200, y=20, s=corr_label, color='black')

plt.show()

In [None]:
# TAX vs RAD scatter with a fitted regression line
sns.lmplot(data=data, x='TAX', y='RAD', height=7)
plt.show()

## Challenge: create a scatter plot between the house prices and the number of rooms (RM)

In [None]:
# Scatter plot of RM against PRICE, with their correlation shown in the title
rm_price_corr = data['RM'].corr(data['PRICE'])

fig, ax = plt.subplots(figsize=(9,6))
ax.scatter(x=data['RM'], y=data['PRICE'], alpha=0.6, s=80, color='skyblue')
ax.set_title(f'RM vs Price (Correlation {rm_price_corr:.3f})', fontsize=14)
ax.set_xlabel('RM - Number of Rooms', fontsize=14)
ax.set_ylabel('Price in 000s', fontsize=14)
plt.show()

In [None]:
# RM vs PRICE scatter with a fitted regression line
sns.lmplot(data=data, x='RM', y='PRICE', height=7)
plt.show()

In [None]:
%%time
sns.pairplot(data)

In [None]:
%%time
sns.pairplot(data=data, kind='reg', plot_kws={'line_kws':{'color':'cyan'}})
plt.show()

## Training & Test Dataset Split

In [None]:
# Separate the target (PRICE) from the predictor columns
features = data.drop(columns='PRICE')
prices = data['PRICE']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
split = train_test_split(features, prices, test_size=0.2, random_state=10)
X_train, X_test, y_train, y_test = split

# Fraction of rows that ended up in the training set
len(X_train)/len(features)

In [None]:
# Fraction of rows that ended up in the test set
len(X_test)/len(features)

In [None]:
# Fit a linear regression on the training split
regr = LinearRegression()
regr.fit(X=X_train, y=y_train)

In [None]:
# Fitted parameters (thetas): the intercept, then one coefficient per feature
print('Intercept', regr.intercept_)
pd.DataFrame({'Coef': regr.coef_}, index=X_train.columns)