# Notebook imports


In [28]:
from sklearn.datasets import fetch_openml
# Newer scikit-learn versions no longer bundle the Boston house-prices example dataset,
# but it can still be downloaded from the OpenML site with the fetch_openml function
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Gather Data

[Source: Original research paper](https://deepblue.lib.umich.edu/bitstream/handle/2027.42/22636/0000186.pdf?sequence=1&isAllowed=y)

In [29]:
# Fetch version 1 of the dataset named 'boston' from OpenML and keep the
# returned object for the inspection cells that follow.
boston_dataset = fetch_openml(name='boston', version=1)

In [None]:
# Check what kind of object fetch_openml returned.
type(boston_dataset)

In [None]:
# Echo the whole returned object.
# NOTE(review): this likely produces very long output because the full
# contents are displayed; the dir() cell below gives a shorter overview.
boston_dataset


In [None]:
# List the attribute names available on the dataset object.
dir(boston_dataset)

In [None]:
# Print the dataset's description attribute; print() is used so the text is
# rendered as-is rather than shown as a quoted repr.
print(boston_dataset.DESCR)

### Data points and features

In [None]:
# Inspect the type of the container that holds the feature values.
type(boston_dataset.data)

In [None]:
boston_dataset.data.shape  # chained attribute access: dataset object -> .data -> .shape

In [None]:
# Names of the features; reused below as the DataFrame column labels.
boston_dataset.feature_names

In [None]:
# Target values: actual prices in thousands of US$.
# They are attached to the DataFrame as the 'PRICE' column further down.
boston_dataset.target

### Data exploration with Pandas Dataframe

In [31]:
# Build the working pandas DataFrame in one expression: the feature values
# labelled with their feature names, plus the target appended as 'PRICE'.
data = pd.DataFrame(
    boston_dataset.data,
    columns=boston_dataset.feature_names,
).assign(PRICE=boston_dataset.target)

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Preview the last rows of the frame.
data.tail()

In [None]:
data.count()  # number of non-null values in each column

## Cleaning Data - checking for missing values

In [None]:
data.isnull().any()  # per column: True if at least one value is missing

In [None]:
# Column-by-column overview: non-null counts and dtypes.
data.info()

### Because of how this dataset is delivered, two columns (CHAS and RAD) are loaded as `category` instead of `float` and need to be converted

In [None]:
# Convert the two non-float columns to float64 in place, then print the
# frame summary again to confirm the new dtypes.
for column_name in ('CHAS', 'RAD'):
    data[column_name] = data[column_name].astype('float64')
data.info()

## Visualising Data - Histograms, Distributions and Bar Charts

In [None]:
# Histogram of house prices, drawn on an explicit Axes object.
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['PRICE'], bins=50, edgecolor='black', color='#2196F3')
ax.set_xlabel('Price in 000s')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Same price distribution with seaborn, including a kernel density estimate
# curve (kde=True), drawn on an explicit Axes object.
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data['PRICE'], bins=50, kde=True, color='#2196F3', ax=ax)
plt.show()

In [None]:
# Histogram of the RM column using matplotlib's default binning.
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['RM'], edgecolor='black', color='#00796b')
ax.set_xlabel('Average Number of Rooms')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Arithmetic mean of the RM column.
data['RM'].mean()

In [None]:
# Histogram of the RAD column using matplotlib's default binning.
# The x-axis label spelling is corrected ("Accessibility") and uses the plural
# "Highways" so it matches the other RAD charts in this notebook.
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['RAD'], edgecolor='black', color='#9C27B0')
ax.set_xlabel('Accessibility to Highways')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Number of rows for each distinct RAD value.
data['RAD'].value_counts()

In [None]:
# Histogram of the RAD column with 24 bins; rwidth=0.8 draws each bar at 80%
# of its bin width so neighbouring bars are visually separated.
# The x-axis label spelling is corrected ("Accessibility").
# Colour palette reference: https://www.materialpalette.com/
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['RAD'], bins=24, edgecolor='black', color='#9C27B0', rwidth=0.8)
ax.set_xlabel('Accessibility to Highways')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Count how many rows carry each RAD value. `frequency` is a pandas Series:
# its index holds the distinct RAD values and its values hold the counts,
# which is exactly what the bar chart needs for x positions and bar heights.
frequency = data['RAD'].value_counts()

# Bar chart of those counts (one bar per distinct RAD value).
# The x-axis label spelling is corrected ("Accessibility").
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(frequency.index, height=frequency)
ax.set_xlabel('Accessibility to Highways')
ax.set_ylabel('Nr. of Houses')
plt.show()

In [None]:
# Number of rows for each distinct CHAS value.
data['CHAS'].value_counts()

### Descriptive Statistics

In [None]:
# Lowest value in the PRICE column.
data['PRICE'].min()

In [None]:
# Highest value in the PRICE column.
data['PRICE'].max()

In [None]:
# Minimum of every column.
data.min()

In [None]:
# Maximum of every column.
data.max()

In [None]:
# Arithmetic mean of every column.
data.mean()

In [None]:
# Median of every column.
data.median()

In [None]:
# Summary-statistics table for the frame's numeric columns.
data.describe()