# DATA VISUALIZATION EXERCISE

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston

# Problem: The Boston House Pricing

## Dataset Description

Scikit-learn toy datasets: the scikit-learn package for Python comes with a few small standard datasets that do not require downloading any file from an external website.
https://scikit-learn.org/stable/datasets/toy_dataset.html

This Boston House Pricing data has been taken from a research paper, which can be found at
[Source: Original research paper](https://deepblue.lib.umich.edu/bitstream/handle/2027.42/22636/0000186.pdf?sequence=1&isAllowed=y)

## Loading Data

In [None]:
#Fetch the Boston housing dataset bundled with sklearn and print its built-in description.
#NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 - confirm
#that the installed scikit-learn version still provides it before running this notebook.
boston_dataset = load_boston()
print(boston_dataset['DESCR'])

## Creating Dataframe

In [None]:
#Build one dataframe out of the two ndarrays held by boston_dataset:
#'data' supplies the feature columns (named by 'feature_names') and
#'target' is appended as the PRICE column.
dataset = pd.DataFrame(
    boston_dataset.data,
    columns=boston_dataset.feature_names,
).assign(PRICE=boston_dataset.target)
#Display the dataframe we have just created.
dataset

## Visualising Data - Histograms and Distributions

In [None]:
#Histograms for different features.
#Recall that a histogram plots the frequency of the values of a feature against those values.
#We use the hist function of matplotlib for this purpose.

#First, the distribution of the target, which is the house price.
#The choice of 50 bins is arbitrary; PRICE is a continuous variable.
#Use dataset['PRICE'].value_counts() to inspect the distinct values in this column.
#The bins argument can also be omitted, in which case matplotlib picks the number of bins itself.
plt.figure(figsize=(10, 6))
plt.hist(x=dataset['PRICE'], bins=50, edgecolor='black', color='#2196f3')
plt.ylabel(ylabel='Nr. of Houses')
plt.xlabel(xlabel='Price in $1000s')
plt.show()

In [None]:
#Distribution of RM, the average number of rooms in the area.
#RM is again a continuous variable. No bins argument is given here, so matplotlib decides for itself.
plt.figure(figsize=(10, 6))
plt.hist(x=dataset['RM'], edgecolor='black', color='#00796b')
plt.ylabel(ylabel='Nr. of Houses')
plt.xlabel(xlabel='Average Number of Rooms')
plt.show()

In [None]:
#Distribution of RAD, the index of accessibility to highways.
#rwidth=0.5 draws each bar at half the width of its bin, leaving gaps between the bars.
plt.figure(figsize=(10, 6))
plt.hist(x=dataset['RAD'], bins=24, rwidth=0.5, edgecolor='black', color='#7b1fa2')
plt.ylabel(ylabel='Nr. of Houses')
plt.xlabel(xlabel='Accessibility to Highways')
plt.show()

In [None]:
#TASK FOR YOU
dataset['CHAS'].value_counts()
#Run the above statement and then plot a histogram for the feature CHAS, which shows whether the house is situated near the Charles River.

## Correlation

## $$ \rho _{XY} = corr(X,Y)$$
## $$ -1.0 \leq \rho _{XY} \leq +1.0 $$

In [None]:
#The pandas corr method computes the correlation between two columns directly.
#Here: correlation between PRICE (target) and RM (feature), with the default Pearson method spelled out.
#The resulting value shows moderate positive correlation.
dataset['PRICE'].corr(other=dataset['RM'], method='pearson')

In [None]:
#Same computation for PRICE (target) and PTRATIO (feature), again with the default Pearson method.
#The resulting value shows moderate negative correlation.
dataset['PRICE'].corr(other=dataset['PTRATIO'], method='pearson')

In [None]:
#All pairwise correlation coefficients at once (default Pearson method, spelled out).
#Observe that the correlation between INDUS and DIS is quite strong and may cause multicollinearity.
#The matrix is symmetric: the value for a pair of columns is the same in both orders,
#so the upper-right triangle mirrors the lower-left triangle.
#The correlation of a variable with itself (the main diagonal) is always 1.

#Spotting the larger values in this dense table is difficult; a heatmap makes it easier.
dataset.corr(method='pearson')

In [None]:
#A heatmap is a graphical representation of data that uses color-coding to represent different values.
#The color bar on the right side is the key.

#Set the seaborn style BEFORE creating the figure: set_style only affects axes created
#afterwards, so calling it after the heatmap left the first run of this cell unstyled.
#The style is global and keeps applying to the plots drawn in later cells.
sns.set_style('white')

#Generating a heatmap for all correlation coefficients; annot=True writes each value inside its cell.
plt.figure(figsize=(16, 10))
sns.heatmap(dataset.corr(), annot=True, annot_kws={"size": 14})
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

### Visualizing Correlation Through Scatter Plot

In [None]:
#Scatter plot of NOX against DIS to visualize their correlation.

#The coefficient from the corr method (rounded to 3 decimals) goes into the title,
#so the picture can be checked against the number.
nox_dis_corr = round(dataset['NOX'].corr(dataset['DIS']), ndigits=3)

plt.figure(figsize=(6, 4), dpi=300)
plt.scatter(x=dataset['DIS'], y=dataset['NOX'], s=80, color='lightgreen', alpha=0.6)
plt.xlabel('DIS - Distance from employment', fontsize=14)
plt.ylabel('NOX - Nitric Oxide Pollution', fontsize=14)
plt.title(f'DIS vs NOX (Correlation {nox_dis_corr})', fontsize=14)
plt.show()
#The plot shows quite strong negative correlation between the two features.

In [None]:
#Another scatter plot, this time between RM and PRICE.
#The rounded correlation coefficient is shown in the title for comparison with the plot.
rm_tgt_corr = round(dataset['RM'].corr(dataset['PRICE']), 3)
plt.figure(figsize=(9, 6))
plt.scatter(x=dataset['RM'], y=dataset['PRICE'], alpha=0.6, s=80, color='skyblue')
plt.title(f'RM vs PRICE (Correlation {rm_tgt_corr})', fontsize=14)
#RM is the average (not median) number of rooms, matching the RM histogram earlier in this notebook.
plt.xlabel('RM - Average nr of rooms', fontsize=14)
plt.ylabel('PRICE - property price in $1000s', fontsize=14)
plt.show()

### The PairPlot
- `pairplot` is a function of the seaborn package, which can be used to see all the pairwise scatterplots as well as the histograms at once.
- The diagonal plots are histograms.

In [None]:
%%time 
#The %%time line above is a notebook magic command, used to find the execution time of a piece of code.
#Since generating the pairplot takes some time, we measure it using this magic command.
#It prints the wall time at the end.

#Generating the pairplot - all pairwise scatter plots at once.
sns.pairplot(dataset)
plt.show()