# Boston Housing Price Prediction using Scikit-Learn

In [None]:
import numpy as np
import pandas as pd

In [None]:
# The data set is a plain-text table whose columns are separated by runs of
# whitespace rather than commas, so a bare pd.read_csv('housing.data') would
# load every row into a single column.
# sep=r'\s+' splits on any run of whitespace; it is the documented replacement
# for delim_whitespace=True, which is deprecated since pandas 2.2.
# header=None tells pandas the file has no header row, so the columns get the
# default integer labels 0..13.
df = pd.read_csv('housing.data', sep=r'\s+', header=None)

In [None]:
# Preview the first five rows to confirm the file was parsed into separate columns.
df.head(n=5)

## Boston Housing Price Prediction Features Abbreviation Cheatsheet

| Code   | Description   |
|:---|:---|
|**CRIM** | per capita crime rate by town |
|**ZN**  | proportion of residential land zoned for lots over 25,000 sq.ft. | 
|**INDUS**  | proportion of non-retail business acres per town | 
|**CHAS**  | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) | 
|**NOX**  | nitric oxides concentration (parts per 10 million) | 
|**RM**  | average number of rooms per dwelling | 
|**AGE**  | proportion of owner-occupied units built prior to 1940 | 
|**DIS**  | weighted distances to five Boston employment centres | 
|**RAD**  | index of accessibility to radial highways | 
|**TAX**  | full-value property-tax rate per $10,000 | 
|**PTRATIO**  | pupil-teacher ratio by town | 
|**B**  | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town | 
|**LSTAT**  | % lower status of the population | 
|**MEDV**  | Median value of owner-occupied homes in \$1000's | 

In [None]:
# Descriptive labels for the 14 columns: 13 features followed by the MEDV target.
col_name = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

In [None]:
# Replace the default integer column labels with the descriptive names in col_name.
df.columns = col_name

In [None]:
# Preview the first five rows again, now with named columns.
df.head(n=5)

# Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Scatter-plot matrix of every pair of columns, with each column's own
# distribution on the diagonal.
# `height` (inches per facet) is the current name of the keyword formerly
# called `size`, which seaborn renamed in v0.9 and has since deprecated.
sns.pairplot(df, height=1.5)
plt.show()

In [None]:
# First subset of columns to examine more closely in a smaller pair plot.
col_study = [
    'ZN',
    'INDUS',
    'NOX',
    'RM',
]

In [None]:
# Pair plot restricted to the columns listed in col_study.
# `height` is the current name of the keyword formerly called `size`
# (renamed in seaborn v0.9; the old name is deprecated).
sns.pairplot(df[col_study], height=2.5)
plt.show()

In [None]:
# Second subset of columns to examine, this time including the MEDV target.
col_study = [
    'PTRATIO',
    'B',
    'LSTAT',
    'MEDV',
]

In [None]:
# Pair plot restricted to the columns listed in col_study.
# `height` is the current name of the keyword formerly called `size`
# (renamed in seaborn v0.9; the old name is deprecated).
sns.pairplot(df[col_study], height=2.5)
plt.show()

***

# Correlation Analysis and Feature Selection

In [None]:
# Pairwise Pearson correlation between all columns: values near +1 or -1
# indicate a strong linear relationship, values near 0 a weak one.
df.corr(method='pearson')

In [None]:
# The raw correlation table is hard to scan, so draw it as a heatmap instead.
plt.figure(figsize=(16, 10))
# annot=True writes each correlation coefficient inside its cell.
sns.heatmap(data=df.corr(), annot=True)
plt.show()

# Linear Regression with Scikit-Learn

In [None]:
# Quick reminder of what the frame looks like before building the model.
df.head(n=5)

In [None]:
# Feature matrix: RM (average number of rooms per dwelling) as a single-column
# 2-D array, i.e. the (n_samples, 1) layout that scikit-learn estimators expect.
X = df[['RM']].to_numpy()

In [None]:
# Target vector: MEDV, the median value of owner-occupied homes in $1000's.
# This is the quantity the model is trained to predict.
y = df['MEDV'].to_numpy()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the line MEDV ~ coef * RM + intercept to the data.
model.fit(X=X, y=y)

In [None]:
# Fitted slope(s), one per feature; with RM as the only feature this is the
# estimated change in MEDV (in $1000's) per additional room.
model.coef_

In [None]:
# Fitted intercept: the model's predicted MEDV when RM is 0.
model.intercept_

In [None]:
plt.figure(figsize=(12, 10))
# Scatter of rooms against median value with a fitted regression line.
# seaborn >= 0.12 accepts x and y as keyword arguments only, and regplot works
# on 1-D inputs, so the (n, 1) feature matrix is flattened before plotting.
sns.regplot(x=X.ravel(), y=y, color='green')
plt.xlabel('average number of rooms per dwelling')
plt.ylabel("Median value of owner-occupied homes in $1000's")
plt.show()

In [None]:
# Joint view of RM vs MEDV: regression scatter in the centre, with the marginal
# distribution of each variable along the axes.
# `height` is the current name of the keyword formerly called `size`
# (renamed in seaborn v0.9; the old name is deprecated).
sns.jointplot(x='RM', y='MEDV', data=df, kind='reg', height=12, color='purple')
plt.show()