# A simple end-to-end regression predictive modelling example

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# The file is read without a header row, so the column labels are supplied here.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
# Fields are separated by runs of whitespace, hence the regex separator.
Boston = pd.read_csv(
    "/kaggle/input/boston-house-prices/housing.csv",
    sep=r"\s+",
    header=None,
    names=column_names,
)

# All about dataset

In [None]:
# Preview the first rows of the loaded DataFrame.
Boston.head()

In [None]:
# Print the column dtypes and non-null counts.
Boston.info()

> Except for **CHAS** and **RAD**, every column is of float type.

In [None]:
# Summary statistics for every column.
Boston.describe()

> **CHAS** and **RAD** are categorical variables.

# EDA

> ## *Target variable*

In [None]:
# Histogram of the target variable with a box plot drawn in the margin.
medv_hist = px.histogram(
    Boston,
    x='MEDV',
    marginal='box',
    title='Distribution of MEDV',
)
medv_hist

> There are a few outliers, and most of the houses have **MEDV** in the range of 20-25.

> ## Categorical variable

In [None]:
# Side-by-side count plots for the two categorical columns.
fig, (chas_ax, rad_ax) = plt.subplots(1, 2, figsize=(10, 5))

sns.countplot(data=Boston, x='CHAS', ax=chas_ax)
chas_ax.set_title('Distribution of CHAS')

sns.countplot(data=Boston, x='RAD', ax=rad_ax)
# The trailing semicolon suppresses the notebook's echo of the returned Text object.
rad_ax.set_title('Distribution of RAD');

> 1. Most tracts are not bounded by the Charles River.
> 2. The larger the index value, the greater the accessibility to radial highways. Most of the towns have good accessibility to radial highways.

In [None]:
# Bar plots of MEDV for each level of the two categorical columns
# (seaborn's barplot aggregates with the mean by default).
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

sns.barplot(ax=ax[0], data=Boston, x='CHAS', y='MEDV')
# Title spelling corrected ("Avgerage" -> "Average").
ax[0].set_title("Average MEDV values \n for tracts bounded and non bounded by river CHARLES")

sns.barplot(ax=ax[1], data=Boston, x='RAD', y='MEDV')
# The trailing semicolon suppresses the notebook's echo of the returned Text object.
ax[1].set_title("Average MEDV values \n based on radial highway accessibility");

> 1. **MEDV** does not differ much between river-bound and non-river-bound areas, but it is higher for river-bound areas. This suggests that people prefer river-bound areas and that there is demand for them.
> 2. Surprisingly, index 24 has a low **MEDV** value.

> ## Numerical Data

In [None]:
# Names of all float64 columns, in DataFrame column order.
# The dtype is read from the column itself rather than from the scalar in the
# first row, so this does not depend on a row labelled 0 existing and does not
# fail on columns whose elements have no `.dtype` attribute.
numerical = [col for col in Boston.columns if Boston[col].dtype == 'float64']

In [None]:
# Exclude the target column from the feature list. It is removed by name rather
# than by popping the last element, so the result does not depend on column
# order and re-running this cell does not strip further columns.
if 'MEDV' in numerical:
    numerical.remove('MEDV')
numerical

In [None]:
# One histogram (with KDE) per numerical column, laid out on a 4x3 grid and
# filled row by row. Each panel's title is the skewness of that column.
grid_cols = 3
fig, ax = plt.subplots(4, grid_cols, figsize=(15, 15))

for position, col in enumerate(numerical):
    row, column = divmod(position, grid_cols)
    panel = ax[row, column]
    sns.histplot(data=Boston, x=col, kde=True, ax=panel)
    panel.set_title(Boston[col].skew())
fig.tight_layout()

> Largely skewed attributes - **CRIM, ZN, B**

In [None]:
# Add the target back to the list. The membership check keeps the list free of
# duplicates if this cell is executed more than once.
if 'MEDV' not in numerical:
    numerical.append('MEDV')

In [None]:
# Scatter plot with a fitted regression line of MEDV against every column in
# `numerical`, laid out on a 4x3 grid and filled row by row.
grid_cols = 3
fig, ax = plt.subplots(4, grid_cols, figsize=(15, 15))

for position, col in enumerate(numerical):
    row, column = divmod(position, grid_cols)
    sns.regplot(data=Boston, x=col, y='MEDV', ax=ax[row, column])
fig.tight_layout()

> We can see that the highly skewed attributes are not strongly correlated with **MEDV**.

> Let's apply a log transformation to the skewed attributes.

In [None]:
# Work on a copy so the original DataFrame is left untouched while the effect
# of log1p on the skewed columns is inspected.
skewed_cols = ['CRIM', 'B', 'ZN']

Boston1 = Boston.copy()
Boston1[skewed_cols] = Boston1[skewed_cols].apply(np.log1p)

In [None]:
# Regression plots of MEDV against each log-transformed column, one per panel.
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

for panel, col in zip(ax, ['CRIM', 'ZN', 'B']):
    sns.regplot(data=Boston1, x=col, y='MEDV', ax=panel)

fig.tight_layout()

> Since **ZN** and **B** didn't give a better result after the log transformation, we are only transforming **CRIM**.

In [None]:
# Replace CRIM in the main DataFrame with log1p(CRIM).
# NOTE: this overwrites the column in place, so executing the cell again
# applies the transform a second time to the already-transformed values.
Boston['CRIM'] = np.log1p(Boston['CRIM'])

# ML

In [None]:
# Separate the feature matrix (every column except MEDV) from the target.
X = Boston.drop(columns='MEDV')
y = Boston['MEDV']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Fit an ordinary least-squares linear regression on the training split.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

# For scikit-learn regressors, `score` is the R^2 coefficient of determination
# computed on the data passed in (here, the held-out split).
test_score = reg.score(X_test, y_test)
print('Model score:', test_score)