In [36]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import plotly.express as px
import plotly.figure_factory as ff

In [37]:
def load_dataset():
	return pd.read_csv("boston.csv")
boston_dataframe = load_dataset()


In [38]:
def print_summarize_dataset(dataset):
    print("Dataset dimension:")
    print(dataset.shape)
    print("First 10 rows of dataset: ")
    print(dataset.head().to_string(index=False))
    print("Statistical summary: ")
    print(dataset.describe().to_string(index=False))
    print(dataset.info())

print_summarize_dataset(boston_dataframe)


Dataset dimension:
(506, 14)
First 10 rows of dataset: 
   CRIM   ZN  INDUS  CHAS   NOX    RM  AGE    DIS  RAD   TAX  PTRATIO      B  LSTAT  MDEV
0.00632 18.0   2.31   0.0 0.538 6.575 65.2 4.0900  1.0 296.0     15.3 396.90   4.98  24.0
0.02731  0.0   7.07   0.0 0.469 6.421 78.9 4.9671  2.0 242.0     17.8 396.90   9.14  21.6
0.02729  0.0   7.07   0.0 0.469 7.185 61.1 4.9671  2.0 242.0     17.8 392.83   4.03  34.7
0.03237  0.0   2.18   0.0 0.458 6.998 45.8 6.0622  3.0 222.0     18.7 394.63   2.94  33.4
0.06905  0.0   2.18   0.0 0.458 7.147 54.2 6.0622  3.0 222.0     18.7 396.90   5.33  36.2
Statistical summary: 
      CRIM         ZN      INDUS       CHAS        NOX         RM        AGE        DIS        RAD        TAX    PTRATIO          B      LSTAT       MDEV
506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
  3.593761  11.363636  11.136779   0.069170   0.554695   6.284634  68.5749

In [39]:
def clean_dataset(boston_dataframe):
    boston_dataframe["ZN"] = boston_dataframe["ZN"].astype(int)
    boston_dataframe["CHAS"] = boston_dataframe["CHAS"].astype(int)
    boston_dataframe["AGE"] = boston_dataframe["AGE"].astype(int)
    boston_dataframe["RM"] = round(boston_dataframe["RM"]).astype(int)
    boston_dataframe["RAD"] = boston_dataframe["RAD"].astype(int)
    boston_dataframe["TAX"] = boston_dataframe["TAX"].astype(int)
    boston_dataframe["PTRATIO"] = round(boston_dataframe["PTRATIO"]).astype(int)
    boston_dataframe["B"] = round(boston_dataframe["B"]).astype(int)
    boston_dataframe["MDEV"] = round(boston_dataframe["MDEV"]).astype(int)
clean = clean_dataset(boston_dataframe)



In [40]:
cor = boston_dataframe.corr()
cor = px.imshow(cor, title="Correlation",
                text_auto=True, width=1500, height=950)
cor.show()


In [41]:
def print_histograms(boston_dataframe):
    fig_1 = px.histogram(boston_dataframe, x="CRIM", y="RAD", title= "CRIM and RAD",text_auto=True)
    fig_2 = px.histogram(boston_dataframe, x="DIS", y="ZN", title="DIS and ZN", text_auto=True)
    fig_3 = px.histogram(boston_dataframe, x="NOX", y="INDUS", title="NOX and INDUS", text_auto=True)
    fig_4 = px.histogram(boston_dataframe, x="NOX", y="CHAS", title="NOX and CHAS", text_auto=True)
    fig_5 = px.histogram(boston_dataframe, x="NOX", y="AGE", title="NOX and AGE", text_auto=True)
    fig_6 = px.histogram(boston_dataframe, x="RM", y="CHAS", title="RM and CHAS", text_auto=True)
    fig_7 = px.histogram(boston_dataframe, x="AGE", y="INDUS", title="AGE and INDUS", text_auto=True)
    fig_8 = px.histogram(boston_dataframe, x="INDUS", y="LSTAT", title="INDUS and LSTAT", text_auto=True)
    fig_9 = px.histogram(boston_dataframe, x="RM", y="MDEV", title="RM and MDEV", text_auto=True)

    fig_1.show()
    fig_2.show()
    fig_3.show()
    fig_4.show()
    fig_5.show()
    fig_6.show()
    fig_7.show()
    fig_8.show()
    fig_9.show()

print_histograms(boston_dataframe)


In [42]:
def compute_correlations_matrix(boston_dataframe):
	return boston_dataframe.corr()

correlations = compute_correlations_matrix(boston_dataframe)
print(correlations['MDEV'])


CRIM      -0.386672
ZN         0.358083
INDUS     -0.483168
CHAS       0.175863
NOX       -0.424821
RM         0.675437
AGE       -0.375892
DIS        0.248844
RAD       -0.380368
TAX       -0.467151
PTRATIO   -0.504551
B          0.331937
LSTAT     -0.738176
MDEV       1.000000
Name: MDEV, dtype: float64


In [47]:
def print_scatter_matrix(boston_dataframe):
    fig = ff.create_scatterplotmatrix(boston_dataframe, diag="box", width=3000, height=1000)
    fig.show()

    fig_rm = px.scatter(boston_dataframe, x="RM", y="MDEV", title="MDEV and RM")
    fig_lstat = px.scatter(boston_dataframe, x="LSTAT", y="MDEV", title="MDEV and LSTAT")
    fig_age = px.scatter(boston_dataframe, x="AGE", y="MDEV", title="MDEV and AGE")
    fig_crim = px.scatter(boston_dataframe, x="CRIM", y="MDEV", title="MDEV and CRIME")

    # LSTAT
    lstat_age = px.scatter(boston_dataframe, x="AGE", y="LSTAT", title="LSTAT and AGE")
    lstat_crim = px.scatter(boston_dataframe, x="LSTAT", y="CRIM", title="LSTAT and CRIM")
    lstat_indus = px.scatter(boston_dataframe, x="LSTAT", y="INDUS", title="LSTAT and INDUS")

    fig_rm.show()
    fig_lstat.show()
    fig_age.show()
    fig_crim.show()

    print("\n")

    lstat_age.show()
    lstat_crim.show()
    lstat_indus.show()

print_scatter_matrix(boston_dataframe)






`From the first four diagrams, you can see only one real connection between them, which is MDEV and LSTAT. That means the slightest change in one of them will lead to a change in the data. But wait, what if there are other connections with LSTAT, like AGE or INDUS? If there are connections, they will directly affect MDEV`

In [44]:
def boston_fit_model(boston_dataframe):
    model_dataset = boston_dataframe[["RM", "MDEV"]]
    regressor = LinearRegression()
    x = model_dataset.iloc[:, :-1].values
    y = model_dataset.iloc[:, 1].values
    regressor.fit(x, y)
    
    return regressor


regressor = boston_fit_model(boston_dataframe)
array_to_predict_1 = boston_dataframe["RM"].values.reshape(-1,1)

def boston_predict(estimator, array_to_predict):
    return estimator.predict(array_to_predict)


predict = boston_predict(regressor, array_to_predict_1)


In [45]:
data = [1, 2, 3]
data = np.array(data).reshape(-1, 1)
estimator = boston_fit_model(boston_dataframe)
print(boston_predict(estimator, data))
    

[-22.36776753 -13.8446567   -5.32154587]


In [46]:
def print_model_prediction_evaluator(base_test, prediction):
    print('Mean Absolute Error:', metrics.mean_absolute_error(base_test, prediction))
    print('Mean Squared Error:', metrics.mean_squared_error(base_test, prediction))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(base_test, prediction)))


print_model_prediction_evaluator(array_to_predict_1, predict)


Mean Absolute Error: 16.27072005991654
Mean Squared Error: 294.30934105019185
Root Mean Squared Error: 17.155446396121317
