In [None]:
%pip install scikit-learn
%pip install matplotlib
%pip install plotly-express
%pip install seaborn

In [ ]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sn
import numpy as np

In [ ]:
# Path of the CSV file that the next cell passes to load_dataset().
data = "boston.csv"

In [ ]:
def load_dataset(path):
    """Read the CSV at ``path`` and return its contents as a DataFrame.

    ``path`` is handed to ``pandas.read_csv`` unchanged, so anything that
    function accepts (a file path, a URL, an open file-like object) works.
    """
    dataframe = pd.read_csv(filepath_or_buffer=path)
    return dataframe
# Load the CSV named by `data`; the cells below read this DataFrame.
boston_dataframe = load_dataset(data)

In [ ]:
def print_summarize_dataset(dataset):
    """Print the shape, the first 10 rows and ``describe()`` of ``dataset``.

    Each section is a heading line followed by the printed value. Nothing is
    returned.
    """
    # Values are wrapped in callables so each one is computed only after its
    # heading has been printed, one section at a time.
    report_sections = (
        ("Dataset dimension:", lambda: dataset.shape),
        ("First 10 rows of dataset:", lambda: dataset.head(10)),
        ("Summarize", lambda: dataset.describe()),
    )
    for heading, compute_value in report_sections:
        print(heading)
        print(compute_value())

In [ ]:
def clean_dataset(boston_dataframe):
    """Return the number of missing values in each column.

    Despite the name, the frame is not modified: the result is a Series
    indexed by column name holding the per-column count of null entries.
    """
    missing_mask = boston_dataframe.isna()
    return missing_mask.sum()

In [ ]:
def print_histograms(boston_dataframe, columns=None):
    """Show one 20-bin histogram per column in a 3-wide grid of subplots.

    Args:
        boston_dataframe: DataFrame containing the columns to plot.
        columns: Optional iterable of column names. When omitted, the 13
            feature columns plotted by the original notebook are used.

    Returns:
        Whatever ``fig.show()`` returns (the figure is displayed as a side
        effect).
    """
    if columns is None:
        columns = [
            "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
            "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
        ]
    else:
        columns = list(columns)

    grid_width = 3
    # Ceiling division: the default 13 columns need 5 rows of 3 subplots.
    grid_height = max(1, (len(columns) + grid_width - 1) // grid_width)

    fig = make_subplots(rows=grid_height, cols=grid_width, subplot_titles=columns)

    # Fill the grid left to right, top to bottom (plotly rows/cols are 1-based).
    for position, column in enumerate(columns):
        fig.add_trace(
            go.Histogram(x=boston_dataframe[column], name=column, nbinsx=20),
            row=position // grid_width + 1,
            col=position % grid_width + 1,
        )

    fig.update_layout(height=800, width=1600, title="Distribution of Boston Dataframe")

    # Horizontal legend placed just above the plotting area, right-aligned.
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ))

    return fig.show()
print_histograms(boston_dataframe)

In [ ]:
def compute_correlations_matrix(boston_dataframe):
    """Return the pairwise correlation matrix of the frame's columns.

    Uses ``DataFrame.corr()`` with its default settings; the result is a
    square DataFrame indexed and labelled by column name.
    """
    correlation_matrix = boston_dataframe.corr()
    return correlation_matrix
# Correlation matrix of the loaded dataset; reused by the heatmap and lookup cells below.
correlations = compute_correlations_matrix(boston_dataframe)

In [ ]:
# Render the correlation matrix as a 700x700 heatmap; text_auto=True writes
# each coefficient inside its cell.
fig = px.imshow(correlations, width=700, height=700, title="Correlation of dataframe", text_auto=True)
fig.show()

In [ ]:
# Scatter plot of the number of rooms (RM) against the median value (MDEV).
# The frame passed in is boston_dataframe (the previous `correlation_df` was
# never defined), and the axis titles now match the plotted columns.
fig = px.scatter(boston_dataframe, x='RM', y='MDEV')
fig.update_layout(title='Correlation coefficient between the median value and the number of rooms', xaxis_title='RM', yaxis_title='MDEV')
fig.show()

In [ ]:
# Display the MDEV column of the correlation matrix, i.e. the correlation of
# every column with MDEV.
correlations['MDEV']

In [ ]:
def print_scatter_matrix(boston_dataframe):
    """Draw a seaborn pair plot of every column pair and display it."""
    sn.pairplot(data=boston_dataframe)
    plt.show()


print_scatter_matrix(boston_dataframe)

In [ ]:
# Scatter plot of MDEV (y) against RM (x) taken from boston_dataframe.
fig = px.scatter(boston_dataframe, x="RM", y="MDEV", title="Median Value vs. Number of Rooms")
fig.show()

In [ ]:
# Three side-by-side scatter plots, each showing MDEV (y axis) against one
# of the listed columns (x axis).
compared_columns = ['LSTAT', 'AGE', 'CRIM']
fig = make_subplots(rows=1, cols=3, subplot_titles=compared_columns)

for subplot_col, column in enumerate(compared_columns, start=1):
    fig.add_trace(
        go.Scatter(
            x=boston_dataframe[column],
            y=boston_dataframe['MDEV'],
            mode='markers',
            name=column,
        ),
        col=subplot_col,
        row=1,
    )

fig.show()

In [ ]:
# Scatter plot of LSTAT (y) against AGE (x) taken from boston_dataframe.
fig = px.scatter(boston_dataframe, x='AGE', y='LSTAT')
fig.show()

In [ ]:
def boston_fit_model(boston_dataframe):
    """Fit a linear regression that predicts MDEV from RM.

    The single feature is passed as an (n_rows, 1) array and the target as
    a 1-D array. Returns the fitted ``LinearRegression`` instance.
    """
    # Double brackets keep the feature two-dimensional, as fit() requires.
    features = boston_dataframe[['RM']].values
    target = boston_dataframe['MDEV'].values
    model = LinearRegression()
    model.fit(features, target)
    return model

In [ ]:
def boston_predict(estimator, array_to_predict):
    """Return ``estimator.predict(array_to_predict)``.

    ``estimator`` is any object exposing a ``predict`` method; its result is
    passed back to the caller unchanged.
    """
    predictions = estimator.predict(array_to_predict)
    return predictions

In [ ]:
# Sample RM values to predict for, shaped (3, 1) because the model was fitted
# on a single feature column. A dedicated name is used so the CSV path held
# in `data` is not overwritten and the loading cell stays re-runnable.
rooms_to_predict = np.array([1, 2, 3]).reshape(-1, 1)
estimator = boston_fit_model(boston_dataframe)
y_pred = boston_predict(estimator, rooms_to_predict)
y_pred

In [ ]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (fixed seed for reproducibility) to evaluate the
# RM -> MDEV regression.
model_dataset = boston_dataframe[['RM', 'MDEV']]
x = model_dataset.iloc[:, :-1].values
y = model_dataset.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Fit on the training split only. Scoring a model that was fitted on the
# full dataset would evaluate it on rows it has already seen.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = boston_predict(regressor, X_test)
r2_score(y_test, y_pred)

In [ ]:
def print_model_prediction_evaluator(base_test, prediction):
    """Print MAE, MSE and RMSE of ``prediction`` against ``base_test``.

    The three metrics are written to stdout, one per line; nothing is
    returned.
    """
    mean_abs_error = metrics.mean_absolute_error(base_test, prediction)
    print('Mean Absolute Error:', mean_abs_error)
    mean_sq_error = metrics.mean_squared_error(base_test, prediction)
    print('Mean Squared Error:', mean_sq_error)
    # RMSE is the square root of the MSE computed above.
    print('Root Mean Squared Error:', np.sqrt(mean_sq_error))


print_model_prediction_evaluator(y_test, y_pred)