<a href="https://colab.research.google.com/github/asadbek08/DataScienceMohirdev/blob/main/ProjectML_California.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import sklearn

URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
df = pd.read_csv(URL)

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state = 34)
train_set = train_set.loc[train_set['ocean_proximity']!="ISLAND"]
X_train = train_set.drop('median_house_value', axis=1)
y = train_set['median_house_value'].copy()

X_num = X_train.drop('ocean_proximity', axis=1)


# ***Pipeline***

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [None]:
num_pipe.fit_transform(X_num)

array([[ 0.78402308, -0.78971399,  0.57979495, ..., -0.53972566,
        -0.5834449 ,  0.29619968],
       [-1.20961885,  1.21093726, -2.19383145, ..., -0.48982248,
        -0.60442137,  0.73441233],
       [ 0.84897883, -0.89722562,  0.73828789, ..., -0.16545187,
        -0.61753167,  0.06049839],
       ...,
       [ 0.85397543, -0.95331865,  0.50054848, ..., -0.57002401,
        -0.22946694,  0.65638961],
       [ 0.58915582, -0.71492329,  0.26280908, ...,  0.12951151,
         0.52044194,  5.86383938],
       [ 0.78901968, -0.8411326 ,  0.02506967, ...,  0.88340584,
         0.31592134, -0.55456926]])

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
text_attrib = ['ocean_proximity']

full_pipe = ColumnTransformer([
    ('num', num_pipe, num_attribs),
    ('text', OneHotEncoder(), text_attrib)
])

In [None]:
X_prepared = full_pipe.fit_transform(X_train)

# ***Linear*** ***Regression***

In [None]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()
LR_model.fit(X_prepared, y)

RMSE

In [None]:
X_test = test_set.drop('median_house_value', axis=1)
y_test = test_set['median_house_value'].copy()

X_test_prepared = full_pipe.fit_transform(X_test)

y_test_predicted = LR_model.predict(X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_test_predicted)
RMSE = np.sqrt(mse)
print(RMSE)

67816.36508400599


# ***Decision Tree***

In [None]:
from sklearn.tree import DecisionTreeRegressor

DR_model = DecisionTreeRegressor()
DR_model.fit(X_prepared, y)

RMSE

In [None]:
y_test_predicted = DR_model.predict(X_test_prepared)

In [None]:
mse = mean_squared_error(y_test, y_test_predicted)
RMSE = np.sqrt(mse)
print(RMSE)

76947.95959165072


# ***Random Forest***

In [None]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)

RMSE

In [None]:
y_test_predicted = RF_model.predict(X_test_prepared)

In [None]:
mse = mean_squared_error(y_test, y_test_predicted)
RMSE = np.sqrt(mse)
print(RMSE)

56012.61827452363


# ***Cross Validation***

In [None]:
X = df.drop("median_house_value", axis=1)
y = df['median_house_value'].copy()

X_prepared = full_pipe.fit_transform(X)

In [None]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

Linear Regression

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse = np.sqrt(-scores)
display_scores(LR_rmse)

Scores: [85477.75043695 59711.08139208 88064.43852045 62773.17929279
 80776.17447132 69657.18594233 52473.9731118  91548.16404321
 78177.29708863 53543.54459487]
Mean: 72220.27888944557
Std.dev: 13784.370351390746


Decision Tree

In [None]:
scores =  cross_val_score(DR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
DR_rmse = np.sqrt(-scores)
display_scores(DR_rmse)

Scores: [137824.12772316  71059.2009412   89265.17579416  76871.8788169
  86858.78573011  85827.01704987  74267.22213621 109023.0874204
 108513.80756227  74702.37679589]
Mean: 91421.26799701551
Std.dev: 20030.092563291542


Random Forest

In [97]:
scores =  cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
RF_rmse = np.sqrt(-scores)
display_scores(RF_rmse)

Scores: [106838.36166745  48682.59444214  69414.81109948  59816.95956601
  61474.37335284  66030.84803168  48096.4437546   85380.89948657
  81065.56947519  54737.21739089]
Mean: 68153.80782668435
Std.dev: 17440.260480425673


# ***Save Model***

Save by pickle

In [98]:
import pickle

filename = "RF_model.dat"
with open("RF_model.dat", 'wb') as file:
  pickle.dump(RF_model, file)

In [99]:
with open(filename, 'rb') as file:
  model = pickle.load(file)

In [100]:
scores =  cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
RF_rmse = np.sqrt(-scores)
display_scores(RF_rmse)

Scores: [106166.69944926  49291.07361064  69568.67724327  60242.31400239
  61426.73584162  66257.27658054  49738.27194759  86337.98319335
  81154.55853604  55075.33596071]
Mean: 68525.89263654048
Std.dev: 17095.918400750365


In [101]:
import joblib

pipename = 'pipeline.jlb'
joblib.dump(full_pipe, pipename)

['pipeline.jlb']