<a href="https://colab.research.google.com/github/alishermutalov/ML-learning/blob/ml/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [18]:
from sklearn.model_selection import train_test_split

# Load the California housing data directly from the handson-ml2 GitHub repository.
df = pd.read_csv('https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true')

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical across kernel restarts.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Training features (everything except the target) and training labels.
X_train = train_set.drop(columns='median_house_value')
y = train_set['median_house_value'].copy()

# Numeric-only training features: drop the categorical `ocean_proximity` column.
X_num = X_train.drop(columns='ocean_proximity')

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from the housing count columns.

    The transformer keeps no fitted state: ``fit`` returns ``self`` unchanged
    and ``transform`` only appends new columns to the right of its input.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a ``bedrooms_per_room`` column is appended after
        ``rooms_per_household``.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (required by the estimator API)."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio column(s) appended.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix with ``total_rooms``, ``total_bedrooms`` and
            ``households`` at column positions 3, 4 and 6. A DataFrame is
            accepted and converted to an array.

        Returns
        -------
        numpy.ndarray
            ``X`` plus ``rooms_per_household`` and, when
            ``add_bedrooms_per_room`` is true, ``bedrooms_per_room``.
        """
        # Positional indices of the columns used below (order of the numeric frame).
        rooms_ix, bedrooms_ix, households_ix = 3, 4, 6

        # `X[:, i]` only works on arrays; coercing here lets the transformer be
        # called on a DataFrame too. An ndarray passes through unchanged.
        X = np.asarray(X)

        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household]


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Preprocessing chain for the numeric columns, applied in order:
#   1. fill missing values with each column's median,
#   2. append the ratio features from CombinedAttributesAdder,
#   3. standardise every column with StandardScaler.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

# Fit the numeric pipeline on the numeric training features and preview the
# transformed array; the result is only displayed, not stored.
num_pipeline.fit_transform(X_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.326196  ,
        -0.17491646, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.03584338,
        -0.40283542,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.14470145,
         0.08821601, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.49697313,
        -0.60675918,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.96545045,
         0.40217517, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.68544764,
        -0.85144571,  1.69520292]])

In [21]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch: every numeric column vs. the single
# categorical column.
num_attributes = list(X_num)
cat_attributes = ['ocean_proximity']

# Run the numeric pipeline on the numeric columns and one-hot encode
# `ocean_proximity`.
# NOTE(review): OneHotEncoder() is left at its defaults — presumably a category
# not seen during fit would raise at transform time; confirm whether
# handle_unknown='ignore' is wanted.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attributes),
    ('cat', OneHotEncoder(), cat_attributes)
])

In [22]:
# Fit the full preprocessing pipeline on the training features and keep the
# transformed matrix for model training.
X_prepared = full_pipeline.fit_transform(X_train)

In [23]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression trained on the prepared training features
# and the training labels.
LR_model = LinearRegression()
LR_model.fit(X_prepared, y)

In [24]:
# Draw 10 rows from the *training* features for a quick sanity check of the
# model (despite the name, these are not rows of the held-out test split).
# Seeded so the same rows are shown on every Restart & Run All.
test_data = X_train.sample(10, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
7489,-118.23,33.93,37.0,239.0,49.0,308.0,52.0,1.4028,<1H OCEAN
15996,-122.46,37.75,52.0,1207.0,152.0,465.0,162.0,10.7569,NEAR BAY
9781,-121.13,36.2,16.0,1868.0,443.0,1323.0,436.0,2.9559,<1H OCEAN
7222,-118.17,34.02,42.0,946.0,272.0,1191.0,261.0,2.45,<1H OCEAN
15720,-122.45,37.78,52.0,1345.0,291.0,560.0,294.0,3.7159,NEAR BAY
86,-122.27,37.81,40.0,880.0,451.0,582.0,380.0,0.977,NEAR BAY
18658,-121.94,36.98,24.0,3010.0,562.0,1360.0,504.0,4.2006,NEAR OCEAN
3424,-118.42,34.27,35.0,674.0,153.0,808.0,173.0,2.6667,<1H OCEAN
10100,-117.96,33.93,15.0,2014.0,419.0,839.0,390.0,4.7446,<1H OCEAN
13678,-117.24,34.04,5.0,1775.0,234.0,726.0,222.0,7.978,INLAND


In [25]:
# True house values for the sampled rows, looked up by their index labels.
test_labels = y.loc[test_data.index]
test_labels

Unnamed: 0,median_house_value
7489,105400.0
15996,500001.0
9781,163200.0
7222,132000.0
15720,494400.0
86,118800.0
18658,290700.0
3424,147800.0
10100,175400.0
13678,223900.0


In [28]:
# Reuse the already-fitted pipeline (transform only, no refit) on the sampled rows.
test_data_prepared = full_pipeline.transform(test_data)


In [29]:
# Linear-model predictions for the sampled rows.
predicted_labels = LR_model.predict(test_data_prepared)
predicted_labels

array([113134.88951745, 525327.98915043, 192117.78443457, 169919.70570666,
       261280.47099822, 215960.79280741, 258824.44401234, 158740.21729523,
       245591.02282314, 281454.73297121])

In [30]:
# Side-by-side comparison of predictions and true values. The frame takes its
# index from `test_labels`; the prediction array is matched to it by position.
pd.DataFrame({'Predicted': predicted_labels, 'Actual': test_labels})

Unnamed: 0,Predicted,Actual
7489,113134.889517,105400.0
15996,525327.98915,500001.0
9781,192117.784435,163200.0
7222,169919.705707,132000.0
15720,261280.470998,494400.0
86,215960.792807,118800.0
18658,258824.444012,290700.0
3424,158740.217295,147800.0
10100,245591.022823,175400.0
13678,281454.732971,223900.0


In [31]:
# Display the held-out test split (features and target together).
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [32]:
# Test features: the held-out split without the target column.
X_test = test_set.drop('median_house_value', axis=1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [33]:
# Test labels: an independent copy of the target column of the held-out split.
y_test = test_set['median_house_value'].copy()
y_test

Unnamed: 0,median_house_value
20046,47700.0
3024,45800.0
15663,500001.0
20484,218600.0
9814,278000.0
...,...
15362,263300.0
16623,266800.0
18086,500001.0
2144,72300.0


In [34]:
# Transform the test features with the pipeline fitted on the training data
# (transform only, so nothing is learned from the test rows).
X_test_prepared = full_pipeline.transform(X_test)

In [35]:
# Linear-model predictions for the held-out test set.
y_predicted = LR_model.predict(X_test_prepared)

In [36]:
# Preview the predicted house values.
y_predicted

array([ 62006.78388667, 121643.79441299, 266990.00756469, ...,
       447883.67421698, 117237.3708917 , 185697.15157583])

In [37]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the linear model on the test set, in the units of the target.
MAE = mean_absolute_error(y_test, y_predicted)
MAE


50903.22718681366

In [38]:
from sklearn.metrics import mean_squared_error
# Root mean squared error of the linear model on the test set: the square root
# brings the MSE back to the units of the target.
RMSE = np.sqrt(mean_squared_error(y_test, y_predicted))
RMSE

72717.21990877022

In [39]:
from sklearn.ensemble import RandomForestRegressor

# Random forest trained on the same prepared training data as the linear model.
# Seeded (same seed as the train/test split) so the reported metrics are
# reproducible across kernel restarts.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [40]:
# Random-forest predictions for the held-out test set.
# NOTE: this rebinds `y_predicted`, which previously held the linear model's predictions.
y_predicted = RF_model.predict(X_test_prepared)

In [41]:
# (This import repeats the one in the earlier RMSE cell; harmless but redundant.)
from sklearn.metrics import mean_squared_error
# Test-set RMSE of the random forest; rebinds `RMSE` from the linear-model value.
RMSE = np.sqrt(mean_squared_error(y_test, y_predicted))
RMSE

49689.739763370875

In [42]:
# Features and target of the *entire* dataset, used for cross-validation below.
# NOTE(review): this rebinds `y`, which until now held the training-split labels;
# re-running an earlier cell that uses `y` after this one would silently pick up
# the full-data labels.
X = df.drop('median_house_value', axis=1)
y = df['median_house_value'].copy()

In [43]:
# Refit the preprocessing pipeline on the full dataset and rebind `X_prepared`
# (previously the prepared training split).
# NOTE(review): the pipeline is fitted once on all rows, before any CV fold is
# split off, and those rows include `test_set` — presumably acceptable for a
# rough model comparison; confirm.
X_prepared = full_pipeline.fit_transform(X)

In [44]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the linear model. The scorer yields negated MSE
# values, so flip the sign before taking the square root to get per-fold RMSE.
scores = cross_val_score(LR_model, X_prepared, y, scoring='neg_mean_squared_error', cv=5)
LR_rmse_scores = np.sqrt(-scores)

In [45]:
def display_scores(scores):
    """Print a collection of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of float
        The scores to summarise, e.g. one RMSE per cross-validation fold.
        A plain list or tuple is converted to a NumPy array first; any other
        object is used as given and must provide ``mean()`` and ``std()``.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Lists/tuples have no .mean()/.std(); convert them so they are accepted too.
    if isinstance(scores, (list, tuple)):
        scores = np.asarray(scores)
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('Standard deviation: ', scores.std())

In [46]:
# Summarise the linear model's per-fold RMSE.
display_scores(LR_rmse_scores)

Scores:  [73397.21068666 74817.28199289 75436.69114751 76532.07576047
 66219.27245902]
Mean:  73280.50640930946
Standard deviation:  3673.095693119273


In [47]:
# Same 5-fold protocol for the random forest, followed by the summary.
# `scores` is rebound here; the linear model's values live on in `LR_rmse_scores`.
scores = cross_val_score(RF_model, X_prepared, y, scoring='neg_mean_squared_error', cv=5)
RF_rmse_scores = np.sqrt(-scores)
display_scores(RF_rmse_scores)

Scores:  [84476.97934087 68595.22726692 65587.29269004 98367.19908773
 67629.55421457]
Mean:  76931.25052002366
Standard deviation:  12658.073562107951


In [48]:
import joblib
# Persist the random forest to disk with joblib.
# NOTE(review): only the model is saved here; the fitted `full_pipeline` needed
# to prepare new inputs is not written alongside it.
joblib.dump(RF_model, 'RF_model.jbl')

['RF_model.jbl']

In [49]:
# Read the random forest back from the file written above.
# `model` is rebound to the linear model by a later cell.
model = joblib.load('RF_model.jbl')

In [52]:
import pickle
# Persist the linear model with the standard-library pickle module; the `with`
# block closes the file once the dump is done.
with open('LR_model.pkl', 'wb') as f:
    pickle.dump(LR_model, f)

In [53]:
# Load the linear model back from disk; this rebinds `model`, which previously
# held the reloaded random forest.
# SECURITY: unpickling can execute arbitrary code — only load pickle files from
# a trusted source (this one was written by this notebook).
with open('LR_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [54]:
# Cross-validate the reloaded linear model with the same 5-fold protocol and
# rebind `LR_rmse_scores`; the printed scores match the earlier `LR_model` run.
# NOTE(review): presumably cross_val_score refits a fresh copy of the estimator
# on each fold, in which case this does not exercise the coefficients restored
# from disk — confirm, and compare `model.predict(...)` outputs if the goal is
# to validate the saved file.
scores = cross_val_score(model, X_prepared, y, scoring='neg_mean_squared_error', cv=5)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores:  [73397.21068666 74817.28199289 75436.69114751 76532.07576047
 66219.27245902]
Mean:  73280.50640930946
Standard deviation:  3673.095693119273
