In [1]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn kutubxonasi

2-STEP: Import data

In [2]:
# Address of the online dataset (housing.csv in the handson-ml2 GitHub repository)
URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
df = pd.read_csv(URL)

3-STEP: SPLITTING DATA INTO TRAIN AND TEST SETS


In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the target column from the features of the training split.
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns="median_house_value")

# Numeric-only view of the training features (the text column is removed).
X_num = X_train.drop(columns="ocean_proximity")

4-STEP: ADD NEW COLUMNS USING A CUSTOM TRANSFORMER

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns we need in the numeric feature matrix
# (total_rooms, total_bedrooms, population, households)
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing numeric columns.

    Always appends rooms-per-household and population-per-household;
    bedrooms-per-room is appended as well when ``add_bedrooms_per_room``
    is true. Columns are located by the module-level ``*_ix`` indices.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the optional bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``: this is a pure transformer, not an estimator."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is any 2-D array-like whose columns follow the ``*_ix`` layout.
        """
        # Positional slicing (X[:, i]) only works on ndarrays; coercing here lets
        # the transformer also accept DataFrames or nested lists. For an ndarray
        # input this is a no-op.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

5-STEP: PREPARE NUMERIC FEATURES FOR MACHINE LEARNING

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in order: fill missing values with the column
# median, append the ratio features, then scale every column with StandardScaler.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ('std_scaler', StandardScaler()),
])

6-STEP: PREPARE CATEGORICAL (STRING) FEATURES FOR MACHINE LEARNING

In [6]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)          # names of the numeric columns
cat_attribs = ['ocean_proximity']  # the single text (categorical) column

# Route the numeric columns through num_pipeline and one-hot encode the
# categorical one. handle_unknown='ignore' encodes a category that was not seen
# during fit as all zeros instead of raising, so transform() keeps working on
# data that was not part of the training split.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

7-STEP: X DATA IS READY

In [7]:
# Fit the preprocessing pipeline on the training features and transform them in one step
X_prepared = full_pipeline.fit_transform(X_train)

In [8]:
# Preview the first five prepared rows (all columns)
X_prepared[0:5,:]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

MACHINE LEARNING ALGORITHMS

1-option : Linear regression

In [9]:

from sklearn.linear_model import LinearRegression

# Train a linear regression model on the prepared training features and their labels
LR_model = LinearRegression()
LR_model.fit(X_prepared, y)

In [10]:
# Pick 5 rows from the training features. The fixed seed makes the sample
# (and every output derived from it) the same on each run.
test_data = X_train.sample(5, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
11498,-118.04,33.72,14.0,4494.0,1048.0,2222.0,963.0,4.7821,<1H OCEAN
10867,-117.88,33.7,17.0,5122.0,1544.0,2966.0,1339.0,3.4835,<1H OCEAN
5163,-118.29,33.96,39.0,1340.0,409.0,1463.0,367.0,1.5294,<1H OCEAN
13767,-117.12,34.04,25.0,2495.0,438.0,1071.0,405.0,4.8173,INLAND
18284,-122.1,37.38,37.0,4167.0,612.0,1577.0,597.0,7.5655,NEAR BAY


In [11]:
# Extract the prices that correspond to the rows above
# (these are exactly the values we have to predict)
test_label = y.loc[test_data.index]
test_label

11498    169400.0
10867    116700.0
5163     111400.0
13767    146600.0
18284    500001.0
Name: median_house_value, dtype: float64

In [12]:
# Run the sampled rows through the pipeline with transform() only (no refit)
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[ 0.76899481, -0.90009767, -1.15919286,  0.85165616,  1.2160142 ,
         0.69967602,  1.21539643,  0.47333714, -0.32194007, -0.06819499,
         0.35076   ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.84877172, -0.90945834, -0.92113763,  1.14044744,  2.39980088,
         1.35401696,  2.20238594, -0.20861606, -0.67439809, -0.0761656 ,
         1.5275258 ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.64434338, -0.78776971,  0.8246007 , -0.59873826, -0.30906582,
         0.03214274, -0.34908695, -1.23480175, -0.74729064,  0.07681677,
         1.5926904 ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.22771205, -0.75032705, -0.28632369, -0.06760143, -0.23985249,
        -0.31261754, -0.24933801,  0.49182224,  0.30379844, -0.03908286,
        -0.64329253,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-1.25534432,  0.81290382,  0

In [13]:
# Predict the target for the sampled rows with the linear regression model
predicted_data = LR_model.predict(test_data_prepared)
predicted_data

array([278460.80462163, 258497.42427678, 141410.02545423, 185952.43970543,
       394700.8494393 ])

In [14]:
# Show predictions next to the true labels for a side-by-side comparison
pd.DataFrame({'Predected': predicted_data, 'Real value': test_label})

Unnamed: 0,Predected,Real value
11498,278460.804622,169400.0
10867,258497.424277,116700.0
5163,141410.025454,111400.0
13767,185952.439705,146600.0
18284,394700.849439,500001.0


EVALUATING THE ALGORITHMS

In [15]:
# Test-split features: every column except the target
X_test = test_set.drop(columns='median_house_value')
X_test


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [16]:
# Test-split labels: an independent copy of the target column
y_test = test_set['median_house_value'].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [19]:
# Prepare the test features with transform() only, so the pipeline is not refitted on them
X_test_prepared = full_pipeline.transform(X_test)
X_test_prepared

array([[ 0.28534728,  0.1951    , -0.28632369, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06097472, -0.23549054,  0.11043502, ...,  0.        ,
         0.        ,  0.        ],
       [-1.42487026,  1.00947776,  1.85617335, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.23041404,  0.78014149, -0.28632369, ...,  0.        ,
         0.        ,  0.        ],
       [-0.08860699,  0.52740357,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.60445493, -0.66608108, -0.92113763, ...,  0.        ,
         0.        ,  0.        ]])

In [20]:
# Linear regression predictions for the whole test split
y_predicted = LR_model.predict(X_test_prepared)
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

**Root mean square error (RMSE)**

In [21]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the linear regression predictions on the test split
lin_mse = mean_squared_error(y_test, y_predicted)
# Compute the RMSE (square root of the MSE)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

72701.32600762138


2-option : DecisionTree

In [22]:
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so the fitted tree (and the scores reported below) are the same
# on every run, matching the random_state used for the train/test split
Tree_model = DecisionTreeRegressor(random_state=42)
Tree_model.fit(X_prepared, y)

In [23]:
# Decision tree predictions for the test split (rebinds y_predicted)
y_predicted = Tree_model.predict(X_test_prepared)

In [24]:
# Score the decision tree under its own names, so this cell does not overwrite
# the linear-regression results held in lin_mse / lin_rmse
tree_mse = mean_squared_error(y_test, y_predicted)
# Compute the RMSE (square root of the MSE)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

72276.73129821123


3-option : RandomForest


In [25]:
from sklearn.ensemble import RandomForestRegressor
# Fix the seed so the fitted forest (and the scores reported below) are the same
# on every run, matching the random_state used for the train/test split
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [26]:
# Random forest predictions for the test split (rebinds y_predicted)
y_predicted = RF_model.predict(X_test_prepared)
# Score the forest under its own names instead of reusing the lin_* names,
# which belong to the linear regression
rf_mse = mean_squared_error(y_test, y_predicted)
# Compute the RMSE (square root of the MSE)
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

50331.12249130154


4-option : Cross-Validation

In [27]:
# Build features and labels from the full dataset `df` for cross-validation.
# NOTE(review): this rebinds `y` and `X_prepared`, which earlier cells bound to the
# training split only; re-running an earlier fit cell after this one would train
# on all rows, including the test split.
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"].copy()

# transform() only: the pipeline is not refitted here, it was fitted by the
# earlier fit_transform call on X_train
X_prepared = full_pipeline.transform(X)

In [28]:
def display_scores(scores):
    """Print the scores, then their mean and standard deviation, one per line.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Std.dev:", scores.std()),
    )
    for label, value in report:
        print(label, value)

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
# 10-fold cross-validation of the linear regression.
# The scorer is the *negative* MSE, so the sign is flipped before taking
# the square root to obtain one RMSE per fold.
scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)

In [31]:
# Per-fold RMSE of the linear regression, with mean and standard deviation
display_scores(LR_rmse_scores)

Scores: [84188.51219065 61197.24357613 86752.24346334 62289.14292385
 80540.40041898 68919.39949642 52503.82940087 90910.07884989
 77674.67507925 53941.60539478]
Mean: 71891.71307941683
Std.dev: 13249.525989444988


In [32]:
# Decision tree: 10-fold cross-validation.
# The scorer is the negative MSE, hence the sign flip before the square root.
scores = cross_val_score(Tree_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
# Stored under a tree-specific name so the linear-regression scores in
# LR_rmse_scores are not overwritten
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)


Scores: [120964.86214435  72594.56978476  83753.87649865  74848.76308466
  89921.26943115  77834.58090122  67202.81495448 101427.32649373
  95188.10164582  76091.11752061]
Mean: 85982.72824594345
Std.dev: 15451.969038455672


In [33]:
# Random forest: 10-fold cross-validation.
# The scorer is the negative MSE, hence the sign flip before the square root.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
# Stored under a forest-specific name so the linear-regression scores in
# LR_rmse_scores are not overwritten
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)

Scores: [96946.01971897 47790.00665916 65357.19445816 56692.49660928
 60987.04855599 59769.67844362 46750.15359403 79241.83418425
 74629.47146429 49055.18772354]
Mean: 63721.90914112724
Std.dev: 15152.061317097347


SAVING THE MODEL

1-option : Using pickle

In [34]:
#by pickle

import pickle

filename = 'RF_model.pkl' # any file name of our choosing
# Serialize the random forest model to disk; the with-block closes the file afterwards
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

In [35]:
# Load the model back from the pickle file.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open(filename, 'rb') as file:
    model = pickle.load(file)

In [36]:
# 5-fold cross-validation of the model restored from the pickle file.
# NOTE(review): cross_val_score presumably refits clones of the estimator on each
# fold, so these scores may not reflect the fitted state restored from disk. Verify.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
# Stored under its own name: these are not linear-regression scores
pickle_rmse_scores = np.sqrt(-scores)
display_scores(pickle_rmse_scores)

Scores: [76880.90486093 64177.42525249 61211.91598369 82044.31348126
 62173.81973591]
Mean: 69297.6758628558
Std.dev: 8512.672175831616


2-option : Using joblib

In [37]:
#by joblib
import joblib

filename = 'RF_model.jbl' # any file name of our choosing
# Serialize the random forest model to disk with joblib
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [38]:
# Load the model back from the joblib file (rebinds `model`)
model = joblib.load(filename)

In [39]:
# 5-fold cross-validation of the model restored from the joblib file.
# NOTE(review): cross_val_score presumably refits clones of the estimator on each
# fold, so these scores may not reflect the fitted state restored from disk. Verify.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
# Stored under its own name: these are not linear-regression scores
joblib_rmse_scores = np.sqrt(-scores)
display_scores(joblib_rmse_scores)

Scores: [77262.47229019 63718.21993155 61194.09623775 81074.68587104
 62334.92822594]
Mean: 69116.88051129295
Std.dev: 8333.675035076238


In [40]:
# Save the fitted preprocessing pipeline so the same transformations can be reused later.
# NOTE(review): the pipeline contains CombinedAttributesAdder, a class defined in this
# notebook; loading this file elsewhere presumably requires that class to be
# available there. Verify.
filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)

['pipeline.jbl']