In [43]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn kutubxonasi

# Onlayn dataset joylashgan manzilini ko'rsatamiaz
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=47)

X_train = train_set.drop("median_house_value", axis=1)
Y = train_set["median_house_value"].copy()

X_num = X_train.drop("ocean_proximity", axis=1)

In [44]:
from sklearn.base import BaseEstimator, TransformerMixin
# bizga kerak ustunlar indekslari
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # bizni funksiyamiz faqat transformer. estimator emas
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: # add_bedrooms_per_room ustuni ixtiyoriy bo'ladi
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
          ('imputer', SimpleImputer(strategy='median')),
          ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room = True)),
          ('std_scaler', StandardScaler())
])

In [46]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [47]:
X_prepared = full_pipeline.fit_transform(X_train)

In [48]:
X_prepared[:5]

array([[ 1.13457498, -0.92454961, -1.56217152, -0.76871007, -0.88873485,
        -0.74981406, -0.90950767, -0.36475082,  0.32292104,  0.06230093,
        -0.62923556,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.23455722, -1.33745855, -1.00534875, -0.02440378,  0.57305529,
        -0.26559678,  0.59407809, -0.25820021, -0.71454095, -0.13223737,
         1.30521836,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.08958298, -0.81663023,  0.26738901, -0.63694963, -0.86688084,
        -0.82975988, -0.84563854, -0.10933576,  0.60083437, -0.02383541,
        -1.10560977,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-1.22500587,  0.92415633,  0.58557345, -0.56873324, -0.64105612,
        -0.38097313, -0.57685595, -0.07790412, -0.16337679,  0.04032646,
        -0.32428161,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-1.16501652,  0.78339192, -1

In [49]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()
LR_model.fit(X_prepared, Y)

In [50]:
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
10486,-117.68,33.48,15.0,1786.0,299.0,727.0,293.0,5.0527,231400.0,<1H OCEAN
16251,-121.26,37.96,43.0,527.0,133.0,367.0,152.0,2.5000,63600.0,INLAND
8883,-118.50,34.03,52.0,1506.0,208.0,547.0,186.0,7.8705,500001.0,<1H OCEAN
15209,-117.08,33.01,5.0,5659.0,931.0,2565.0,902.0,6.1949,238700.0,<1H OCEAN
11965,-117.42,34.02,9.0,5455.0,882.0,3015.0,858.0,4.2321,162800.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
3148,-118.34,35.27,10.0,2939.0,605.0,1167.0,446.0,2.3917,79000.0,INLAND
4588,-118.27,34.05,52.0,1292.0,864.0,2081.0,724.0,0.9563,275000.0,<1H OCEAN
18378,-121.87,37.22,17.0,2825.0,365.0,1052.0,345.0,8.0595,485000.0,<1H OCEAN
6927,-118.07,34.00,42.0,1392.0,351.0,1471.0,348.0,2.6300,143800.0,<1H OCEAN


In [51]:
X_test = test_set.drop('median_house_value', axis=1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
10486,-117.68,33.48,15.0,1786.0,299.0,727.0,293.0,5.0527,<1H OCEAN
16251,-121.26,37.96,43.0,527.0,133.0,367.0,152.0,2.5000,INLAND
8883,-118.50,34.03,52.0,1506.0,208.0,547.0,186.0,7.8705,<1H OCEAN
15209,-117.08,33.01,5.0,5659.0,931.0,2565.0,902.0,6.1949,<1H OCEAN
11965,-117.42,34.02,9.0,5455.0,882.0,3015.0,858.0,4.2321,INLAND
...,...,...,...,...,...,...,...,...,...
3148,-118.34,35.27,10.0,2939.0,605.0,1167.0,446.0,2.3917,INLAND
4588,-118.27,34.05,52.0,1292.0,864.0,2081.0,724.0,0.9563,<1H OCEAN
18378,-121.87,37.22,17.0,2825.0,365.0,1052.0,345.0,8.0595,<1H OCEAN
6927,-118.07,34.00,42.0,1392.0,351.0,1471.0,348.0,2.6300,<1H OCEAN


In [58]:
Y_test = test_set['median_house_value'].copy()
Y_test

10486    231400.0
16251     63600.0
8883     500001.0
15209    238700.0
11965    162800.0
           ...   
3148      79000.0
4588     275000.0
18378    485000.0
6927     143800.0
17733    164500.0
Name: median_house_value, Length: 4128, dtype: float64

In [53]:
X_test_prepared = full_pipeline.transform(X_test)

In [54]:
X_test_prepared[0:5]

array([[ 0.94960784, -1.01370041, -1.08489486, -0.39258542, -0.57306588,
        -0.63080381, -0.54492139,  0.62283647,  0.249303  , -0.05325764,
        -0.70311075,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.84007425,  1.08838148,  1.14239622, -0.98083503, -0.97615087,
        -0.95785488, -0.92015252, -0.72565995, -0.75382573, -0.05901752,
         0.5883234 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.53968066, -0.75563232,  1.8583112 , -0.52341139, -0.79403416,
        -0.79432934, -0.82967126,  2.11137535,  1.01304828, -0.01359887,
        -1.14847694,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.24955456, -1.23423132, -1.88035596,  1.41701801,  0.96157094,
         1.03897359,  1.07575778,  1.22621822,  0.3173393 , -0.02198389,
        -0.74713988,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.07958475, -0.76032447, -1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12187,-117.31,33.67,9.0,981.0,169.0,596.0,156.0,3.1832,157400.0,<1H OCEAN
14692,-117.11,32.79,16.0,2574.0,771.0,1129.0,721.0,3.3849,96900.0,NEAR OCEAN
11958,-117.40,33.90,32.0,1263.0,178.0,508.0,180.0,3.6667,314100.0,INLAND
813,-122.03,37.61,36.0,1409.0,271.0,1002.0,281.0,3.7262,164900.0,NEAR BAY
17567,-121.91,37.31,16.0,2962.0,898.0,1555.0,795.0,2.5804,216300.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
19280,-122.72,38.42,26.0,3604.0,734.0,2605.0,704.0,3.0969,143800.0,<1H OCEAN
11528,-118.08,33.77,26.0,2013.0,551.0,664.0,510.0,2.2708,67500.0,<1H OCEAN
14663,-117.12,32.80,29.0,2863.0,534.0,1392.0,522.0,3.8719,174200.0,NEAR OCEAN
18310,-122.12,37.42,35.0,2445.0,533.0,1187.0,519.0,5.2803,362100.0,NEAR BAY


In [63]:
y_predicted_lr = LR_model.predict(X_test_prepared)

In [64]:
y_predicted_lr

array([251672.53837878, 125380.08678546, 401979.57091103, ...,
       380645.67252871, 162906.65112127, 229369.98760092])

In [65]:
pd.DataFrame({'Bashorat':y_predicted_lr,'Asl_natija':Y_test})

Unnamed: 0,Bashorat,Asl_natija
10486,251672.538379,231400.0
16251,125380.086785,63600.0
8883,401979.570911,500001.0
15209,285794.605245,238700.0
11965,131314.373755,162800.0
...,...,...
3148,87367.872370,79000.0
4588,194680.708419,275000.0
18378,380645.672529,485000.0
6927,162906.651121,143800.0


In [71]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(Y_test, y_predicted_lr)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

69810.32251244613


In [60]:
from sklearn.tree import DecisionTreeRegressor
Tree_model = DecisionTreeRegressor()
Tree_model.fit(X_prepared, Y)

In [67]:
y_predicted_tree = Tree_model.predict(X_test_prepared)

In [68]:
pd.DataFrame({'Bashorat':y_predicted_tree,'Asl_natija':Y_test})

Unnamed: 0,Bashorat,Asl_natija
10486,239100.0,231400.0
16251,72100.0,63600.0
8883,500001.0,500001.0
15209,218100.0,238700.0
11965,130700.0,162800.0
...,...,...
3148,104700.0,79000.0
4588,22500.0,275000.0
18378,500001.0,485000.0
6927,150900.0,143800.0


In [70]:
from sklearn.metrics import mean_squared_error
tree_mse = mean_squared_error(Y_test, y_predicted_tree)
# RMSE hisoblaymiz
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

70033.61510041234


In [72]:
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, Y)

In [73]:
y_predicted_rf = RF_model.predict(X_test_prepared)

In [74]:
pd.DataFrame({'Bashorat':y_predicted_tree,'Asl_natija':Y_test})

Unnamed: 0,Bashorat,Asl_natija
10486,239100.0,231400.0
16251,72100.0,63600.0
8883,500001.0,500001.0
15209,218100.0,238700.0
11965,130700.0,162800.0
...,...,...
3148,104700.0,79000.0
4588,22500.0,275000.0
18378,500001.0,485000.0
6927,150900.0,143800.0


In [75]:
from sklearn.metrics import mean_squared_error
rf_mse = mean_squared_error(Y_test, y_predicted_tree)
# RMSE hisoblaymiz
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

70033.61510041234


In [78]:
X = df.drop('median_house_value', axis=1)
Y = df['median_house_value'].copy()
X_prepared = full_pipeline.transform(X)

In [79]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

In [80]:
from sklearn.model_selection import cross_val_score

In [82]:
scores = cross_val_score(LR_model, X_prepared, Y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)

In [83]:
display_scores(LR_rmse_scores)

Scores: [84183.66301514 61191.52853899 86743.60959739 62286.73445075
 80537.25795828 68918.58661112 52504.86407192 90904.22793667
 77675.08903006 53940.95369716]
Mean: 71888.65149074617
Std.dev: 13247.67185583078


In [84]:
scores = cross_val_score(Tree_model, X_prepared, Y, scoring="neg_mean_squared_error", cv=10)
Tree_rmse_scores = np.sqrt(-scores)

In [85]:
display_scores(Tree_rmse_scores)

Scores: [116746.01904204  72315.17430871  83653.46571558  75281.4937311
  90543.7065249   77542.30950854  68064.10312153 100168.62645484
  93887.4956974   73487.32322273]
Mean: 85168.97173273639
Std.dev: 14420.410880081126


In [86]:
scores = cross_val_score(RF_model, X_prepared, Y, scoring="neg_mean_squared_error", cv=10)
RF_rmse_scores = np.sqrt(-scores)

In [87]:
display_scores(RF_rmse_scores)

Scores: [96521.3745448  47276.10613776 65070.61528474 56433.67305921
 60880.64382157 60280.95281461 47307.83151968 79409.45369776
 74264.39579009 49581.00023461]
Mean: 63702.60469048178
Std.dev: 14992.485887617575


In [88]:
import joblib

filename = 'RF_model.jbl' # faylga istalgan nom beramiz
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [89]:
joblib.dump(full_pipeline, 'full_pipline.jbl')

['full_pipline.jbl']

In [90]:
joblib.load('full_pipline.jbl')

In [91]:
joblib.load('RF_model.jbl')

In [92]:
joblib.dump(df, 'dataframe.jbl')

['dataframe.jbl']

In [93]:
joblib.load('dataframe.jbl')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
