In [None]:
import numpy as np
import pandas as pd
import sklearn  # the scikit-learn library
from sklearn.model_selection import train_test_split

# Location of the housing dataset hosted online (CSV fetched over HTTPS).
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the target column from the predictors (training split only).
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns=["median_house_value"])

# Numeric predictors only: everything except the ocean_proximity column.
X_num = X_train.drop(columns=["ocean_proximity"])

In [None]:
# Mount Google Drive at /content/drive (assumes the notebook runs in Google Colab).
# NOTE(review): none of the visible cells read from the mounted drive (the
# dataset is loaded from URL) — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns CombinedAttributesAdder needs, counted in
# X_num's column order: total_rooms, total_bedrooms, population, households.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features derived from the room/household count columns.

    Columns are located through the module-level positional indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``,
    so the input must keep the column order those indices were chosen for.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a ``bedrooms_per_room`` column is appended as well.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; this step only transforms."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        Appended columns, in order: rooms per household, population per
        household and, if ``add_bedrooms_per_room`` is set, bedrooms per room.

        Parameters
        ----------
        X : 2-D array-like
            Numeric feature matrix (ndarray or DataFrame).

        Returns
        -------
        numpy.ndarray
            The input columns followed by the 2 or 3 derived columns.
        """
        # Positional slicing (X[:, i]) is not supported by DataFrames, so coerce
        # to an ndarray first; an ndarray input is passed through unchanged.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in order: fill missing values with each
# column's median, append the derived ratio features, then scale.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ("std_scaler", StandardScaler()),
])

In [None]:
# Fit the numeric pipeline on X_num and display the transformed array.
num_pipeline.fit_transform(X_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429,  1.69520292]])

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group to its own preprocessing: the numeric columns go
# through num_pipeline, the ocean_proximity column is one-hot encoded.
num_attribs = X_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [None]:
# Fit the full preprocessing pipeline on the training predictors and keep the
# transformed feature matrix for model training.
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# Preview the first five prepared rows (all columns).
X_prepared[:5]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

### Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression model created with default hyperparameters.
LR_model = LinearRegression()

In [None]:
# Train on the preprocessed training features and their median_house_value labels.
LR_model.fit(X_prepared, y)

LinearRegression()

In [None]:
# Draw 10 rows to spot-check predictions. NOTE: these rows come from the
# training predictors (X_train), not from the held-out test_set, so comparing
# predictions against them is a sanity check, not a generalization estimate.
# random_state pins the sample so the same rows are drawn on every run.
test_data = X_train.sample(10, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
13837,-117.18,34.54,5.0,3772.0,619.0,2097.0,635.0,3.8194,INLAND
14008,-117.19,32.75,52.0,1495.0,230.0,459.0,190.0,8.1548,NEAR OCEAN
7025,-118.11,33.98,36.0,446.0,108.0,410.0,117.0,3.3942,<1H OCEAN
5939,-117.9,34.14,35.0,2259.0,505.0,1561.0,509.0,3.3043,<1H OCEAN
12506,-121.43,38.56,50.0,1533.0,288.0,532.0,257.0,2.5417,INLAND
17418,-120.47,34.64,8.0,2482.0,586.0,1427.0,540.0,3.071,NEAR OCEAN
9436,-119.99,37.51,14.0,2878.0,617.0,1011.0,509.0,1.398,INLAND
17016,-122.28,37.52,29.0,1526.0,355.0,724.0,315.0,4.0313,NEAR OCEAN
16444,-121.3,38.13,27.0,1004.0,192.0,470.0,192.0,2.8942,INLAND
17661,-121.88,37.27,27.0,2019.0,335.0,1020.0,351.0,5.8178,<1H OCEAN


In [None]:
# Look up the true target values for the sampled rows by index label.
test_label = y.loc[test_data.index]
test_label

13837     98500.0
14008    500001.0
7025     147200.0
5939     155500.0
12506    125900.0
17418    120400.0
9436     103800.0
17016    435200.0
16444    116700.0
17661    267400.0
Name: median_house_value, dtype: float64

In [None]:
# Apply the already-fitted preprocessing with transform() (not fit_transform()),
# so the sample is processed with what was fitted on X_train.
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[ 1.19779571e+00, -5.16310455e-01, -1.87335854e+00,
         5.19638161e-01,  1.92134185e-01,  5.89739712e-01,
         3.54405574e-01, -3.22198767e-02,  2.11503336e-01,
         1.77400267e-02, -8.40679336e-01,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 1.19280965e+00, -1.35408987e+00,  1.85617335e+00,
        -5.27460160e-01, -7.36279161e-01, -8.50865725e-01,
        -8.13707018e-01,  2.24449342e+00,  1.01921979e+00,
        -5.88312809e-02, -1.01755685e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 7.34092409e-01, -7.78409043e-01,  5.86545474e-01,
        -1.00985196e+00, -1.02745250e+00, -8.93960759e-01,
        -1.00532998e+00, -2.55511497e-01, -6.79959559e-01,
         3.51786549e-02,  5.05132880e-01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 8.38799606e-01, -7.03523732e

In [None]:
# Predict median_house_value for the preprocessed sample rows.
predicted_labels = LR_model.predict(test_data_prepared)
predicted_labels

array([ 94873.87455544, 424702.58259327, 202207.41197974, 188269.72074022,
       125521.09275811, 224686.38071311,  46800.30538012, 261592.00168344,
       113745.64555057, 302234.85602247])

In [None]:

# Side-by-side table of predictions ('Bashorat') and true values ('Asl qiymat'),
# indexed by the sampled rows' original index labels.
pd.DataFrame({'Bashorat': predicted_labels, 'Asl qiymat': test_label})

Unnamed: 0,Bashorat,Asl qiymat
13837,94873.874555,98500.0
14008,424702.582593,500001.0
7025,202207.41198,147200.0
5939,188269.72074,155500.0
12506,125521.092758,125900.0
17418,224686.380713,120400.0
9436,46800.30538,103800.0
17016,261592.001683,435200.0
16444,113745.645551,116700.0
17661,302234.856022,267400.0
