In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import seaborn as sns

In [2]:
# Load the housing dataset straight from the raw CSV in the handson-ml2
# GitHub repository (requires network access) and preview the first rows.
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.model_selection import train_test_split
# Put 80% of the rows in the training set and the rest in the test set.
# random_state is fixed so the shuffle (and therefore the split) is reproducible.
train_set, test_set = train_test_split(df, train_size=0.8, random_state=47)

In [4]:
train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12187,-117.31,33.67,9.0,981.0,169.0,596.0,156.0,3.1832,157400.0,<1H OCEAN
14692,-117.11,32.79,16.0,2574.0,771.0,1129.0,721.0,3.3849,96900.0,NEAR OCEAN
11958,-117.4,33.9,32.0,1263.0,178.0,508.0,180.0,3.6667,314100.0,INLAND
813,-122.03,37.61,36.0,1409.0,271.0,1002.0,281.0,3.7262,164900.0,NEAR BAY
17567,-121.91,37.31,16.0,2962.0,898.0,1555.0,795.0,2.5804,216300.0,<1H OCEAN


In [5]:
# Keep only the numeric feature columns of the training set: drop
# median_house_value (presumably the prediction target) and the text column
# ocean_proximity. drop() returns a new frame; train_set itself is unchanged.
housing = train_set.drop(['median_house_value', 'ocean_proximity'], axis=1)
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12187,-117.31,33.67,9.0,981.0,169.0,596.0,156.0,3.1832
14692,-117.11,32.79,16.0,2574.0,771.0,1129.0,721.0,3.3849
11958,-117.40,33.90,32.0,1263.0,178.0,508.0,180.0,3.6667
813,-122.03,37.61,36.0,1409.0,271.0,1002.0,281.0,3.7262
17567,-121.91,37.31,16.0,2962.0,898.0,1555.0,795.0,2.5804
...,...,...,...,...,...,...,...,...
19280,-122.72,38.42,26.0,3604.0,734.0,2605.0,704.0,3.0969
11528,-118.08,33.77,26.0,2013.0,551.0,664.0,510.0,2.2708
14663,-117.12,32.80,29.0,2863.0,534.0,1392.0,522.0,3.8719
18310,-122.12,37.42,35.0,2445.0,533.0,1187.0,519.0,5.2803


In [6]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12187 to 5255
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16354 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
dtypes: float64(8)
memory usage: 1.1 MB


In [7]:
housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
dtype: int64

In [8]:
housing['total_bedrooms'].median()

433.0

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
impute = SimpleImputer(strategy='median')

In [11]:
# fit() learns the per-column medians and returns the imputer itself, so its
# result is deliberately not bound to X (X is used for the transformed array).
impute.fit(housing)
# Medians learned for each column, in the column order of `housing`.
impute.statistics_

array([-118.52  ,   34.26  ,   29.    , 2126.5   ,  433.    , 1165.    ,
        408.    ,    3.5417])

In [12]:
# Replace every missing value with the median learned during fit().
# The result is a plain NumPy array, so column names and the row index are lost.
X = impute.transform(housing)
X

array([[-117.31  ,   33.67  ,    9.    , ...,  596.    ,  156.    ,
           3.1832],
       [-117.11  ,   32.79  ,   16.    , ..., 1129.    ,  721.    ,
           3.3849],
       [-117.4   ,   33.9   ,   32.    , ...,  508.    ,  180.    ,
           3.6667],
       ...,
       [-117.12  ,   32.8   ,   29.    , ..., 1392.    ,  522.    ,
           3.8719],
       [-122.12  ,   37.42  ,   35.    , ..., 1187.    ,  519.    ,
           5.2803],
       [-118.48  ,   34.07  ,   40.    , ..., 1564.    ,  523.    ,
           8.5153]])

In [13]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12187,-117.31,33.67,9.0,981.0,169.0,596.0,156.0,3.1832
14692,-117.11,32.79,16.0,2574.0,771.0,1129.0,721.0,3.3849
11958,-117.40,33.90,32.0,1263.0,178.0,508.0,180.0,3.6667
813,-122.03,37.61,36.0,1409.0,271.0,1002.0,281.0,3.7262
17567,-121.91,37.31,16.0,2962.0,898.0,1555.0,795.0,2.5804
...,...,...,...,...,...,...,...,...
19280,-122.72,38.42,26.0,3604.0,734.0,2605.0,704.0,3.0969
11528,-118.08,33.77,26.0,2013.0,551.0,664.0,510.0,2.2708
14663,-117.12,32.80,29.0,2863.0,534.0,1392.0,522.0,3.8719
18310,-122.12,37.42,35.0,2445.0,533.0,1187.0,519.0,5.2803


In [14]:
# Wrap the imputed array back into a DataFrame, reusing the column labels and
# row index of `housing` so rows can still be matched to the original data.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)
housing_tr

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12187,-117.31,33.67,9.0,981.0,169.0,596.0,156.0,3.1832
14692,-117.11,32.79,16.0,2574.0,771.0,1129.0,721.0,3.3849
11958,-117.40,33.90,32.0,1263.0,178.0,508.0,180.0,3.6667
813,-122.03,37.61,36.0,1409.0,271.0,1002.0,281.0,3.7262
17567,-121.91,37.31,16.0,2962.0,898.0,1555.0,795.0,2.5804
...,...,...,...,...,...,...,...,...
19280,-122.72,38.42,26.0,3604.0,734.0,2605.0,704.0,3.0969
11528,-118.08,33.77,26.0,2013.0,551.0,664.0,510.0,2.2708
14663,-117.12,32.80,29.0,2863.0,534.0,1392.0,522.0,3.8719
18310,-122.12,37.42,35.0,2445.0,533.0,1187.0,519.0,5.2803


In [15]:
housing_tr.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64

In [16]:
housing.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
longitude,1.0,-0.924394,-0.105275,0.03535,0.061793,0.093474,0.047287,-0.020613
latitude,-0.924394,1.0,0.008526,-0.026784,-0.059656,-0.103499,-0.064199,-0.074859
housing_median_age,-0.105275,0.008526,1.0,-0.360821,-0.317394,-0.298456,-0.29962,-0.118419
total_rooms,0.03535,-0.026784,-0.360821,1.0,0.92848,0.859616,0.916287,0.197493
total_bedrooms,0.061793,-0.059656,-0.317394,0.92848,1.0,0.882965,0.979125,-0.011059
population,0.093474,-0.103499,-0.298456,0.859616,0.882965,1.0,0.912238,0.002311
households,0.047287,-0.064199,-0.29962,0.916287,0.979125,0.912238,1.0,0.010175
median_income,-0.020613,-0.074859,-0.118419,0.197493,-0.011059,0.002311,0.010175,1.0


In [17]:
housing_tr.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
longitude,1.0,-0.924394,-0.105275,0.03535,0.061323,0.093474,0.047287,-0.020613
latitude,-0.924394,1.0,0.008526,-0.026784,-0.059213,-0.103499,-0.064199,-0.074859
housing_median_age,-0.105275,0.008526,1.0,-0.360821,-0.315894,-0.298456,-0.29962,-0.118419
total_rooms,0.03535,-0.026784,-0.360821,1.0,0.925038,0.859616,0.916287,0.197493
total_bedrooms,0.061323,-0.059213,-0.315894,0.925038,1.0,0.877993,0.973054,-0.010847
population,0.093474,-0.103499,-0.298456,0.859616,0.877993,1.0,0.912238,0.002311
households,0.047287,-0.064199,-0.29962,0.916287,0.973054,0.912238,1.0,0.010175
median_income,-0.020613,-0.074859,-0.118419,0.197493,-0.010847,0.002311,0.010175,1.0


In [18]:
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12187,-117.31,33.67,9.0,981.0,169.0,596.0,156.0,3.1832,157400.0,<1H OCEAN
14692,-117.11,32.79,16.0,2574.0,771.0,1129.0,721.0,3.3849,96900.0,NEAR OCEAN
11958,-117.40,33.90,32.0,1263.0,178.0,508.0,180.0,3.6667,314100.0,INLAND
813,-122.03,37.61,36.0,1409.0,271.0,1002.0,281.0,3.7262,164900.0,NEAR BAY
17567,-121.91,37.31,16.0,2962.0,898.0,1555.0,795.0,2.5804,216300.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
19280,-122.72,38.42,26.0,3604.0,734.0,2605.0,704.0,3.0969,143800.0,<1H OCEAN
11528,-118.08,33.77,26.0,2013.0,551.0,664.0,510.0,2.2708,67500.0,<1H OCEAN
14663,-117.12,32.80,29.0,2863.0,534.0,1392.0,522.0,3.8719,174200.0,NEAR OCEAN
18310,-122.12,37.42,35.0,2445.0,533.0,1187.0,519.0,5.2803,362100.0,NEAR BAY


In [19]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series;
# this 2-D frame is what the encoders below are given.
housing_cat = train_set[['ocean_proximity']]
housing_cat

Unnamed: 0,ocean_proximity
12187,<1H OCEAN
14692,NEAR OCEAN
11958,INLAND
813,NEAR BAY
17567,<1H OCEAN
...,...
19280,<1H OCEAN
11528,<1H OCEAN
14663,NEAR OCEAN
18310,NEAR BAY


In [20]:
from sklearn.preprocessing import OrdinalEncoder

# Encoder with default settings; it is fitted in a later cell.
ordinal_enc = OrdinalEncoder()

In [23]:
# Learn the categories and replace each one with a numeric code; only the
# first 10 encoded rows are shown (compare with housing_cat.head(10)).
ordinal_enc.fit_transform(housing_cat)[:10]

array([[0.],
       [4.],
       [1.],
       [3.],
       [0.],
       [4.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [24]:
housing_cat.head(10)

Unnamed: 0,ocean_proximity
12187,<1H OCEAN
14692,NEAR OCEAN
11958,INLAND
813,NEAR BAY
17567,<1H OCEAN
14838,NEAR OCEAN
968,INLAND
1179,INLAND
2281,INLAND
16425,INLAND


In [25]:
from sklearn.preprocessing import OneHotEncoder

# Encoder with default settings; it is fitted in a later cell.
one_hot_enc = OneHotEncoder()

In [28]:
# One column per category, with a single 1 in each row.
housing_cat_1hot = one_hot_enc.fit_transform(housing_cat)
# The encoder output is converted to a dense NumPy array for display.
housing_cat_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.]])

In [29]:
# pandas alternative to OneHotEncoder: one indicator column per category,
# named "<column>_<category>". With default settings the values are booleans.
x = pd.get_dummies(housing_cat)
x

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12187,True,False,False,False,False
14692,False,False,False,False,True
11958,False,True,False,False,False
813,False,False,False,True,False
17567,True,False,False,False,False
...,...,...,...,...,...
19280,True,False,False,False,False
11528,True,False,False,False,False
14663,False,False,False,False,True
18310,False,False,False,True,False


In [30]:
# Same encoding, but dtype='float' yields 0.0/1.0 instead of False/True.
# Note that this overwrites the boolean frame previously stored in `x`.
x = pd.get_dummies(housing_cat, dtype='float')
x

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12187,1.0,0.0,0.0,0.0,0.0
14692,0.0,0.0,0.0,0.0,1.0
11958,0.0,1.0,0.0,0.0,0.0
813,0.0,0.0,0.0,1.0,0.0
17567,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
19280,1.0,0.0,0.0,0.0,0.0
11528,1.0,0.0,0.0,0.0,0.0
14663,0.0,0.0,0.0,0.0,1.0
18310,0.0,0.0,0.0,1.0,0.0


In [31]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns the transformer needs. They match the
# column order longitude, latitude, housing_median_age, total_rooms,
# total_bedrooms, population, households, ...
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from the room/population/household columns.

    The input is addressed by position (see the ``*_ix`` constants above), so
    the columns must be in the expected order.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When True, a third column ``bedrooms / rooms`` is appended in addition
        to ``rooms / households`` and ``population / households``.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` may be a 2-D NumPy array or anything ``np.asarray`` can convert
        to one (for example a pandas DataFrame). The result is always a NumPy
        array with 2 or 3 extra columns.
        """
        # Positional slicing such as X[:, i] does not work on a DataFrame, so
        # convert first. An ndarray input is passed through unchanged.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [35]:
# Append the ratio features to the training data and show the result as a
# DataFrame (column labels are lost, hence the integer headers).
# NOTE(review): train_set.values still contains median_house_value and the
# ocean_proximity strings (columns 8 and 9 in the output) as well as the
# un-imputed total_bedrooms. Presumably the imputed feature matrix
# (housing_tr.values) was intended here — confirm. The name `y` is also
# unusual for a feature matrix.
att_adder = CombinedAttributesAdder()
y = att_adder.fit_transform(train_set.values)
pd.DataFrame(y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-117.31,33.67,9.0,981.0,169.0,596.0,156.0,3.1832,157400.0,<1H OCEAN,6.288462,3.820513,0.172273
1,-117.11,32.79,16.0,2574.0,771.0,1129.0,721.0,3.3849,96900.0,NEAR OCEAN,3.570042,1.565881,0.299534
2,-117.4,33.9,32.0,1263.0,178.0,508.0,180.0,3.6667,314100.0,INLAND,7.016667,2.822222,0.140934
3,-122.03,37.61,36.0,1409.0,271.0,1002.0,281.0,3.7262,164900.0,NEAR BAY,5.014235,3.565836,0.192335
4,-121.91,37.31,16.0,2962.0,898.0,1555.0,795.0,2.5804,216300.0,<1H OCEAN,3.725786,1.955975,0.303174
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-122.72,38.42,26.0,3604.0,734.0,2605.0,704.0,3.0969,143800.0,<1H OCEAN,5.119318,3.700284,0.203663
16508,-118.08,33.77,26.0,2013.0,551.0,664.0,510.0,2.2708,67500.0,<1H OCEAN,3.947059,1.301961,0.273721
16509,-117.12,32.8,29.0,2863.0,534.0,1392.0,522.0,3.8719,174200.0,NEAR OCEAN,5.484674,2.666667,0.186518
16510,-122.12,37.42,35.0,2445.0,533.0,1187.0,519.0,5.2803,362100.0,NEAR BAY,4.710983,2.287091,0.217996


In [36]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column to the [0, 1] range using that column's min and max.
# NOTE(review): this scales `housing`, which still has missing
# total_bedrooms values; presumably the imputed `housing_tr` was intended —
# confirm.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit_transform(housing)

array([[0.70119522, 0.12008502, 0.15686275, ..., 0.02076112, 0.02548923,
        0.18505262],
       [0.72111554, 0.02656748, 0.29411765, ..., 0.03942163, 0.11840158,
        0.19896277],
       [0.69223108, 0.1445271 , 0.60784314, ..., 0.01768022, 0.02943595,
        0.21839699],
       ...,
       [0.72011952, 0.02763018, 0.54901961, ..., 0.04862935, 0.0856767 ,
        0.23254852],
       [0.22211155, 0.51859724, 0.66666667, ..., 0.04145223, 0.08518336,
        0.32967821],
       [0.58466135, 0.16259299, 0.76470588, ..., 0.05465112, 0.08584114,
        0.55277858]])

In [37]:
# Same scaling (the scaler is re-fitted), wrapped in a DataFrame for tabular
# display. The scaler returns an array, so the columns are labelled 0..7.
pd.DataFrame(min_max_scaler.fit_transform(housing))

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.701195,0.120085,0.156863,0.024900,0.026071,0.020761,0.025489,0.185053
1,0.721116,0.026567,0.294118,0.065415,0.119491,0.039422,0.118402,0.198963
2,0.692231,0.144527,0.607843,0.032072,0.027467,0.017680,0.029436,0.218397
3,0.231076,0.538789,0.686275,0.035785,0.041899,0.034975,0.046045,0.222500
4,0.243028,0.506908,0.294118,0.075284,0.139199,0.054336,0.130571,0.143481
...,...,...,...,...,...,...,...,...
16507,0.162351,0.624867,0.490196,0.091612,0.113749,0.091097,0.115606,0.179101
16508,0.624502,0.130712,0.490196,0.051147,0.085351,0.023142,0.083703,0.122129
16509,0.720120,0.027630,0.549020,0.072766,0.082713,0.048629,0.085677,0.232549
16510,0.222112,0.518597,0.666667,0.062134,0.082557,0.041452,0.085183,0.329678


In [41]:
# Summary statistics of the min-max scaled columns: min is 0 and max is 1.
# NOTE(review): the count for column 4 (total_bedrooms) is lower than the
# others because `housing` still contains missing values.
pd.DataFrame(min_max_scaler.fit_transform(housing)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,16512.0,16512.0,16512.0,16512.0,16354.0,16512.0,16512.0,16512.0
mean,0.475144,0.329481,0.541933,0.066744,0.083021,0.049657,0.081691,0.232671
std,0.199245,0.226492,0.246504,0.054436,0.064199,0.038539,0.061796,0.130554
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.253984,0.148778,0.333333,0.036853,0.045779,0.027378,0.045716,0.143058
50%,0.580677,0.182784,0.54902,0.054034,0.067039,0.040682,0.06693,0.209776
75%,0.631474,0.549416,0.705882,0.080014,0.100248,0.060218,0.098997,0.292444
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
from sklearn.preprocessing import StandardScaler

# Standardize every column: subtract the column mean and divide by its
# standard deviation.
# NOTE(review): this scales `housing`, which still has missing
# total_bedrooms values; presumably the imputed `housing_tr` was intended —
# confirm.
standart_scaler = StandardScaler()
standart_scaler.fit_transform(housing)

array([[ 1.13457498e+00, -9.24549614e-01, -1.56217152e+00, ...,
        -7.49814060e-01, -9.09507670e-01, -3.64750815e-01],
       [ 1.23455722e+00, -1.33745855e+00, -1.00534875e+00, ...,
        -2.65596783e-01,  5.94078093e-01, -2.58200211e-01],
       [ 1.08958298e+00, -8.16630231e-01,  2.67389007e-01, ...,
        -8.29759877e-01, -8.45638540e-01, -1.09335757e-01],
       ...,
       [ 1.22955811e+00, -1.33276641e+00,  2.87506773e-02, ...,
        -2.66678073e-02,  6.44965586e-02, -9.36233178e-04],
       [-1.26999787e+00,  8.35005534e-01,  5.06027337e-01, ...,
        -2.12905222e-01,  5.65129174e-02,  7.43069077e-01],
       [ 5.49678883e-01, -7.36863731e-01,  9.03757886e-01, ...,
         1.29589926e-01,  6.71577723e-02,  2.45199920e+00]])

In [43]:
# Same standardization (the scaler is re-fitted), wrapped in a DataFrame for
# tabular display. The scaler returns an array, so the columns are labelled 0..7.
pd.DataFrame(standart_scaler.fit_transform(housing))

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.134575,-0.924550,-1.562172,-0.768710,-0.887117,-0.749814,-0.909508,-0.364751
1,1.234557,-1.337459,-1.005349,-0.024404,0.568094,-0.265597,0.594078,-0.258200
2,1.089583,-0.816630,0.267389,-0.636950,-0.865361,-0.829760,-0.845639,-0.109336
3,-1.225006,0.924156,0.585573,-0.568733,-0.640553,-0.380973,-0.576856,-0.077904
4,-1.165017,0.783392,-1.005349,0.156884,0.875090,0.121414,0.791008,-0.683188
...,...,...,...,...,...,...,...,...
16507,-1.569945,1.304220,-0.209888,0.456849,0.478654,1.075313,0.548837,-0.410340
16508,0.749643,-0.877628,-0.209888,-0.286523,0.036289,-0.688038,0.032562,-0.846738
16509,1.229558,-1.332766,0.028751,0.110627,-0.004805,-0.026668,0.064497,-0.000936
16510,-1.269998,0.835006,0.506027,-0.084677,-0.007222,-0.212905,0.056513,0.743069


In [44]:
# Summary statistics of the standardized columns: means are ~0 and standard
# deviations ~1. Unlike min-max scaling, the values are not bounded.
# NOTE(review): the count for column 4 (total_bedrooms) is lower than the
# others because `housing` still contains missing values.
pd.DataFrame(standart_scaler.fit_transform(housing)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,16512.0,16512.0,16512.0,16512.0,16354.0,16512.0,16512.0,16512.0
mean,-1.122487e-15,1.300639e-16,-3.5824060000000004e-17,6.228868e-17,1.061752e-17,-7.611267e-17,-3.152087e-17,1.934284e-16
std,1.00003,1.00003,1.00003,1.00003,1.000031,1.00003,1.00003,1.00003
min,-2.3848,-1.454762,-2.19854,-1.226134,-1.293222,-1.28854,-1.321996,-1.782238
25%,-1.110026,-0.7978616,-0.8462565,-0.5491093,-0.5801206,-0.5781122,-0.5821784,-0.6864232
50%,0.5296824,-0.6477129,0.02875068,-0.2334917,-0.2489514,-0.2328917,-0.2388818,-0.1753686
75%,0.7846371,0.9710778,0.6651196,0.2437894,0.2683495,0.2740375,0.2800549,0.45786
max,2.634309,2.960548,1.858311,17.14464,14.28382,24.66024,14.86084,5.877678
