<a href="https://colab.research.google.com/github/asliddin2002developer/ML-Projects/blob/main/ML_04_California_Housing_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn kutubxonasi

# Location of the online dataset (housing.csv in the ageron/handson-ml2 GitHub repository).
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(URL)

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, random_state=42, test_size=0.2)

# Training target: an independent copy of the label column.
y = train_set["median_house_value"].copy()
# Training features: every column except the label.
X_train = train_set.drop(columns=["median_house_value"])

# Numeric-only view of the training features (ocean_proximity holds text categories).
X_num = X_train.drop(columns=["ocean_proximity"])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns that CombinedAttributesAdder combines.
# They follow the column order of the housing features:
# total_rooms=3, total_bedrooms=4, population=5, households=6.
# Assumes the array handed to the transformer keeps that order.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing numeric columns.

    Adds ``rooms_per_household`` and ``population_per_household`` and,
    optionally, ``bedrooms_per_room`` as extra columns on the right of ``X``.
    Source columns are looked up by position through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to also append the ``bedrooms_per_room`` column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; this step is only a transformer and learns nothing."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric data whose columns at ``rooms_ix``, ``bedrooms_ix``,
            ``population_ix`` and ``households_ix`` hold the raw counts.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by ``rooms_per_household``,
            ``population_per_household`` and, when ``add_bedrooms_per_room``
            is true, ``bedrooms_per_room``.
        """
        # Positional slicing such as X[:, i] needs an ndarray; converting here
        # lets DataFrames and nested lists through and leaves ndarrays as they are.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in this order:
#   1. fill missing values with the column median,
#   2. append the combined ratio attributes,
#   3. standardize every feature.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
        ('std_scaler', StandardScaler()),
    ]
)

In [4]:
# Fit the numeric pipeline on the numeric columns and display the transformed array.
num_pipeline.fit_transform(X_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429,  1.69520292]])

In [5]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessing.
cat_attribs = ['ocean_proximity']
num_attribs = X_num.columns.tolist()

# Numeric columns go through num_pipeline; the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
    ]
)

In [6]:
# Fit the full preprocessing on the training features and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [7]:
# Peek at the first five prepared rows.
X_prepared[0:5, :]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings; it is fitted in a later cell.
LR_model = LinearRegression()

In [9]:
# Train the linear model on the prepared training features and their labels.
LR_model.fit(X_prepared, y)

LinearRegression()

In [10]:
# Draw 10 rows to spot-check predictions. The rows come from the training
# features, so the model has already seen them during fitting. The fixed seed
# makes the sample (and every output derived from it) reproducible across runs.
test_data = X_train.sample(10, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
9563,-120.62,37.36,15.0,3455.0,729.0,2014.0,659.0,3.2656,INLAND
4392,-118.27,34.08,38.0,2265.0,801.0,2899.0,792.0,2.5521,<1H OCEAN
20179,-119.22,34.34,29.0,3128.0,672.0,1815.0,648.0,2.9821,NEAR OCEAN
1305,-121.83,38.0,25.0,1710.0,288.0,799.0,259.0,4.8359,INLAND
18436,-121.84,37.25,17.0,2363.0,473.0,1369.0,442.0,4.8355,<1H OCEAN
5897,-118.32,34.16,46.0,2345.0,453.0,1031.0,427.0,4.3173,<1H OCEAN
1479,-122.04,37.96,20.0,1143.0,346.0,578.0,298.0,2.2411,NEAR BAY
10044,-120.83,39.27,14.0,3338.0,608.0,1373.0,562.0,3.67,INLAND
19633,-120.89,37.54,30.0,509.0,115.0,275.0,115.0,2.2679,INLAND
6819,-118.09,34.09,40.0,855.0,208.0,745.0,222.0,3.0125,<1H OCEAN


In [11]:
# Look up the true labels of the sampled rows by their index.
test_label = y.loc[test_data.index]
test_label

9563      83500.0
4392     157500.0
20179    175700.0
1305     145300.0
18436    141600.0
5897     278300.0
1479     151800.0
10044    160100.0
19633    250000.0
6819     224000.0
Name: median_house_value, dtype: float64

In [12]:
# Apply the already-fitted preprocessing (transform only, no refit) to the sampled rows.
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[-5.17407891e-01,  8.03543151e-01, -1.07984112e+00,
         3.73862945e-01,  4.54667522e-01,  5.16742001e-01,
         4.17404905e-01, -3.23045134e-01, -8.06110264e-02,
        -3.52514529e-03, -3.20625111e-02,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 6.54315498e-01, -7.31605724e-01,  7.45248959e-01,
        -1.73368940e-01,  6.26507525e-01,  1.29509109e+00,
         7.66526196e-01, -6.97736036e-01, -1.07878513e+00,
         4.86589388e-02,  2.42758381e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 1.80640085e-01, -6.09917093e-01,  3.10832770e-02,
         2.23489141e-01,  3.18627520e-01,  3.41723391e-01,
         3.88530212e-01, -4.71923719e-01, -2.54711963e-01,
        -2.55679053e-02,  3.40690007e-02,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [-1.12072079e+00,  1.10308440e

In [13]:
# Linear-model predictions for the sampled rows.
predicted_labels = LR_model.predict(test_data_prepared)
predicted_labels

array([122682.63084591, 184236.89562056, 210441.20843722, 204934.88542118,
       256433.95926625, 260883.89390349, 170523.76474451, 101443.81536608,
       100252.62991543, 187654.17874657])

In [14]:

# Side-by-side comparison. Column labels are Uzbek: 'Bashorat' = prediction, 'Asl qiymat' = actual value.
pd.DataFrame({'Bashorat': predicted_labels, 'Asl qiymat': test_label})

Unnamed: 0,Bashorat,Asl qiymat
9563,122682.630846,83500.0
4392,184236.895621,157500.0
20179,210441.208437,175700.0
1305,204934.885421,145300.0
18436,256433.959266,141600.0
5897,260883.893903,278300.0
1479,170523.764745,151800.0
10044,101443.815366,160100.0
19633,100252.629915,250000.0
6819,187654.178747,224000.0


# Modelni Baholash

In [15]:
# Test features: the held-out rows without the label column.
X_test = test_set.drop('median_house_value', axis=1)

In [16]:
# Display the test features for a quick visual check.
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [17]:
# Test labels: the target column of the held-out rows.
y_test = test_set['median_house_value']
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [18]:
# Preprocess the test features with the pipeline fitted on the training data (transform only).
X_test_prepared = full_pipeline.transform(X_test)

In [19]:
# Linear-model predictions for the whole test set.
y_predicted = LR_model.predict(X_test_prepared)
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [20]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the test set.
# scikit-learn metrics are declared as (y_true, y_pred); pass them in that order.
mae = mean_absolute_error(y_test, y_predicted)
mae

50898.7395349408

In [21]:
from sklearn.metrics import mean_squared_error

# mean_squared_error returns the MSE; the RMSE is its square root.
# Arguments follow the (y_true, y_pred) order of the scikit-learn signature.
mse = mean_squared_error(y_test, y_predicted)
rmse = np.sqrt(mse)

print('RMSE=', rmse)

RMSE= 72701.32600762138



# Random Forest

In [22]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor. The fixed seed makes the fitted forest, and every
# score computed from it, reproducible across runs.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)


RandomForestRegressor()

In [23]:
# Random-forest predictions for the test set.
# NOTE(review): this rebinds y_predicted, which previously held the linear model's predictions.
y_predicted = RF_model.predict(X_test_prepared)

In [25]:
from sklearn.metrics import mean_squared_error

# Random-forest RMSE on the test set. Dedicated names are used so that the
# `mae` computed for the linear model is not overwritten with a squared-error value.
rf_mse = mean_squared_error(y_test, y_predicted)
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

50361.493081812354


# Cross Validation

In [28]:
# Build features and labels from the FULL dataset for cross-validation.
# NOTE(review): this rebinds `y` and `X_prepared`, which earlier cells use for
# the training split; re-running those cells afterwards would pick up these values.
X = df.drop('median_house_value', axis=1)
y = df['median_house_value'].copy()
# Only transform() is called here, so the preprocessing is not refitted on the full data.
X_prepared = full_pipeline.transform(X)

In [29]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the linear model. The scorer reports negated MSE,
# which is why the values are negated before the square root is taken later.
mse_scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)


In [31]:
def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like
        Numeric scores, e.g. the per-fold results of a cross-validation run.
        NumPy arrays as well as plain lists or tuples are accepted.
    """
    # Convert once so that .mean() and .std() also work for plain Python sequences.
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean_Scores:", scores.mean())
    print("Std_dev:", scores.std())

In [34]:
# The cross-validation scores are negated MSE; negate and take the root to show RMSE per fold.
display_scores(np.sqrt(-mse_scores))

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean_Scores: 73289.27323295093
Std_dev: 3694.7136787223626


In [35]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the random forest. The scorer yields negated MSE,
# so negate and take the square root to obtain the RMSE of each fold.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
RF_rmse_scores = np.sqrt(-scores)  # RMSE values, named accordingly
display_scores(RF_rmse_scores)

Scores: [98124.73348636 47730.79410872 65404.59441845 56659.42144779
 60990.09865609 60197.96111158 47314.85129683 79482.63508289
 74318.1920762  49388.38733408]
Mean_Scores: 63961.16690190167
Std_dev: 15318.654932143216


In [1]:
import pickle

filename = 'RF_model.pkl' # any file name can be used here
# Serialize the trained random forest to disk.
# NOTE(review): RF_model must already exist in the kernel; the NameError recorded
# for this cell suggests it was run before the model was defined.
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

NameError: ignored

In [37]:
# Reload the model from disk; `filename` comes from the cell that saved it.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(filename, 'rb') as file:
    model = pickle.load(file)

In [38]:
# Cross-validate the reloaded model. `model` is the random forest restored from
# RF_model.pkl, so the scores are named for it rather than for linear regression.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
loaded_RF_rmse_scores = np.sqrt(-scores)
display_scores(loaded_RF_rmse_scores)

Scores: [76752.85810907 63837.83854487 61083.90855108 80695.43352969
 62103.62373192]
Mean_Scores: 68894.73249332576
Std_dev: 8169.540848961437
