<a href="https://colab.research.google.com/github/azizbekb/Portfolio/blob/main/05_ml_05_amaliyot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![Imgur](https://i.imgur.com/5pXzCIu.png)

# Data Science va Sun'iy Intellekt Praktikum

## 5-MODUL. Machine Learning

### Portfolio uchun vazifa: Toshkent shahrida uylarning narxini aniqlash.

Ushbu amaliyotda sizning vazifangiz — berilgan ma'lumotlar asosida Toshkent shahridagi uylarning narxini aniqlash.

In [None]:
import pandas as pd
import numpy as np
import sklearn
# Housing-listings CSV hosted on GitHub; kept in a constant so the source is easy to spot.
DATA_URL = 'https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv'
df = pd.read_csv(DATA_URL)
# Preview the first five rows.
df.head()

Unnamed: 0,location,district,rooms,size,level,max_levels,price
0,"город Ташкент, Юнусабадский район, Юнусабад 8-...",Юнусабадский,3,57,4,4,52000
1,"город Ташкент, Яккасарайский район, 1-й тупик ...",Яккасарайский,2,52,4,5,56000
2,"город Ташкент, Чиланзарский район, Чиланзар 2-...",Чиланзарский,2,42,4,4,37000
3,"город Ташкент, Чиланзарский район, Чиланзар 9-...",Чиланзарский,3,65,1,4,49500
4,"город Ташкент, Чиланзарский район, площадь Актепа",Чиланзарский,3,70,3,5,55000


# Ustunlar ta'rifi
- `location` - sotilayotgan uy manzili
- `district` - uy joylashgan tuman
- `rooms` - xonalar soni
- `size` - uy maydoni (kv.m)
- `level` - uy joylashgan qavat
- `max_levels` - jami qavatlar soni
- `price` - uy narxi

## Vazifani CRISP-DM metodologiyasi yordamida bajaring.
<img src="https://i.imgur.com/dzZnnYi.png" alt="CRISP-DM" width="800"/>

In [None]:
# Clean the two columns that hold text: `size` and `price`.
#
# Both are first converted to strings so that the `.str` accessor never meets a
# non-string value (it would silently turn it into NaN) and so that re-running
# this cell on already-cleaned float columns does not raise.
size_text = df['size'].astype(str).str.replace(',', '')
# Listings that state a land area of 1 sotka instead of a floor area are
# recorded as 100; the substitution is done on the text so it survives parsing.
size_text = size_text.mask(size_text == 'Площадьземли:1сот', '100')
df['size'] = size_text.astype(float)

price_text = df['price'].astype(str).str.replace(',', '')
# "Договорная" (negotiable) carries no numeric price -> mark it as missing.
price_text = price_text.mask(price_text == 'Договорная')
df['price'] = price_text.astype(float)

# A listing without a price cannot serve as a regression target. Filling it
# with 0 would train the model on fake zero prices, so those rows are dropped.
# Missing values in numeric predictors are left for the median imputer in the
# preprocessing pipeline.
df = df.dropna(subset=['price'])

In [None]:
# Print column dtypes and non-null counts to check the result of the cleaning step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7565 entries, 0 to 7564
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    7565 non-null   object 
 1   district    7565 non-null   object 
 2   rooms       7565 non-null   int64  
 3   size        7565 non-null   float64
 4   level       7565 non-null   int64  
 5   max_levels  7565 non-null   int64  
 6   price       7565 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 413.8+ KB


In [None]:
# Count missing values per column.
df.isnull().sum()

location      0
district      0
rooms         0
size          0
level         0
max_levels    0
price         0
dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, random_state=42, test_size=0.2)

# Separate the target (price) from the predictors of the training part.
y = train_set['price'].copy()
X_train = train_set.drop(columns=['price'])

# Numeric predictors only: everything except the two text columns.
X_num = X_train.drop(columns=['location', 'district'])

In [None]:
# Positional index of each numeric column inside X_num.
rooms_ix, size_ix, level_ix, max_levels_ix = (
    X_num.columns.get_loc(column)
    for column in ('rooms', 'size', 'level', 'max_levels')
)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  """Append two interaction features to a 2-D numeric feature array.

  The added columns are, in order:
    1. level * rooms
    2. (level * rooms) * max_levels

  Parameters
  ----------
  rooms_ix, level_ix, max_levels_ix : int
      Positions of the `rooms`, `level` and `max_levels` columns in the array
      passed to `transform`. The defaults match the numeric column order
      used in this notebook: rooms, size, level, max_levels.

  The indices are stored on the instance instead of being read from notebook
  globals, so a pickled pipeline carries them along and the transformer does
  not depend on the state of other cells.
  """

  def __init__(self, rooms_ix=0, level_ix=2, max_levels_ix=3):
    # Attribute names mirror the constructor arguments so that
    # BaseEstimator.get_params / clone work.
    self.rooms_ix = rooms_ix
    self.level_ix = level_ix
    self.max_levels_ix = max_levels_ix

  def fit(self, X, y=None):
    """Nothing to learn; return the transformer unchanged."""
    return self

  def transform(self, X):
    """Return X with the two interaction columns appended on the right."""
    num_houses_per_level = X[:, self.level_ix] * X[:, self.rooms_ix]
    num_houses_max_levels = num_houses_per_level * X[:, self.max_levels_ix]
    return np.c_[X, num_houses_per_level, num_houses_max_levels]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: fill gaps with the column median, append the combined
# attributes, then standardize every column.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups routed to each branch of the preprocessor.
cat_attribs = ['location', 'district']
num_attribs = X_num.columns.tolist()

# Numeric columns go through num_pipeline; text columns are one-hot encoded,
# and categories not seen during fitting are ignored at transform time.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

In [None]:
# Fit the preprocessing on the training predictors and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# Preview the first five prepared rows. Slice the sparse matrix first and
# densify only those rows, instead of converting the whole matrix to dense.
X_prepared[:5].toarray()

array([[-0.57746118, -0.03809286, -0.31266907, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.25131698,  0.0679467 , -0.75864437, ...,  0.        ,
         0.        ,  0.        ],
       [-1.49185026, -0.04830408, -0.75864437, ...,  1.        ,
         0.        ,  0.        ],
       [-0.57746118, -0.02081234, -0.31266907, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.25131698,  0.05066618,  0.13330623, ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least-squares linear regression.
LR_model = LinearRegression()

In [None]:
# Train the linear model on the prepared training features and the training prices.
LR_model.fit(X_prepared, y)

In [None]:
# Draw five rows for a quick sanity check of the predictions. The seed makes
# the same rows (and therefore the same table below) appear on every run.
# Note: these rows come from the training set, so this is not an out-of-sample test.
test_data = X_train.sample(5, random_state=42)
test_data

Unnamed: 0,location,district,rooms,size,level,max_levels
7223,"город Ташкент, Учтепинский район, Чиланзар-13",Учтепинский,3,65.0,1,5
4027,"город Ташкент, Мирабадский район, 1-й проезд К...",Мирабадский,4,121.0,6,10
2371,"город Ташкент, Сергелийский район, Курувчи",Сергелийский,3,90.0,3,9
3820,"город Ташкент, Мирзо-Улугбекский район, Карбасу",Мирзо-Улугбекский,3,41.0,5,5
7000,"город Ташкент, Шайхантахурский район, Гульабад",Шайхантахурский,4,90.0,3,4


In [None]:
# Look up the known prices of the sampled rows by their index labels.
test_label=y.loc[test_data.index]
test_label

7223     34200.0
4027    109989.0
2371     45000.0
3820     45000.0
7000     53000.0
Name: price, dtype: float64

In [None]:
# Apply the already-fitted preprocessing (transform only, no refit) to the
# sampled rows and show the result as a dense array.
test_data_prepared=full_pipeline.transform(test_data)
test_data_prepared.toarray()

array([[ 0.3369279 , -0.02788164, -1.20461968, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.25131698,  0.01610514,  1.02525684, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.3369279 , -0.00824469, -0.31266907, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.3369279 , -0.04673312,  0.57928154, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.25131698, -0.00824469, -0.31266907, ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
# Predict prices for the sampled rows with the linear model.
predicted_data=LR_model.predict(test_data_prepared)
predicted_data

array([ 38585.60436781, 138028.44304735,  64155.49216055,  44959.56163484,
        57177.06006416])

In [None]:
# Side-by-side table of predicted and actual prices for the sampled rows.
pd.DataFrame({'Predict':predicted_data, 'Real price':test_label})

Unnamed: 0,Predict,Real price
7223,38585.604368,34200.0
4027,138028.443047,109989.0
2371,64155.492161,45000.0
3820,44959.561635,45000.0
7000,57177.060064,53000.0


In [None]:
# Display the held-out part of the train/test split.
test_set

Unnamed: 0,location,district,rooms,size,level,max_levels,price
132,"город Ташкент, Чиланзарский район, Чиланзар 6-...",Чиланзарский,2,37.0,1,4,41000.0
3771,"город Ташкент, Сергелийский район, Сергели-I Я...",Сергелийский,1,39.0,5,5,21000.0
65,"город Ташкент, Учтепинский район, Чиланзар 15-...",Учтепинский,4,110.0,3,3,91000.0
7525,"город Ташкент, Шайхантахурский район, Алишера ...",Шайхантахурский,4,84.0,2,4,95000.0
6791,"город Ташкент, Мирабадский район, Чимкент",Мирабадский,4,100.0,3,4,88000.0
...,...,...,...,...,...,...,...
4834,"город Ташкент, Чиланзарский район, Чиланзар-9",Чиланзарский,2,50.0,2,5,36000.0
3125,"город Ташкент, Мирзо-Улугбекский район, ц-1 Бу...",Мирзо-Улугбекский,4,92.0,4,4,130000.0
347,"город Ташкент, Яшнободский район, Фергана Йули",Яшнободский,2,59.0,4,4,26500.0
6678,"город Ташкент, Сергелийский район, Массив серг...",Сергелийский,3,57.0,3,7,43000.0


In [None]:
# Predictors of the held-out set: every column except the target.
X_test = test_set.drop(columns=['price'])
X_test

Unnamed: 0,location,district,rooms,size,level,max_levels
132,"город Ташкент, Чиланзарский район, Чиланзар 6-...",Чиланзарский,2,37.0,1,4
3771,"город Ташкент, Сергелийский район, Сергели-I Я...",Сергелийский,1,39.0,5,5
65,"город Ташкент, Учтепинский район, Чиланзар 15-...",Учтепинский,4,110.0,3,3
7525,"город Ташкент, Шайхантахурский район, Алишера ...",Шайхантахурский,4,84.0,2,4
6791,"город Ташкент, Мирабадский район, Чимкент",Мирабадский,4,100.0,3,4
...,...,...,...,...,...,...
4834,"город Ташкент, Чиланзарский район, Чиланзар-9",Чиланзарский,2,50.0,2,5
3125,"город Ташкент, Мирзо-Улугбекский район, ц-1 Бу...",Мирзо-Улугбекский,4,92.0,4,4
347,"город Ташкент, Яшнободский район, Фергана Йули",Яшнободский,2,59.0,4,4
6678,"город Ташкент, Сергелийский район, Массив серг...",Сергелийский,3,57.0,3,7


In [None]:
# Target values (prices) of the held-out set.
y_test=test_set['price'].copy()
y_test

132      41000.0
3771     21000.0
65       91000.0
7525     95000.0
6791     88000.0
          ...   
4834     36000.0
3125    130000.0
347      26500.0
6678     43000.0
333      40000.0
Name: price, Length: 1513, dtype: float64

In [None]:
# Transform only: the preprocessing was fitted on the training predictors and is reused as-is here.
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
# Linear-model predictions for the held-out set.
y_predicted = LR_model.predict(X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the held-out prices and the current predictions.
lin_mse=mean_squared_error(y_test, y_predicted)

# Square root of the MSE gives the RMSE, expressed in the same units as price.
lin_rmse=np.sqrt(lin_mse)
print(lin_rmse)

403906.69749612117


In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision-tree regressor. The seed is fixed (same value as the train/test
# split) so that the fitted tree and its score are reproducible across runs.
Tree_model = DecisionTreeRegressor(random_state=42)
Tree_model.fit(X_prepared, y)

In [None]:
# Decision-tree predictions for the held-out set (rebinds y_predicted).
y_predicted = Tree_model.predict(X_test_prepared)

In [None]:
# NOTE(review): the `lin_` names are reused here although y_predicted was last
# assigned from Tree_model, so any earlier values of lin_mse / lin_rmse are overwritten.
lin_mse = mean_squared_error(y_test, y_predicted)
# Compute the RMSE
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

222636.74925989594


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest regressor. Bootstrapping and feature sampling are random, so
# the seed is fixed (same value as the train/test split) to make the fitted
# model, its score, and the file saved later reproducible.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [None]:
# Random-forest predictions for the held-out set (rebinds y_predicted).
y_predicted = RF_model.predict(X_test_prepared)
# NOTE(review): the `lin_` names are reused for the random-forest score and
# overwrite whatever they held before.
lin_mse = mean_squared_error(y_test, y_predicted)
# Compute the RMSE
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

227523.45836325985


In [None]:
import joblib

# Persist the fitted random-forest model to disk with joblib.
filename = 'RF_model.jbl'
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [None]:
# Reload the object stored at the path currently held in `filename`.
model = joblib.load(filename)

In [None]:
# Persist the fitted preprocessing pipeline as well.
# NOTE(review): this rebinds `filename`; any cell that reads `filename` after
# this one runs will get 'pipeline.jbl', not the model path.
filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)

['pipeline.jbl']