In [1]:
import pandas as pd
import math
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

def get_rtc(arr):
    cnt = {}
    res, mx = '', 0
    for el in arr:
        cnt[el] = cnt.get(el, 0) + 1
        if cnt[el] > mx:
            mx = cnt[el]
            res = el
    if res == 'good' or res == 'excellent':
        return 'хорошее'
    elif res == 'medium':
        return 'среднее'
    elif res == 'bad':
        return 'требует ремонта'
    return 'черновая отделка'

conditions = {
    'terrible' : math.e,
    'bad' : math.e ** 2,
    'medium' : math.e ** 6,
    'good' : math.e ** 10, 
    'excellent': math.e ** 13
}

In [2]:
df = pd.read_csv('almaty.csv')
idf = pd.read_csv('image_results.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,kv_id,price,room_cnt,is_72025,is_mortgage,is_installment,is_pledged,...,is_secure,is_coded_lock,is_window_bars,is_video_intercom,is_alarm,is_door_man,floor_lvl,floor_cnt,is_rough,is_new_building
0,25018,71.530915,51.121582,1274212000.0,14500000.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0
1,25019,76.924588,43.246318,1269498000.0,60000000.0,3.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,13.0,0.0,0.0
2,25020,76.883969,43.217514,1275106000.0,77000000.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,4.0,1.0,1.0
3,25021,76.932299,43.255219,1273420000.0,67125000.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,12.0,1.0,1.0
4,25022,76.774659,43.214749,1278340000.0,25500000.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,10.0,0.0,0.0


In [52]:
data = {}

for _, row in df.iterrows():
    rooms = idf[idf['kv_id'] == row['kv_id']]
    if row['kv_id'] not in data.keys():
        data[row['kv_id']] = []
    for _, el in rooms.iterrows():
        data[row['kv_id']].append(el['condition'])

In [53]:
for _, row in df.iterrows():
    rtc = row['renovation_type_code']
    idx = row['kv_id']
    if not isinstance(rtc, float) or idx not in data.keys():
        continue
    df.at[df[df['kv_id'] == idx].index[0], 'renovation_type_code'] = get_rtc(data[idx])

In [54]:
df['condition'] = 0
for _, row in idf.iterrows():
    try:
        k = conditions[row['condition']] * row['condition_percentage'] / 100.0
        df.at[df[df['kv_id'] == row['kv_id']].index[0], 'condition'] += k
    except:
        pass

In [55]:
cols = ['toilet_type_code', 'door_type_code', 'internet_type_code', 'parking_type_code', 'floor_type_code', 'building_type_code', 'renovation_type_code', 'balcony_type_code', 'furniture_type_code', 'phone_type_code', 'district_code', 'town_code', 'area_code']
df['is_72025'].fillna(0.0, inplace=True)
df['is_mortgage'].fillna(0.0, inplace=True)
df['is_installment'].fillna(0.0, inplace=True)
df['is_pledged'].fillna(0.0, inplace=True)
df['celling_height'].fillna(df['celling_height'].mode()[0], inplace=True)
df['is_priv_hostel'].fillna(0.0, inplace=True)
df['built_year'].fillna(df['built_year'].median(), inplace=True)
df = df.drop(['microdistrict_code'], axis=1)
df['kitchen_square_m'].fillna(df['kitchen_square_m'].mode()[0], inplace=True)
df['floor_lvl'].fillna(df['floor_lvl'].median(), inplace=True)
df['floor_cnt'].fillna(df['floor_cnt'].median(), inplace=True)
df['condition'] = (df['condition'] - df['condition'].mean()) / df['condition'].std()

for col in cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21738 entries, 0 to 21737
Data columns (total 42 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            21738 non-null  int64  
 1   longitude             21738 non-null  float64
 2   latitude              21738 non-null  float64
 3   kv_id                 21738 non-null  float64
 4   price                 21738 non-null  float64
 5   room_cnt              21738 non-null  float64
 6   is_72025              21738 non-null  float64
 7   is_mortgage           21738 non-null  float64
 8   is_installment        21738 non-null  float64
 9   is_pledged            21738 non-null  float64
 10  toilet_type_code      21738 non-null  object 
 11  door_type_code        21738 non-null  object 
 12  internet_type_code    21738 non-null  object 
 13  parking_type_code     21738 non-null  object 
 14  floor_type_code       21738 non-null  object 
 15  celling_height     

In [57]:
X = df.drop(['price', 'kv_id', 'Unnamed: 0'], axis=1)
y = df['price']

In [58]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_X = X.drop(cols, axis=1)
train_cols = pd.DataFrame(OH_encoder.fit_transform(X[cols]))
train_cols.index = X.index
OH_X = pd.concat([OH_X, train_cols], axis=1)



In [59]:
model = XGBRegressor(n_estimators=3000, learning_rate=0.05)
scores = cross_val_score(model, OH_X, y, cv=4, scoring='neg_mean_absolute_percentage_error')

In [60]:
print(scores)
print((-100 * scores).mean())

[-0.10096834 -0.0876288  -0.08909642 -0.08721171]
9.122631761978628
