# Importing libraries

In [None]:
# !pip install xgboost
# !pip install category_encoders

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, norm
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
from sklearn.linear_model import Lasso, RidgeCV, LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from category_encoders import TargetEncoder
from lightgbm import LGBMRegressor

#import h2o
#h2o.init()
#from h2o.estimators import H2OTargetEncoderEstimator

plt.style.use('ggplot')

# Custom metric function

In [None]:
import typing
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Relative deviations within +/- THRESHOLD are not penalised at all.
THRESHOLD = 0.15
# Multiplier applied to the penalty when the prediction is below the true value.
NEGATIVE_WEIGHT = 1.1

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Custom hackathon metric for a single observation.

    The relative deviation ``(y_pred - y_true) / max(1e-8, y_true)`` is mapped
    to a penalty: zero inside the +/- THRESHOLD band, quadratic outside it, and
    capped at 9 once the deviation reaches 4 * THRESHOLD. Under-predictions
    are additionally multiplied by NEGATIVE_WEIGHT.

    :param y_true: float, actual price
    :param y_pred: float, predicted price
    :return: float, penalty for this observation
    """
    # The floor on the denominator avoids division by zero for y_true <= 0.
    rel_error = (y_pred - y_true) / np.maximum(1e-8, y_true)

    if np.abs(rel_error) <= THRESHOLD:
        return 0

    if rel_error < 0:
        # Under-prediction: weighted cap or weighted quadratic penalty.
        if rel_error <= - 4 * THRESHOLD:
            return 9 * NEGATIVE_WEIGHT
        return NEGATIVE_WEIGHT * ((rel_error / THRESHOLD) + 1) ** 2

    # Over-prediction (a NaN deviation also ends up on the capped branch).
    if rel_error < 4 * THRESHOLD:
        return ((rel_error / THRESHOLD) - 1) ** 2
    return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    """
    Mean of the per-sample custom deviation penalty.

    :param y_true: array-like of actual prices
    :param y_pred: array-like of predicted prices, aligned by position with y_true
    :return: float, average of deviation_metric_one_sample over all samples
    """
    # Coerce to NumPy so that [n] is always positional. On a pandas Series
    # y_true[n] is a label lookup, which fails or picks the wrong row once the
    # index is no longer 0..len-1 (e.g. after shuffling or filtering).
    true_values = np.asarray(y_true)
    predicted_values = np.asarray(y_pred)
    return np.array([
        deviation_metric_one_sample(true_values[n], predicted_values[n])
        for n in range(len(true_values))
    ]).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    """
    Median of the absolute error expressed as a fraction of the true value.

    :param y_true: array of actual values (used as the denominator as-is)
    :param y_pred: array of predicted values
    :return: float, median of |y_pred - y_true| / y_true
    """
    relative_abs_error = np.abs(y_pred - y_true) / y_true
    return np.median(relative_abs_error)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    """
    Compute the full set of regression quality metrics used in this notebook.

    :param y_true: array of actual values
    :param y_pred: array of predicted values
    :return: dict with keys 'mape', 'mdape', 'rmse', 'r2' and 'raif_metric'
    """
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    # RMSE is taken as an explicit square root of the MSE: the `squared=False`
    # keyword is deprecated and has been removed from recent scikit-learn
    # releases, so relying on it breaks the notebook on a current install.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

# Importing dataset

In [None]:
# Load the labelled training set. The path only resolves on Colab with Google Drive
# mounted at /content/drive; the mount cell near the top of the notebook is commented out.
train = pd.read_csv('/content/drive/MyDrive/train.csv')
train.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,osm_catering_points_in_0.001,osm_catering_points_in_0.005,osm_catering_points_in_0.0075,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_name,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.001,osm_crossing_points_in_0.005,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.01,osm_culture_points_in_0.001,osm_culture_points_in_0.005,osm_culture_points_in_0.0075,osm_culture_points_in_0.01,osm_finance_points_in_0.001,osm_finance_points_in_0.005,osm_finance_points_in_0.0075,osm_finance_points_in_0.01,osm_healthcare_points_in_0.005,osm_healthcare_points_in_0.0075,osm_healthcare_points_in_0.01,osm_historic_points_in_0.005,osm_historic_points_in_0.0075,osm_historic_points_in_0.01,osm_hotels_points_in_0.005,osm_hotels_points_in_0.0075,osm_hotels_points_in_0.01,osm_leisure_points_in_0.005,osm_leisure_points_in_0.0075,osm_leisure_points_in_0.01,osm_offices_points_in_0.001,osm_offices_points_in_0.005,osm_offices_points_in_0.0075,osm_offices_points_in_0.01,osm_shops_points_in_0.001,osm_shops_points_in_0.005,osm_shops_points_in_0.0075,osm_shops_points_in_0.01,osm_subway_closest_dist,osm_train_stop_closest_dist,osm_train_stop_points_in_0.005,osm_train_stop_points_in_0.0075,osm_train_stop_points_in_0.01,osm_transport_stop_closest_dist,osm_transport_stop_points_in_0.005,osm_transport_stop_points_in_0.0075,osm_transport_stop_points_in_0.01,per_square_meter_price,reform_count_of_houses_1000,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type,price_type
0,Пермь,,COL_0,57.998207,56.292797,4,19,35,52,0,0,0,0,0,2,4,6,3.29347,Пермь,1055397.0,0.027732,3,6,17,34,0,0,1,1,0,0,1,2,2,3,4,0,0,1,0,0,0,0,1,2,0,1,2,4,4,11,20,28,269.024986,3.368385,0,0,0,0.002864,4,13,21,139937.5,136,49,2503.0,765.0,5.762963,5.530612,1964.118519,1960.959184,Пермский край,32.0,S27289,2020-01-05,10,0
1,Шатура,,COL_1,55.574284,39.543835,3,24,37,59,0,0,0,1,0,2,2,6,43.950989,Орехово-Зуево,120184.0,0.089441,0,31,50,57,0,1,2,3,0,0,1,2,1,1,3,2,4,6,2,2,2,1,1,2,0,1,2,3,1,12,20,29,102.455451,1.4766,0,0,0,0.154661,4,10,11,60410.714286,146,37,1336.0,514.0,2.894366,3.527778,1952.321678,1957.222222,Московская область,280.0,S17052,2020-01-05,10,0
2,Ярославль,,COL_2,57.61914,39.850525,1,30,67,128,0,0,1,1,0,3,6,11,2.676293,Ярославль,603961.0,0.200995,0,15,29,53,0,1,2,2,0,0,5,9,0,1,3,0,0,0,0,0,1,2,3,6,0,1,6,9,1,16,37,80,243.361937,1.455127,0,0,0,0.118275,9,13,21,45164.761264,105,27,1883.0,573.0,6.141414,7.222222,1968.15,1973.37037,Ярославская область,297.4,S16913,2020-01-05,110,0
3,Новокузнецк,,COL_3,53.897083,87.108604,0,0,5,21,0,0,0,1,0,0,1,4,15.618563,Новокузнецк,552105.0,0.8614,0,0,0,5,0,0,0,0,0,0,0,1,0,0,3,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,4,11,295.633502,9.400855,0,0,0,1.036523,0,0,3,28805.263158,75,2,1801.0,54.0,8.581081,9.0,1992.716216,2014.0,Кемеровская область,190.0,S10148,2020-01-05,110,0
4,Москва,,COL_4,55.80259,37.48711,1,23,64,153,0,1,1,1,0,8,14,26,9.995325,Химки,232066.0,0.236744,0,14,40,78,0,0,0,0,0,0,3,9,2,7,14,0,2,2,1,1,3,3,4,9,0,8,8,12,0,8,33,89,1.110429,1.235503,0,0,0,0.235032,10,32,62,13222.591362,144,38,3090.0,619.0,7.263889,5.684211,1963.229167,1960.5,Москва,60.2,S1338,2020-01-05,10,0


In [None]:
# Load the unlabelled competition test set (same Google Drive location as the training file).
test = pd.read_csv('/content/drive/MyDrive/test.csv')
test.head()

Unnamed: 0,city,floor,id,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,osm_catering_points_in_0.001,osm_catering_points_in_0.005,osm_catering_points_in_0.0075,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_name,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.001,osm_crossing_points_in_0.005,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.01,osm_culture_points_in_0.001,osm_culture_points_in_0.005,osm_culture_points_in_0.0075,osm_culture_points_in_0.01,osm_finance_points_in_0.001,osm_finance_points_in_0.005,osm_finance_points_in_0.0075,osm_finance_points_in_0.01,osm_healthcare_points_in_0.005,osm_healthcare_points_in_0.0075,osm_healthcare_points_in_0.01,osm_historic_points_in_0.005,osm_historic_points_in_0.0075,osm_historic_points_in_0.01,osm_hotels_points_in_0.005,osm_hotels_points_in_0.0075,osm_hotels_points_in_0.01,osm_leisure_points_in_0.005,osm_leisure_points_in_0.0075,osm_leisure_points_in_0.01,osm_offices_points_in_0.001,osm_offices_points_in_0.005,osm_offices_points_in_0.0075,osm_offices_points_in_0.01,osm_shops_points_in_0.001,osm_shops_points_in_0.005,osm_shops_points_in_0.0075,osm_shops_points_in_0.01,osm_subway_closest_dist,osm_train_stop_closest_dist,osm_train_stop_points_in_0.005,osm_train_stop_points_in_0.0075,osm_train_stop_points_in_0.01,osm_transport_stop_closest_dist,osm_transport_stop_points_in_0.005,osm_transport_stop_points_in_0.0075,osm_transport_stop_points_in_0.01,reform_count_of_houses_1000,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,lat,lng,total_square,street,date,realty_type,price_type
0,Курск,1.0,COL_289284,7,55,85,117,0,0,0,0,0,2,6,10,4.101661,Курск,443212.0,0.147072,0,7,7,19,0,0,0,0,1,2,3,3,5,5,5,2,2,4,7,7,8,0,0,1,0,0,0,1,5,36,61,83,435.363535,5.636811,0,0,0,0.108668,11,15,27,184,78,1997.0,743.0,4.325,4.211268,1966.471591,1966.74026,Курская область,51.709255,36.147908,156.148996,S6983,2020-09-06,100,1
1,Сургут,1.0,COL_289305,8,70,112,140,0,0,0,0,0,7,8,13,4.084249,Сургут,380632.0,0.16248,0,18,33,50,0,0,0,1,1,2,5,6,3,6,8,1,1,1,3,5,5,3,6,7,0,3,5,8,4,51,81,99,693.447931,9.641591,0,0,0,0.091416,9,11,19,118,44,2823.0,1019.0,5.389831,5.5,1988.259259,1989.068182,Ханты-Мансийский АО,61.23324,73.462509,190.737943,S29120,2020-09-06,110,1
2,Тюмень,-1.0,COL_289318,3,28,67,122,0,0,0,0,0,9,18,34,1.291974,Тюмень,744554.0,0.075807,1,21,57,92,0,0,1,2,0,1,4,5,2,3,3,1,2,5,0,1,1,0,2,5,0,1,1,3,3,15,34,63,298.904686,1.803999,0,0,0,0.120308,6,20,29,150,51,3484.0,1332.0,7.915493,8.25,1985.880282,1991.458333,Тюменская область,57.14311,65.554573,457.118051,S23731,2020-09-06,10,1
3,Иркутск,1.0,COL_289354,5,76,139,231,0,0,0,0,0,28,52,84,0.914598,Иркутск,623562.0,0.043604,1,30,60,95,1,8,13,18,0,5,6,10,5,8,12,12,20,32,1,8,18,1,5,5,0,7,11,16,3,14,26,51,60.224709,1.338052,0,0,0,0.079332,9,18,29,252,86,1975.0,666.0,3.27686,3.012048,1947.073276,1941.657895,Иркутская область,52.28138,104.282975,66.503622,S14207,2020-09-06,100,1
4,Курск,,COL_289399,8,105,189,279,0,0,2,9,1,24,40,54,1.300135,Курск,443212.0,0.084204,1,40,72,84,1,10,12,12,3,10,14,21,2,6,6,10,14,17,1,1,1,4,5,12,0,2,12,16,1,49,98,156,432.530106,2.440972,0,0,0,0.093642,12,26,37,114,31,1044.0,394.0,4.346154,4.827586,1948.764151,1946.689655,Курская область,51.729706,36.194019,23.864915,S20658,2020-09-06,10,1


# Encoding

In [None]:
# Merge the 'Алтай' region label into 'Алтайский край' in both frames.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on train['region'] is a chained in-place update, which pandas warns about
# and which does not modify the parent frame under Copy-on-Write.
train['region'] = train['region'].replace('Алтай', 'Алтайский край')
test['region'] = test['region'].replace('Алтай', 'Алтайский край')

In [None]:
# Target-encode 'region' and 'osm_city_nearest_name'.
# A fresh encoder is fitted per column on the training rows (using the price target)
# and then applied to the test rows.
# NOTE(review): the encoders see every training row, including rows that later end up
# in the validation / hold-out split, so those scores are optimistic.
for source_col in ('region', 'osm_city_nearest_name'):
    encoder = TargetEncoder()
    train[source_col + '_encoded'] = encoder.fit_transform(train[source_col], train['per_square_meter_price'])
    test[source_col + '_encoded'] = encoder.transform(test[source_col])

# 'city' is not encoded here; it is target-encoded later from the normalised
# 'lemma_city' column.

In [None]:
# Keep the target aside, then stack train on top of test so that every
# preprocessing step below is applied to both parts at once.
y = train['per_square_meter_price'].to_numpy()

df = (
    pd.concat([train, test])
    .reset_index(drop=True)
    .drop(columns=['per_square_meter_price'])
)

df.shape, df.index

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


((282766, 78), RangeIndex(start=0, stop=282766, step=1))

# Filling in missing values

In [None]:
# Build a table of the features that contain missing values:
# absolute NaN count and its share of all rows, most affected first.
nan_counts = df.isna().sum()
nan_counts = nan_counts[nan_counts > 0].sort_values(ascending=False)
df_nan = nan_counts.to_frame('NaN count')
df_nan['NaN %'] = (df_nan['NaN count'] * 100 / len(df)).round(3)
df_nan

Unnamed: 0,NaN count,NaN %
floor,177538,62.786
reform_mean_floor_count_500,30216,10.686
reform_mean_year_building_500,29685,10.498
reform_house_population_500,27282,9.648
reform_mean_floor_count_1000,16743,5.921
reform_mean_year_building_1000,16274,5.755
reform_house_population_1000,14631,5.174
street,1612,0.57
osm_city_nearest_population,55,0.019


In [None]:
# Numeric 'reform_*' features that contain gaps.
missing = ['reform_mean_floor_count_500',
           'reform_mean_year_building_500',
           'reform_house_population_500',
           'reform_mean_floor_count_1000',
           'reform_mean_year_building_1000',
           'reform_house_population_1000']

# NOTE(review): this cell reads df['lemma_city'], which is only created in the
# "Preprocessing 'city' column" section further down, and it drops 'floor' and 'city',
# which the 'floor' and 'city' sections below still read. Run top to bottom on a fresh
# kernel it raises KeyError; the stored outputs (74 columns after one-hot encoding, no
# raw text columns) indicate it was executed after those sections. Move this cell below
# them before relying on Restart & Run All.
for feat in missing:
    # First impute with the median of the same (normalised) city ...
    df[feat] = df.groupby("lemma_city")[feat].transform(lambda x: x.fillna(x.median()))
    # ... then fall back to the global median for cities with no observed value at all.
    df[feat] = df[feat].fillna(df[feat].median())

# Missing population is filled with the smallest observed population.
# Assigned back instead of fillna(..., inplace=True) on the selected column: that chained
# in-place form is a no-op on the parent frame under pandas Copy-on-Write.
df['osm_city_nearest_population'] = df['osm_city_nearest_population'].fillna(df['osm_city_nearest_population'].min())

# Drop identifiers, raw coordinates and the raw categoricals that already have encoded counterparts.
df.drop(['street', 'id', 'date', 'lng', 'lat', 'city', 'floor', 'region', 'osm_city_nearest_name', 'lemma_city'], axis=1, inplace=True)

# Cast the two code-like columns to strings so that they are one-hot encoded later.
df['realty_type'] = df['realty_type'].astype('str')
df['price_type'] = df['price_type'].astype('str')
df['price_type'].dtype



dtype('O')

# Preprocessing 'floor' column

In [None]:
# Missing floors get the sentinel label '777' so they form their own category.
df['floor'] = df['floor'].fillna('777')

# Labels describing several floors at once (or otherwise atypical values).
# Further down these are first mapped to the marker 9999 and then replaced by 1.
# Matching happens after lower-casing but without stripping, so the leading and
# trailing spaces inside some entries are significant.
list_floor_A = ['подвал, 1', 'цоколь, 1', '1, подвал', '1-й, подвал', 'подвал,1', 
                'подвал, 1, 2, 3', '1, 2, 3, мансардный', '1-2, подвальный', 'цоколь, 1, 2,3,4,5,6', 
                '1,2,антресоль', '1, подвал, антресоль', 'подвал, 1-3', 
                'цокольный, 1,2', '1,2,3 этаж, подвал', 'подвал,1,2,3', '1,2,подвал ', '1-7', 'подвал, 1, 2', 
                '1, 2 этаж', '1,2,3, антресоль, технический этаж', '1,2,3,4, подвал', 
                ' 1, 2, антресоль', 'подвал, цоколь, 1 этаж', '1, антресоль', '1-3', '1, 2.', 'подвал , 1 ', 
                '1, 2', 'подвал, 1,2,3', '1 + подвал (без отделки)', '1-й, 2-й', '1 этаж, подвал', '1, цоколь', 
                'подвал, 1-7, техэтаж', '1, 2, 3', 'цоколь, 1,2(мансарда)', 'подвал, 1-4 этаж', 'подва, 1.2 этаж',
                '1.2', '1-3 этажи, цоколь (188,4 кв.м), подвал (104 кв.м)', ' 1-2, подвальный', 
                'подвал, 1 и 4 этаж', 'подвал,1,2,3,4,5', '1,2', '1,2', '1,2,3', '1,2,3,4', 
                '1,2,3,4,5', '1,2 ', '5, мансарда', '3, мансарда (4 эт)', 'подвал, 2', '2, 3, 4, тех.этаж', '3, 4',
               'технический этаж,5,6', '3 этаж, мансарда (4 этаж)', '2,3', '4, 5', '3 этаж, мансарда (4 этаж)',
               '3, мансарда', '3,4', '7,8', 'b', 'a', '3, мансарда', 'подвал, 3. 4 этаж', '2, 3']

# Textual variants that all denote the first floor (mapped to '1' further down).
list_floor_1 = ['1 (цокольный этаж)', 'фактически на уровне 1 этажа', '1 (по док-м цоколь)', '1 этаж', '1-й']

# Remaining textual labels and the integer floor each one is replaced with.
list_floor_2 = {'тех.этаж (6)':6, 'техническое подполье':-1, '2-й':2, '5(мансарда)':5, 'подвал':-1, 
                'цоколь':-1, 'антресоль':2, 'мезонин':2, 'мансарда (4эт)':4, '3 (антресоль)':3, 
                'мансарда':2, '3 этаж':3, '4 этаж':4, '5 этаж':5,
                'цокольный':-1 }

# Canonicalise the raw 'floor' labels: lower-case text, run it through the three
# lookup tables, convert to integers and finally store the result as string categories.
floor_labels = df['floor'].astype('str').str.lower()


def _floor_token(label):
    """Translate one lower-cased label using the lookup tables, in priority order."""
    if label in list_floor_1:
        return '1'
    if label in list_floor_A:
        return '9999'          # temporary marker for multi-floor / atypical labels
    if label in list_floor_2:
        return str(list_floor_2[label])
    return label


df['floor'] = pd.to_numeric(floor_labels.map(_floor_token)).astype(int)

# Median over the "ordinary" floors (-1..8); computed for reference, not used below.
ordinary_floors = df.loc[df['floor'].isin([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8]), 'floor']
med_floor = int(ordinary_floors.median())

# Multi-floor / atypical labels (marker 9999) are treated as first floor.
df['floor'] = df['floor'].replace(9999, 1).astype('str')
df['floor'].unique()

array(['777', '3', '4', '-1', '1', '14', '2', '8', '-2', '6', '10', '5',
       '16', '19', '7', '9', '58', '24', '18', '26', '17', '48', '11',
       '-3', '15', '22', '60', '12', '21', '35', '28', '38', '39', '13',
       '81', '44', '82', '25', '45', '47', '23', '37', '29', '113', '78',
       '42', '69', '27', '46', '53', '80', '70', '76', '64', '30', '73',
       '77', '52', '67', '65', '20', '40', '49', '75', '93', '94', '91',
       '72', '79', '84', '92', '33', '66', '90', '31', '36', '61', '71',
       '68', '51', '97', '43', '95', '85', '50', '0', '62', '54', '74',
       '57', '41', '34', '59', '56', '123', '55', '83'], dtype=object)

In [None]:
# Copy the cleaned 'floor' labels back: the first len(y) rows of df are the
# training rows, the remainder (re-indexed from 0) are the test rows.
n_train = len(y)
train['floor'] = df['floor'].iloc[:n_train]
test['floor'] = df['floor'].iloc[n_train:].reset_index(drop=True)

In [None]:
# Target-encode the cleaned 'floor' labels: fit on the training rows against the
# price target, apply to the test rows, then stack both parts in df's row order.
encoder = TargetEncoder()
a = encoder.fit_transform(train['floor'], train['per_square_meter_price'])
b = encoder.transform(test['floor'])
df['floor_encoded'] = pd.concat([a, b], ignore_index=True)

# Preprocessing 'city' column

In [None]:
# Text normalisation of 'city': set-up (imports, stop-word sets, tokenizer).

# Frequency table of the raw city names (for inspection only).
city = pd.DataFrame(df['city'].value_counts())

from gensim.models import Word2Vec
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from nltk.tokenize import RegexpTokenizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


# NOTE(review): the Russian list below is loaded but never used by the filtering step;
# only the gensim STOPWORDS set (extended with two English contractions) is applied.
stop_nltk = stopwords.words('russian')
STOPWORDS = STOPWORDS.union(set(["don't", "i'm"]))

# Raw string: '\w' inside a normal string literal is an invalid escape sequence,
# which Python warns about and will eventually reject.
reg_tok = RegexpTokenizer(r'\w+')

def clean_and_lemmatize(text):
    """
    Normalise a free-text value into a lower-cased, whitespace-separated token string.

    Steps: lower-case; blank out URLs, @mentions, #hashtags, digit runs and
    <...> markup; tokenise on word characters; drop stop words and tokens of
    two characters or fewer; pass the result through the WordNet lemmatizer;
    collapse whitespace.

    :param text: str, raw text (here: a city name)
    :return: str, cleaned text
    """
    # cleaning
    text = text.lower()
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    # Fixed: the pattern used to be 'r<.*?>' (raw prefix typed inside the quotes),
    # which only matched a literal "r<...>" and therefore never stripped markup.
    text = re.sub(r'<.*?>', ' ', text)
    tokens = reg_tok.tokenize(text)

    # filtering
    text = " ".join(word for word in tokens if word not in STOPWORDS and len(word) > 2)

    # lemmatization
    # NOTE(review): the lemmatizer is called once on the whole joined string rather than
    # per token, and WordNet is an English resource, so this is presumably a no-op for
    # Russian city names. Left unchanged to keep the produced keys stable.
    text = WordNetLemmatizer().lemmatize(text)
    text = ' '.join(text.split())

    return text

# Normalise every city name and keep the result next to the raw value.
df['lemma_city'] = df['city'].map(clean_and_lemmatize)

# Plain Python list of the normalised names, one entry per row (duplicates included).
list_unique_city = df['lemma_city'].tolist()

# Visual check of raw vs. normalised names on the last 100 rows.
df[['city', 'lemma_city']].tail(100)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,city,lemma_city
282666,Самара,самара
282667,Иркутск,иркутск
282668,Ангарск,ангарск
282669,Нижний Новгород,нижний новгород
282670,Томск,томск
...,...,...
282761,Красноярск,красноярск
282762,Томск,томск
282763,Калуга,калуга
282764,Нижний Новгород,нижний новгород


In [None]:
# Copy the normalised city names back: the first len(y) rows of df are the
# training rows, the remainder (re-indexed from 0) are the test rows.
n_train = len(y)
train['lemma_city'] = df['lemma_city'].iloc[:n_train]
test['lemma_city'] = df['lemma_city'].iloc[n_train:].reset_index(drop=True)

In [None]:
# Target-encode the normalised city names: fit on the training rows against the
# price target, apply to the test rows, then stack both parts in df's row order.
encoder = TargetEncoder()
a = encoder.fit_transform(train['lemma_city'], train['per_square_meter_price'])
b = encoder.transform(test['lemma_city'])
df['city_encoded'] = pd.concat([a, b], ignore_index=True)

In [None]:
# train[train['price_type'] == 0]['per_square_meter_price'].describe()

In [None]:
# train[train['price_type'] == 1]['per_square_meter_price'].describe()

# Functions to split each city into zones by 'lng' and 'lat'

In [None]:
# lng_lat = df.groupby('city')[['lng', 'lat']].agg(['min', 'max'])
# lng_lat['city_lng'] = lng_lat['lng']['max'] - lng_lat['lng']['min']
# lng_lat['city_lat'] = lng_lat['lat']['max'] - lng_lat['lat']['min']
# df = pd.merge(df, lng_lat, on='city')
# df.rename(columns={('lng', 'min'): 'lng_min', ('lng', 'max'): 'lng_max', ('lat', 'min'): 'lat_min', ('lat', 'max'): 'lat_max', ('city_lng', ''): 'city_lng', ('city_lat', ''): 'city_lat'}, inplace = True)
# n=3

# def lng_index(x):
#     for lng, city in zip(df['lng_min'], df['city_lng']):
#         if (x >= lng) and (x <= (lng + city / n )):
#             return 1
#         elif (x >= lng + city / n ) and (x <= lng + city / n * (n-1)):
#             return 2
#         else:
#             return 3
# n=3

# def lat_index(x):
#     for lng, city in zip(df['lat_min'], df['city_lat']):
#         if (x >= lng) and (x <= (lng + city / n )):
#             return 1
#         elif (x >= lng + city / n ) and (x <= lng + city / n * (n-1)):
#             return 2
#         else:
#             return 3
# df['Ing_index'] = df['lng'].apply(lng_index)
# df['Iat_index'] = df['lat'].apply(lat_index)

In [None]:
# df_nan = pd.DataFrame(df.loc[:, df.isna().any()].isna().sum().sort_values(ascending=False), columns=['NaN count'])
# df_nan['NaN %'] =  df_nan.apply(lambda x: round(x * 100 / len(df), 3))
# df_nan

In [None]:
# df.groupby('city')['reform_house_population_1000'].median().isna().sum()

In [None]:
# df_nan = pd.DataFrame(df.loc[:, df.isna().any()].isna().sum().sort_values(ascending=False), columns=['NaN count'])
# df_nan['NaN %'] =  df_nan.apply(lambda x: round(x * 100 / len(df), 3))
# df_nan

In [None]:
# df.dtypes.unique(), len(df.columns)

In [None]:
# skewed_features = df[df.select_dtypes(include=['float64', 'int64']).columns].apply(lambda x: skew(x)).sort_values(ascending=False)
# pd.DataFrame({'Skew' :skewed_features}) 
# df[skewed_features[(skewed_features > 1) | (skewed_features < -1)].index] = np.log1p(df[skewed_features[(skewed_features > 1) | (skewed_features < -1)].index])

In [None]:
# skewed_features[(skewed_features > 1) | (skewed_features < -1)]

# One-Hot-Encoding

In [None]:
# One-hot encode the remaining non-numeric columns of df.
# NOTE(review): the stored output (74 columns, only price_type_* / realty_type_* dummies)
# implies the raw text columns had already been dropped when this ran — confirm the
# cell order before re-running.
df_dummies = pd.get_dummies(df)

In [None]:
df_dummies.shape

(282766, 74)

In [None]:
df.columns

Index(['osm_amenity_points_in_0.001', 'osm_amenity_points_in_0.005',
       'osm_amenity_points_in_0.0075', 'osm_amenity_points_in_0.01',
       'osm_building_points_in_0.001', 'osm_building_points_in_0.005',
       'osm_building_points_in_0.0075', 'osm_building_points_in_0.01',
       'osm_catering_points_in_0.001', 'osm_catering_points_in_0.005',
       'osm_catering_points_in_0.0075', 'osm_catering_points_in_0.01',
       'osm_city_closest_dist', 'osm_city_nearest_name_encoded',
       'osm_city_nearest_population', 'osm_crossing_closest_dist',
       'osm_crossing_points_in_0.001', 'osm_crossing_points_in_0.005',
       'osm_crossing_points_in_0.0075', 'osm_crossing_points_in_0.01',
       'osm_culture_points_in_0.001', 'osm_culture_points_in_0.005',
       'osm_culture_points_in_0.0075', 'osm_culture_points_in_0.01',
       'osm_finance_points_in_0.001', 'osm_finance_points_in_0.005',
       'osm_finance_points_in_0.0075', 'osm_finance_points_in_0.01',
       'osm_healthcare_point

In [None]:
# Undo the earlier concatenation: the first len(y) encoded rows are the labelled
# training rows, everything after them belongs to the competition test set.
n_train = len(y)
train_encode = df_dummies.iloc[:n_train]
test_encode = df_dummies.iloc[n_train:]
train_encode.head()

Unnamed: 0,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,osm_catering_points_in_0.001,osm_catering_points_in_0.005,osm_catering_points_in_0.0075,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_name_encoded,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.001,osm_crossing_points_in_0.005,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.01,osm_culture_points_in_0.001,osm_culture_points_in_0.005,osm_culture_points_in_0.0075,osm_culture_points_in_0.01,osm_finance_points_in_0.001,osm_finance_points_in_0.005,osm_finance_points_in_0.0075,osm_finance_points_in_0.01,osm_healthcare_points_in_0.005,osm_healthcare_points_in_0.0075,osm_healthcare_points_in_0.01,osm_historic_points_in_0.005,osm_historic_points_in_0.0075,osm_historic_points_in_0.01,osm_hotels_points_in_0.005,osm_hotels_points_in_0.0075,osm_hotels_points_in_0.01,osm_leisure_points_in_0.005,osm_leisure_points_in_0.0075,osm_leisure_points_in_0.01,osm_offices_points_in_0.001,osm_offices_points_in_0.005,osm_offices_points_in_0.0075,osm_offices_points_in_0.01,osm_shops_points_in_0.001,osm_shops_points_in_0.005,osm_shops_points_in_0.0075,osm_shops_points_in_0.01,osm_subway_closest_dist,osm_train_stop_closest_dist,osm_train_stop_points_in_0.005,osm_train_stop_points_in_0.0075,osm_train_stop_points_in_0.01,osm_transport_stop_closest_dist,osm_transport_stop_points_in_0.005,osm_transport_stop_points_in_0.0075,osm_transport_stop_points_in_0.01,reform_count_of_houses_1000,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region_encoded,total_square,floor_encoded,city_encoded,price_type_0,price_type_1,realty_type_10,realty_type_100,realty_type_11
0
0,4,19,35,52,0,0,0,0,0,2,4,6,3.29347,51692.216738,1055397.0,0.027732,3,6,17,34,0,0,1,1,0,0,1,2,2,3,4,0,0,1,0,0,0,0,1,2,0,1,2,4,4,11,20,28,269.024986,3.368385,0,0,0,0.002864,4,13,21,136,49,2503.0,765.0,5.762963,5.530612,1964.118519,1960.959184,47830.246446,32.0,88034.154974,58350.711636,1,0,1,0,0
1,3,24,37,59,0,0,0,1,0,2,2,6,43.950989,42422.020437,120184.0,0.089441,0,31,50,57,0,1,2,3,0,0,1,2,1,1,3,2,4,6,2,2,2,1,1,2,0,1,2,3,1,12,20,29,102.455451,1.4766,0,0,0,0.154661,4,10,11,146,37,1336.0,514.0,2.894366,3.527778,1952.321678,1957.222222,82125.09678,280.0,88034.154974,45571.086844,1,0,1,0,0
2,1,30,67,128,0,0,1,1,0,3,6,11,2.676293,50752.546076,603961.0,0.200995,0,15,29,53,0,1,2,2,0,0,5,9,0,1,3,0,0,0,0,0,1,2,3,6,0,1,6,9,1,16,37,80,243.361937,1.455127,0,0,0,0.118275,9,13,21,105,27,1883.0,573.0,6.141414,7.222222,1968.15,1973.37037,48571.441864,297.4,88034.154974,54864.870571,1,0,0,0,1
3,0,0,5,21,0,0,0,1,0,0,1,4,15.618563,36456.831937,552105.0,0.8614,0,0,0,5,0,0,0,0,0,0,0,1,0,0,3,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,4,11,295.633502,9.400855,0,0,0,1.036523,0,0,3,75,2,1801.0,54.0,8.581081,9.0,1992.716216,2014.0,38238.030421,190.0,88034.154974,38278.427315,1,0,0,0,1
4,1,23,64,153,0,1,1,1,0,8,14,26,9.995325,211724.219546,232066.0,0.236744,0,14,40,78,0,0,0,0,0,0,3,9,2,7,14,0,2,2,1,1,3,3,4,9,0,8,8,12,0,8,33,89,1.110429,1.235503,0,0,0,0.235032,10,32,62,144,38,3090.0,619.0,7.263889,5.684211,1963.229167,1960.5,296631.087837,60.2,88034.154974,296617.454315,1,0,1,0,0


# Scaling data

In [None]:
# Standardise the features: the scaler is fitted on the labelled rows only
# and the same fitted statistics are reused for the competition test rows.
scaled_data = StandardScaler() 
scaled_X_train = scaled_data.fit_transform(train_encode) # fit on and scale the labelled (train) rows
scaled_X_test = scaled_data.transform(test_encode) # scale the competition test rows with the train statistics

In [None]:
# The models are trained on log1p(price); predictions therefore have to be
# mapped back with np.expm1 before they are interpreted as prices.
y_log = np.log1p(y)
y_log

array([11.84895832, 11.00893831, 10.71809458, ..., 10.48458096,
       10.56504185, 10.62948943])

# Train / validation / test split

In [None]:
# 60 / 20 / 20 split: first hold out 40% of the labelled rows, then cut that
# hold-out in half to obtain the validation and the test part.
# test_X / test_y are reused: after the first call they hold the whole 40%
# hold-out, after the second call only the final test part.
train_X, test_X, train_y, test_y = train_test_split(scaled_X_train, y_log, random_state=1, test_size=0.4)
valid_X, test_X, valid_y, test_y = train_test_split(test_X, test_y, random_state=1, test_size=0.5)
print(train_X.shape, valid_X.shape, test_X.shape)
print(train_y.shape, valid_y.shape, test_y.shape)

(167875, 74) (55958, 74) (55959, 74)
(167875,) (55958,) (55959,)


# Auto ML

In [None]:
# !pip install flaml
# !pip install automl

In [None]:
import flaml
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score

automl = AutoML()
settings = {
    "time_budget": 120,                                        # search budget passed to FLAML
    "metric": 'mape',                                          # alternatives: 'mae', 'mse', 'r2'
    "estimator_list": ['lgbm', 'xgboost', 'catboost', 'rf'],   # candidate learners
    "task": 'regression'}
# ** unpacks the settings dict into keyword arguments.
automl.fit(X_train=train_X, y_train=train_y, X_val=valid_X, y_val=valid_y, **settings)

print('Лучшие гиперпараметры:', automl.best_config)
print('Лучшая метрика: {}'.format((automl.best_loss)))
print('Время на обучение: {} s'.format(automl.best_config_train_time))

# Evaluate the best model on the untouched hold-out part.
y_pred = automl.predict(test_X)
# Fixed: metrics_stat expects (y_true, y_pred), but was called as (y_pred, test_y) and on
# log1p-transformed values. The price metrics (MAPE, the custom deviation metric) are only
# meaningful in the original price space, so both arrays are mapped back with expm1 and
# passed in the correct order.
print('raif_metric', '=', (metrics_stat(np.expm1(test_y), np.expm1(y_pred))))

[flaml.automl: 09-25 16:45:16] {1431} INFO - Evaluation method: holdout
[flaml.automl: 09-25 16:45:16] {1477} INFO - Minimizing error metric: mape
[flaml.automl: 09-25 16:45:16] {1514} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost', 'catboost', 'rf']
[flaml.automl: 09-25 16:45:16] {1746} INFO - iteration 0, current learner lgbm
[flaml.automl: 09-25 16:45:16] {1931} INFO -  at 0.4s,	best lgbm's error=0.0757,	best lgbm's error=0.0757
[flaml.automl: 09-25 16:45:16] {1746} INFO - iteration 1, current learner lgbm
[flaml.automl: 09-25 16:45:17] {1931} INFO -  at 0.6s,	best lgbm's error=0.0757,	best lgbm's error=0.0757
[flaml.automl: 09-25 16:45:17] {1746} INFO - iteration 2, current learner lgbm
[flaml.automl: 09-25 16:45:17] {1931} INFO -  at 0.9s,	best lgbm's error=0.0642,	best lgbm's error=0.0642
[flaml.automl: 09-25 16:45:17] {1746} INFO - iteration 3, current learner xgboost
[flaml.automl: 09-25 16:45:17] {1931} INFO -  at 1.1s,	best xgboost's error=0.6226,	best lgbm's er

Лучшие гиперпараметры: {'n_estimators': 112, 'num_leaves': 94, 'min_child_samples': 4, 'learning_rate': 0.09348689572544734, 'log_max_bin': 7, 'colsample_bytree': 0.5967846088487322, 'reg_alpha': 0.006958608037974516, 'reg_lambda': 0.001895876878997586, 'FLAML_sample_size': 167875}
Лучшая метрика: 0.04750939436003001
Время на обучение: 21.883225679397583 s
raif_metric = {'mape': 0.04459696339846211, 'mdape': 0.03061000769740277, 'rmse': 0.710851683628062, 'r2': 0.47121746842428136, 'raif_metric': 0.023312913617357624}


In [None]:
# Predict the competition test set and convert from log1p space back to prices.
real_y_pred = automl.predict(scaled_X_test)
test['per_square_meter_price'] = np.expm1(real_y_pred)
# Write the submission file. DataFrame.to_csv returns None when a path is given,
# so `final` is always None here.
final = test[['id','per_square_meter_price']].to_csv('sub3.csv', index=False)