In [None]:
import pandas as pd
from tqdm import tqdm
import plotly.graph_objects as go

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

import torch as th
import torch.nn as nn
import torch.optim as optim

In [None]:
# Read the competition training table and preview its first rows.
train_csv_path = '/kaggle/input/dl-contest/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,id,Тип,Подтверждено риэлтором,Количество розеток,Долгота,Количество зеркал,Цвет стен,Ремонтируется,Перепродажа,Готово к сдаче,Кол-во комнат,Площадь (фут²),Адрес,Широта,Опубликовано,Цена
0,0,BHK,0,7,12.96991,0,Зеленый,0,1,1,3,1510.269835,"Amruthnagar,Bangalore",77.59796,Owner,75.0
1,1,BHK,0,1,21.158795,0,Золотистый,0,1,1,3,1365.103003,"Shankar Nagar,Amravati",77.317542,Owner,55.0
2,2,BHK,0,1,30.689021,0,Черный,0,1,1,2,1200.0,"Mdc Sector-4,Panchkula",76.860403,Dealer,79.5
3,3,BHK,0,6,28.385415,0,Серый,0,1,1,3,1785.469029,"Sector-82A Gurgaon,Gurgaon",76.964613,Dealer,130.0
4,4,BHK,0,4,18.972317,2,Фиолетовый,0,1,1,3,2500.0,"Breach Candy,Lalitpur",72.806327,Dealer,2550.0


In [None]:
# The address is stored as "<district>,<city>"; doubled commas are collapsed
# before splitting. Parse it once and reuse the pieces for both new columns.
address_parts = data['Адрес'].str.replace(',,', ',').str.split(',')
data['Город'] = address_parts.str[1].str.lower()
data['Район'] = address_parts.str[0].str.lower()

# Remove the raw address together with the columns that are not used as features.
unused_columns = ['Адрес', 'id', 'Готово к сдаче', 'Цвет стен',
                  'Тип', 'Количество розеток', 'Количество зеркал']
data = data.drop(columns=unused_columns)

# Hold out 1% of the rows for evaluation; the remaining 99% are used for fitting.
features = data.drop(columns=['Цена'])
target = data['Цена']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, train_size=0.99, random_state=444
)

# Re-attach the target to the training features so that rows can be filtered
# by price while keeping features and target aligned.
train = pd.concat([X_train, pd.DataFrame(y_train)], axis=1)

In [None]:
# Drop training rows whose price lies outside the 1.5 * IQR fences.
# Only the training part is filtered; the hold-out split is left untouched.
train_price = train['Цена']
Q1, Q3 = train_price.quantile(0.25), train_price.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# `between` is inclusive on both ends by default.
train = train[train_price.between(lower_bound, upper_bound)]
y_train = train['Цена']
X_train = train.drop(columns='Цена')

In [None]:
# One-hot encode the three categorical columns (categories unseen during fit
# are ignored at transform time) and robust-scale every remaining column.
encoder = OneHotEncoder(handle_unknown='ignore')

categorical_columns = ["Район", "Опубликовано", "Город"]

tr = ColumnTransformer(
    transformers=[(column, encoder, [column]) for column in categorical_columns],
    remainder=RobustScaler(),
)
tr = tr.fit(X_train)

In [None]:
# Encode both splits with the fitted transformer, densify the sparse result
# and turn features and targets into float32 tensors.
X_train = th.tensor(tr.transform(X_train).toarray(), dtype=th.float32)
X_test = th.tensor(tr.transform(X_test).toarray(), dtype=th.float32)

y_train, y_test = (
    th.tensor(y_train.values, dtype=th.float32),
    th.tensor(y_test.values, dtype=th.float32),
)

# Show the resulting matrix sizes as the cell output.
X_train.shape, X_test.shape

(torch.Size([19509, 5160]), torch.Size([220, 5160]))

In [None]:
# Train a small fully connected regressor with full-batch updates: every epoch
# performs a single optimisation step on the entire training matrix and then
# records the MAPE (in percent) on both the training and the hold-out split.
th.manual_seed(444)
lr = 0.0005
epochs = 2843

# Take the input width from the encoded matrix instead of hard-coding it: the
# number of one-hot columns depends on the categories seen when fitting `tr`.
n_features = X_train.shape[1]

model = nn.Sequential(

        nn.Linear(n_features, 50),
        nn.ReLU(),
        nn.BatchNorm1d(num_features=50),
        nn.Dropout(p=0.1),

        nn.Linear(50, 20),
        nn.ReLU(),
        nn.BatchNorm1d(num_features=20),
        nn.Dropout(p=0.1),

        nn.Linear(20, 1),
)

# The network is optimised on MAE (L1); MAPE is only monitored.
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr)

train_losses = []  # per-epoch training MAPE, %
test_losses = []   # per-epoch hold-out MAPE, %
# [lowest hold-out MAPE so far, epoch at which it was reached]
best = [float('inf'), 0]

for epoch in tqdm(range(epochs + 1)):

    # Optimisation step (dropout active, batch-norm uses batch statistics).
    model.train()
    train_pred = model(X_train)
    train_loss = criterion(train_pred.flatten(), y_train)
    train_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Training MAPE is measured on the predictions made before this step.
    train_mape = mean_absolute_percentage_error(y_train, train_pred.detach().flatten()) * 100
    train_losses.append(train_mape)

    # Hold-out evaluation: no gradients are needed, so skip graph construction.
    model.eval()
    with th.no_grad():
        test_pred = model(X_test)
    test_mape = mean_absolute_percentage_error(y_test, test_pred.flatten()) * 100
    test_losses.append(test_mape)

    if test_mape < best[0]:
        best = [test_mape, epoch]

# NOTE: only the score and epoch of the best hold-out result are kept; `model`
# itself holds the weights of the final epoch.
print(f'BEST MAPE {round(best[0], 5)} on epoch {best[1]}')

# Both curves show MAPE in percent, one point per epoch.
fig = go.Figure([go.Scatter(name = 'Train Loss', x = th.arange(epochs + 1), y = train_losses),
                 go.Scatter(name = 'Test Loss', x = th.arange(epochs + 1), y = test_losses)])
fig.show()

100%|██████████| 2844/2844 [15:44<00:00,  3.01it/s]


BEST MAPE 18.17532 on epoch 2843


In [None]:
# Build the submission: apply the same feature engineering as for the training
# data, predict with the trained network and write id / price pairs to CSV.
test = pd.read_csv('/kaggle/input/dl-contest/test.csv')

# Keep the real identifiers before the column is dropped, so the submission
# does not rely on ids happening to equal the row positions.
test_ids = test['id'].tolist()

# Parse the "<district>,<city>" address once (doubled commas collapsed first).
address_parts = test['Адрес'].str.replace(',,', ',').str.split(',')
test['Город'] = address_parts.str[1].str.lower()
test['Район'] = address_parts.str[0].str.lower()

test = test.drop(columns=['Адрес'])
test = test.drop(columns=['id', 'Готово к сдаче', 'Цвет стен', 'Тип',
                          'Количество розеток', 'Количество зеркал'])
test = tr.transform(test)
X_test_normalized = th.tensor(test.toarray(), dtype=th.float32)

# Inference only: evaluation mode and no autograd graph.
model.eval()
with th.no_grad():
    test_pred = model(X_test_normalized)

# One prediction per row; `tolist()` yields plain Python floats.
df = pd.DataFrame({"id": test_ids, "Цена": test_pred.flatten().tolist()})
df.to_csv('/kaggle/working/submission.csv', index=False, encoding='utf-8')
df.head()

Unnamed: 0,id,Цена
0,0,22.361012
1,1,172.62149
2,2,32.483524
3,3,29.677944
4,4,93.7285
