In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from supplemental_english import REGION_CODES, GOVERNMENT_CODES
import sys
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [21]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,plate,date,price
0,1,X059CP797,2024-12-26 00:00:00,65000
1,2,Y800MH790,2024-07-12 21:31:37,100000
2,3,A212TX77,2024-04-18 00:00:00,290000
3,4,P001AY199,2025-01-03 00:27:15,680000
4,5,P001AY199,2025-01-10 09:32:41,750000


In [23]:
train.shape

(51635, 4)

In [25]:
train.isnull().sum()

id       0
plate    0
date     0
price    0
dtype: int64

In [27]:
train.duplicated().sum()

0

In [29]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,plate,date,price
0,51636,P700TT790,2025-01-27 00:00:00,
1,51637,M081TX797,2025-02-10 00:00:00,
2,51638,T333HX777,2025-02-11 00:00:00,
3,51639,H744BH977,2025-02-03 00:00:00,
4,51640,X066EM777,2025-02-12 00:00:00,


In [31]:
test.shape

(7695, 4)

In [33]:
test.isnull().sum()

id          0
plate       0
date        0
price    7695
dtype: int64

In [35]:
test.duplicated().sum()

0

In [37]:
file_path = os.path.abspath('supplemental_english.py') 
sys.path.append(file_path)

In [39]:
max_len = max(len(v) for v in REGION_CODES.values())

In [43]:
for k in REGION_CODES:
    while len(REGION_CODES[k]) < max_len:
        REGION_CODES[k].append(None)

In [45]:
region_codes_df = pd.DataFrame(REGION_CODES)
region_codes_df

Unnamed: 0,Republic of Adygea,Altai Republic,Republic of Bashkortostan,Republic of Buryatia,Republic of Dagestan,Donetsk People's Republic,Republic of Ingushetia,Kabardino-Balkarian Republic,Republic of Kalmykia,Karachay-Cherkess Republic,...,Moscow,Saint Petersburg,Sevastopol,Jewish Autonomous Oblast,Nenets Autonomous Okrug,Khanty-Mansi Autonomous Okrug,Chukotka Autonomous Okrug,Yamalo-Nenets Autonomous Okrug,Baikonur,Occupational Administration of Kharkiv Oblast
0,1.0,4.0,2.0,3.0,5.0,80.0,6.0,7.0,8.0,9.0,...,77,78.0,92.0,79.0,83.0,86.0,87.0,89.0,94.0,188.0
1,,,102.0,,,180.0,,,,,...,97,98.0,,,,186.0,,,,
2,,,702.0,,,,,,,,...,99,178.0,,,,,,,,
3,,,,,,,,,,,...,177,198.0,,,,,,,,
4,,,,,,,,,,,...,197,,,,,,,,,
5,,,,,,,,,,,...,199,,,,,,,,,
6,,,,,,,,,,,...,777,,,,,,,,,
7,,,,,,,,,,,...,797,,,,,,,,,
8,,,,,,,,,,,...,799,,,,,,,,,
9,,,,,,,,,,,...,977,,,,,,,,,


In [47]:
region_codes_df = region_codes_df.melt(var_name='region', value_name='region_code').dropna().reset_index(drop=True)
region_codes_df

Unnamed: 0,region,region_code
0,Republic of Adygea,01
1,Altai Republic,04
2,Republic of Bashkortostan,02
3,Republic of Bashkortostan,102
4,Republic of Bashkortostan,702
...,...,...
155,Khanty-Mansi Autonomous Okrug,186
156,Chukotka Autonomous Okrug,87
157,Yamalo-Nenets Autonomous Okrug,89
158,Baikonur,94


In [49]:
region_codes_df['region_code'] = region_codes_df['region_code'].astype(str)

In [51]:
region_codes_df.head(5)

Unnamed: 0,region,region_code
0,Republic of Adygea,1
1,Altai Republic,4
2,Republic of Bashkortostan,2
3,Republic of Bashkortostan,102
4,Republic of Bashkortostan,702


In [53]:
region_codes_df.dtypes

region         object
region_code    object
dtype: object

In [55]:
records = []

for (letters, (num_from, num_to), region), (description, is_forbidden, road_advantage, significance) in GOVERNMENT_CODES.items():
    records.append({
        "letters": letters,
        "number_from": num_from,
        "number_to": num_to,
        "region": region,
        "description": description,
        "is_forbidden": is_forbidden,
        "road_advantage": road_advantage,
        "significance": significance
    })

In [57]:
records

[{'letters': 'AMP',
  'number_from': 0,
  'number_to': 999,
  'region': '97',
  'description': 'Government of Russia',
  'is_forbidden': 1,
  'road_advantage': 1,
  'significance': 10},
 {'letters': 'AMP',
  'number_from': 0,
  'number_to': 999,
  'region': '77',
  'description': 'Partially Government of Russia',
  'is_forbidden': 0,
  'road_advantage': 1,
  'significance': 8},
 {'letters': 'EKX',
  'number_from': 0,
  'number_to': 999,
  'region': '77',
  'description': 'Partially Federal Protective Service (Federal Protective Service)',
  'is_forbidden': 0,
  'road_advantage': 1,
  'significance': 6},
 {'letters': 'EKX',
  'number_from': 0,
  'number_to': 999,
  'region': '97',
  'description': 'Partially Federal Protective Service (Federal Protective Service)',
  'is_forbidden': 0,
  'road_advantage': 1,
  'significance': 6},
 {'letters': 'EKX',
  'number_from': 0,
  'number_to': 999,
  'region': '99',
  'description': 'Partially Federal Protective Service (Federal Protective Servic

In [59]:
government_codes_df = pd.DataFrame(records)
government_codes_df.head()

Unnamed: 0,letters,number_from,number_to,region,description,is_forbidden,road_advantage,significance
0,AMP,0,999,97,Government of Russia,1,1,10
1,AMP,0,999,77,Partially Government of Russia,0,1,8
2,EKX,0,999,77,Partially Federal Protective Service (Federal ...,0,1,6
3,EKX,0,999,97,Partially Federal Protective Service (Federal ...,0,1,6
4,EKX,0,999,99,Partially Federal Protective Service (Federal ...,0,1,6


In [61]:
import re

def parse_plate(plate):
    match = re.match(r'([A-Z])(\d{3})([A-Z]{2})(\d{2,3})$', plate)
    if match:
        return match.groups()  # letters1, number, letters2, region_code
    return None, None, None, None

In [63]:
train[['letter1', 'number', 'letter2', 'region_code']] = (train['plate'].apply(lambda p: pd.Series(parse_plate(p))))

In [64]:
test[['letter1', 'number', 'letter2', 'region_code']] = (test['plate'].apply(lambda p: pd.Series(parse_plate(p))))

In [67]:
train.head()

Unnamed: 0,id,plate,date,price,letter1,number,letter2,region_code
0,1,X059CP797,2024-12-26 00:00:00,65000,X,59,CP,797
1,2,Y800MH790,2024-07-12 21:31:37,100000,Y,800,MH,790
2,3,A212TX77,2024-04-18 00:00:00,290000,A,212,TX,77
3,4,P001AY199,2025-01-03 00:27:15,680000,P,1,AY,199
4,5,P001AY199,2025-01-10 09:32:41,750000,P,1,AY,199


In [69]:
test.head()

Unnamed: 0,id,plate,date,price,letter1,number,letter2,region_code
0,51636,P700TT790,2025-01-27 00:00:00,,P,700,TT,790
1,51637,M081TX797,2025-02-10 00:00:00,,M,81,TX,797
2,51638,T333HX777,2025-02-11 00:00:00,,T,333,HX,777
3,51639,H744BH977,2025-02-03 00:00:00,,H,744,BH,977
4,51640,X066EM777,2025-02-12 00:00:00,,X,66,EM,777


In [71]:
train['plate_letters'] = train['letter1'] + train['letter2']

In [73]:
train['plate_number'] = pd.to_numeric(train['number'], errors='coerce')

In [75]:
train['region_code'] = train['region_code'].astype(str)

In [77]:
government_codes_df['region'] = government_codes_df['region'].astype(str)

In [79]:
merged = train.merge(government_codes_df, left_on=['plate_letters', 'region_code'], right_on=['letters', 'region'],
    how='left', suffixes=('', '_gov'))

In [81]:
merged['is_government'] = ((merged['plate_number'] >= merged['number_from']) & (merged['plate_number'] <= merged['number_to']))

In [83]:
merged['is_government'] = merged['is_government'].fillna(False)

In [85]:
train['is_government'] = merged['is_government']
train = train.drop(['letter1', 'letter2', 'number'], axis=1)
train.head()

Unnamed: 0,id,plate,date,price,region_code,plate_letters,plate_number,is_government
0,1,X059CP797,2024-12-26 00:00:00,65000,797,XCP,59,False
1,2,Y800MH790,2024-07-12 21:31:37,100000,790,YMH,800,False
2,3,A212TX77,2024-04-18 00:00:00,290000,77,ATX,212,False
3,4,P001AY199,2025-01-03 00:27:15,680000,199,PAY,1,False
4,5,P001AY199,2025-01-10 09:32:41,750000,199,PAY,1,False


In [87]:
test['plate_letters'] = test['letter1'] + test['letter2']

In [89]:
test['plate_number'] = pd.to_numeric(test['number'], errors='coerce')

In [91]:
test['region_code'] = test['region_code'].astype(str)

In [93]:
government_codes_df['region'] = government_codes_df['region'].astype(str)

In [95]:
merged = test.merge(government_codes_df, left_on=['plate_letters', 'region_code'], right_on=['letters', 'region'], how='left',
    suffixes=('', '_gov'))

In [97]:
merged['is_government'] = ((merged['plate_number'] >= merged['number_from']) & (merged['plate_number'] <= merged['number_to']))

In [99]:
merged['is_government'] = merged['is_government'].fillna(False)

In [101]:
test['is_government'] = merged['is_government']
test = test.drop(['letter1', 'letter2', 'number'], axis=1)

In [103]:
train = train.merge(region_codes_df, on='region_code', how='left')
test = test.merge(region_codes_df, on='region_code', how='left')

In [105]:
train.head()

Unnamed: 0,id,plate,date,price,region_code,plate_letters,plate_number,is_government,region
0,1,X059CP797,2024-12-26 00:00:00,65000,797,XCP,59,False,Moscow
1,2,Y800MH790,2024-07-12 21:31:37,100000,790,YMH,800,False,Moscow Oblast
2,3,A212TX77,2024-04-18 00:00:00,290000,77,ATX,212,False,Moscow
3,4,P001AY199,2025-01-03 00:27:15,680000,199,PAY,1,False,Moscow
4,5,P001AY199,2025-01-10 09:32:41,750000,199,PAY,1,False,Moscow


In [107]:
test.head()

Unnamed: 0,id,plate,date,price,region_code,plate_letters,plate_number,is_government,region
0,51636,P700TT790,2025-01-27 00:00:00,,790,PTT,700,False,Moscow Oblast
1,51637,M081TX797,2025-02-10 00:00:00,,797,MTX,81,False,Moscow
2,51638,T333HX777,2025-02-11 00:00:00,,777,THX,333,False,Moscow
3,51639,H744BH977,2025-02-03 00:00:00,,977,HBH,744,False,Moscow
4,51640,X066EM777,2025-02-12 00:00:00,,777,XEM,66,False,Moscow


In [109]:
cols = ['plate_letters', 'is_government', 'region']

for col in cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

In [111]:
all_plates = pd.concat([train['plate'], test['plate']]).astype(str)
all_plates

0       X059CP797
1       Y800MH790
2        A212TX77
3       P001AY199
4       P001AY199
          ...    
7690    X799CC799
7691    K077YT777
7692    O200PA777
7693     O073OO97
7694    A666YE790
Name: plate, Length: 59335, dtype: object

In [113]:
le = LabelEncoder()
le.fit(all_plates)
train['plate'] = le.transform(train['plate'].astype(str))
test['plate'] = le.transform(test['plate'].astype(str))

In [115]:
def date_change(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df = df.drop(['date'], axis=1)
    return df

In [117]:
train = date_change(train)
test = date_change(test)

In [119]:
train['region_code'] = train['region_code'].astype('int64')
test['region_code'] = test['region_code'].astype('int64')

In [121]:
train.head()

Unnamed: 0,id,plate,price,region_code,plate_letters,plate_number,is_government,region,year,month
0,1,44645,65000,797,1472,59,0,35,2024,12
1,2,50771,100000,790,1660,800,0,36,2024,7
2,3,2152,290000,77,118,212,0,35,2024,4
3,4,36260,680000,199,1163,1,0,35,2025,1
4,5,36260,750000,199,1163,1,0,35,2025,1


In [123]:
def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_true - y_pred)
    smape_val = np.where(denominator == 0, 0, diff / denominator)
    return np.mean(smape_val) * 100
smape_scorer = make_scorer(smape, greater_is_better=False)

In [125]:
test_ids = test['id']
y = train['price']
X = train[['plate', 'region_code', 'plate_letters', 'plate_number', 'is_government', 'region', 'year', 'month']]

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
len(X_train), len(y_train), len(X_test), len(y_test)

(41312, 41312, 10328, 10328)

In [129]:
correlation = X.assign(target=y).corr()['target'].drop('target')
print(correlation.sort_values(ascending=False))

year             0.046109
plate_number     0.025181
is_government    0.011925
region          -0.004331
month           -0.006424
plate           -0.033682
plate_letters   -0.034453
region_code     -0.115595
Name: target, dtype: float64


In [131]:
model = XGBRegressor(colsample_bytree=1.0, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=1500, reg_alpha=0, reg_lambda=2, subsample=0.8)
model.fit(X_train, y_train, verbose=False)
pred = model.predict(X_test)
smape(y_test, pred)

62.876972203533285

In [132]:
test_sub = test[['plate', 'region_code', 'plate_letters', 'plate_number', 'is_government', 'region', 'year', 'month']]

In [135]:
submission_predictions = model.predict(test_sub)

In [137]:
submission = pd.DataFrame({'id': test_ids.values, 'price': submission_predictions})
submission.head(5)

Unnamed: 0,id,price
0,51636,68651.117188
1,51637,100743.0625
2,51638,496677.65625
3,51639,69703.835938
4,51640,479860.15625


In [139]:
submission.to_csv('Solution.csv', index = False)