# Seed

In [None]:
import random
import numpy as np


# One seed shared by every stochastic step in the notebook.
seed = 42

# Seed the legacy NumPy global generator and the stdlib generator.
np.random.seed(seed)
random.seed(seed)

# Data

## Data Info

In [None]:
import pandas as pd


# Read the training set from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/train.csv')
data.head(5)

Unnamed: 0,id,ticket_id,ticket_type_nm,entrance_id,entrance_nm,station_id,station_nm,line_id,line_nm,pass_dttm,time_to_under,label
0,1,40BD89EC85646EFB69E283F39C298E60,Пропуск FacePay,2402,Лефортово БКЛ,11007,Лефортово,11,Большая кольцевая,2022-09-12 05:00:13,216.316667,8001
1,2,126727A96489CC976A8C08E5CEB00542,СК учащегося 30 дней,110,Войковская ( Южный ),2006,Войковская,2,Замоскворецкая,2022-09-12 05:00:54,648.183333,9011
2,3,D28CE6A9E0E5B6D213470A97CFF32485,БСК дружинника г.Москвы,110,Войковская ( Южный ),2006,Войковская,2,Замоскворецкая,2022-09-12 05:00:55,865.333333,7022
3,4,015DA44B523C062B5BFEFF3FB0E64B9E,30 дней,110,Войковская ( Южный ),2006,Войковская,2,Замоскворецкая,2022-09-12 05:01:13,1048.233333,2022
4,5,95B19C6F3A504727AC3EA56EB7E3E80F,КОШЕЛЕК,110,Войковская ( Южный ),2006,Войковская,2,Замоскворецкая,2022-09-12 05:02:55,965.6,2017


In [None]:
# Summarise column dtypes, non-null counts and memory usage of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091021 entries, 0 to 1091020
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   id              1091021 non-null  int64  
 1   ticket_id       1091021 non-null  object 
 2   ticket_type_nm  1091021 non-null  object 
 3   entrance_id     1091021 non-null  int64  
 4   entrance_nm     1091021 non-null  object 
 5   station_id      1091021 non-null  int64  
 6   station_nm      1091021 non-null  object 
 7   line_id         1091021 non-null  int64  
 8   line_nm         1091021 non-null  object 
 9   pass_dttm       1091021 non-null  object 
 10  time_to_under   1091021 non-null  float64
 11  label           1091021 non-null  int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 99.9+ MB


In [None]:
# Frequency of each target class in `label`, most frequent first.
data['label'].value_counts()

3002     17272
6008     15541
1022     13758
5010     13122
7022     13080
         ...  
13002       24
13003       23
13006       17
13005       14
13004        7
Name: label, Length: 276, dtype: int64

## Data Update

In [None]:
from datetime import datetime
from datetime import timedelta
import numpy as np


def dttm_weekday(dttm):
    """Return the day of the week for each timestamp string.

    Parameters
    ----------
    dttm : array-like of str
        Timestamps formatted as ``'%Y-%m-%d %H:%M:%S'``.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per timestamp; Monday is 0, Sunday is 6.

    Raises
    ------
    ValueError
        If an entry cannot be parsed with the expected format.
    """
    # Parse the whole column in one vectorized call instead of running
    # ``datetime.strptime`` once per row in a Python loop.
    parsed = pd.to_datetime(pd.Index(dttm, dtype=object), format='%Y-%m-%d %H:%M:%S')
    return parsed.weekday.to_numpy().astype(int)


def dttm_seconds(dttm):
    """Return the number of seconds since midnight for each timestamp string.

    Parameters
    ----------
    dttm : array-like of str
        Timestamps formatted as ``'%Y-%m-%d %H:%M:%S'``. The whole string is
        parsed, so the date part must be valid as well.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per timestamp, each in ``[0, 86399]``.

    Raises
    ------
    ValueError
        If an entry cannot be parsed with the expected format.
    """
    # One vectorized parse replaces the per-row ``strptime`` + ``timedelta``
    # round trip.
    parsed = pd.to_datetime(pd.Index(dttm, dtype=object), format='%Y-%m-%d %H:%M:%S')
    since_midnight = parsed.hour * 3600 + parsed.minute * 60 + parsed.second
    return since_midnight.to_numpy().astype(int)


def rush_hour(dttm, morning=(7, 10), evening=(17, 20)):
    """Flag timestamps whose hour falls inside a rush-hour window.

    Parameters
    ----------
    dttm : array-like of str
        Timestamps formatted as ``'%Y-%m-%d %H:%M:%S'``. The whole string is
        parsed, so the date part must be valid as well.
    morning : tuple of (int, int), optional
        Inclusive ``(first_hour, last_hour)`` of the morning window.
        Defaults to ``(7, 10)``.
    evening : tuple of (int, int), optional
        Inclusive ``(first_hour, last_hour)`` of the evening window.
        Defaults to ``(17, 20)``.

    Returns
    -------
    numpy.ndarray
        Integer array with 1 where the hour lies in either window, else 0.

    Raises
    ------
    ValueError
        If an entry cannot be parsed with the expected format.
    """
    # Vectorized parse; only the hour component is needed for the comparison.
    parsed = pd.to_datetime(pd.Index(dttm, dtype=object), format='%Y-%m-%d %H:%M:%S')
    hours = parsed.hour.to_numpy()
    in_morning = (hours >= morning[0]) & (hours <= morning[1])
    in_evening = (hours >= evening[0]) & (hours <= evening[1])
    return (in_morning | in_evening).astype(int)


def dttm_am_pm(dttm):
    """Flag timestamps that fall in the second half of the day.

    Parameters
    ----------
    dttm : array-like of str
        Timestamps formatted as ``'%Y-%m-%d %H:%M:%S'``. The whole string is
        parsed, so the date part must be valid as well.

    Returns
    -------
    numpy.ndarray
        Integer array with 1 where the hour is 12 or later (PM), else 0 (AM).

    Raises
    ------
    ValueError
        If an entry cannot be parsed with the expected format.
    """
    # Vectorized parse instead of one ``strptime`` call per row.
    parsed = pd.to_datetime(pd.Index(dttm, dtype=object), format='%Y-%m-%d %H:%M:%S')
    is_pm = parsed.hour.to_numpy() >= 12
    return is_pm.astype(int)

In [None]:
from sklearn.preprocessing import LabelEncoder


# Encoder for the ticket-type names; it is fitted here on the training data.
label_encoder = LabelEncoder()

# Build every engineered column in one chain, then drop the identifier/name
# columns and the raw timestamp the time features were derived from.
data = (
    data
    .assign(
        ticket_type_nm=label_encoder.fit_transform(data['ticket_type_nm']),
        weekday=lambda frame: dttm_weekday(frame['pass_dttm']),
        seconds=lambda frame: dttm_seconds(frame['pass_dttm']),
        rush_hour=lambda frame: rush_hour(frame['pass_dttm']),
        am_pm=lambda frame: dttm_am_pm(frame['pass_dttm']),
    )
    .drop(columns=['id', 'ticket_id', 'entrance_nm', 'station_nm', 'line_nm', 'pass_dttm'])
)

In [None]:
# Preview the frame after encoding and feature engineering.
data.head()

Unnamed: 0,ticket_type_nm,entrance_id,station_id,line_id,time_to_under,label,weekday,seconds,rush_hour,am_pm
0,40,2402,11007,11,216.316667,8001,0,18013,0,0
1,53,110,2006,2,648.183333,9011,0,18054,0,0
2,26,110,2006,2,865.333333,7022,0,18055,0,0
3,3,110,2006,2,1048.233333,2022,0,18073,0,0
4,38,110,2006,2,965.6,2017,0,18175,0,0


# Regression 

## Dataset

In [None]:
from sklearn.model_selection import train_test_split


# 80/20 shuffled split for the regression task: every column except the two
# targets is a feature, `time_to_under` is the target.
X_train_rg, X_valid_rg, y_train_rg, y_valid_rg = train_test_split(
    data.drop(columns=['time_to_under', 'label']).to_numpy(),
    data['time_to_under'].to_numpy(),
    test_size=0.2,
    shuffle=True,
    random_state=seed,
)

## Train

In [None]:
from sklearn.metrics import r2_score


def score(model, X_valid, y_valid):
    """Print the R^2 score of a fitted regressor on a validation set.

    Parameters
    ----------
    model : object with a ``predict`` method
    X_valid : validation features passed to ``model.predict``
    y_valid : true target values

    Notes
    -----
    A later cell defines another function named ``score`` for the
    classification task; once that cell has run, this version is shadowed.
    """
    r2 = r2_score(y_valid, model.predict(X_valid))
    print('r2_score:', r2)

In [None]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor


# Three gradient-boosting regressors sharing the notebook seed; CatBoost is
# configured to train on the GPU.
rm_1, rm_2, rm_3 = (
    LGBMRegressor(random_state=seed),
    XGBRegressor(random_state=seed),
    CatBoostRegressor(random_seed=seed, task_type="GPU"),
)

# Combine them in a VotingRegressor ensemble, fit it, and report the
# validation R^2.
reg_model = VotingRegressor(estimators=[('lgb', rm_1), ('xgb', rm_2), ('cbr', rm_3)])
reg_model.fit(X_train_rg, y_train_rg)
score(reg_model, X_valid_rg, y_valid_rg)

    r2_score: 0.5375358951455785

# Classification

## Dataset

In [None]:
# from imblearn.over_sampling import SMOTE


# smote = SMOTE(random_state=seed)
# X_clf = data.drop(columns=['time_to_under', 'label'])
# y_clf = data['label']
# X_resample, y_resample = smote.fit_resample(X_clf, y_clf)
# X_resample.reset_index(drop=True, inplace=True)
# y_resample.reset_index(drop=True, inplace=True)
# data_resample = pd.concat([X_resample, y_resample], axis=1)

In [None]:
#data_clf = data_resample.sample(1000000, random_state=seed)
#data_clf.hist(bins = 50, figsize = (20,20))

In [None]:
from sklearn.model_selection import train_test_split


# 80/20 shuffled split for the classification task, stratified on `label` so
# both parts keep the same class proportions.
X_train_clf, X_valid_clf, y_train_clf, y_valid_clf = train_test_split(
    data.drop(columns=['time_to_under', 'label']).to_numpy(),
    data['label'].to_numpy(),
    test_size=0.2,
    shuffle=True,
    stratify=data['label'].to_numpy(),
    random_state=seed,
)

## Train

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score, precision_score


def score(model, X_valid, y_valid):
    """Print a per-class report and the macro-averaged recall of a classifier.

    Parameters
    ----------
    model : object with a ``predict`` method
    X_valid : validation features passed to ``model.predict``
    y_valid : true class labels

    Notes
    -----
    This re-uses the name ``score`` from the regression section, so the
    earlier regression version is no longer reachable after this cell runs.
    """
    preds = model.predict(X_valid)
    # Use the same zero_division policy as the recall below, so classes that
    # are never predicted are reported as 0 without a warning per class.
    print(classification_report(y_valid, preds, zero_division=0))
    print('recall: ', recall_score(y_valid, preds, average="macro", zero_division=0))

In [None]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier


#cm_1 = XGBClassifier(objective='multi:softmax', verbose=2, tree_method='gpu_hist')
#cm_2 = CatBoostClassifier(loss_function='MultiClass', task_type="GPU")
#clf_model = VotingClassifier(estimators=[('lgb', cm_1), ('cbc', cm_2)], voting='soft')
# Single CatBoost multiclass model trained on the GPU with the notebook seed.
clf_model = CatBoostClassifier(
    iterations=1024,
    loss_function='MultiClass',
    task_type="GPU",
    random_state=seed,
)
clf_model.fit(X_train_clf, y_train_clf)

# Print the validation report and macro recall.
score(clf_model, X_valid_clf, y_valid_clf)

    recall:  0.08138790187011342

## Save

In [None]:
# Write the fitted classifier to the file 'clf_model' using the "cbm" format.
clf_model.save_model('clf_model', format="cbm")

# Test

In [None]:
test_data = pd.read_csv('data/test.csv')

# Keep the ids for the submission file before the column is dropped.
ids = test_data['id']

# Reuse the encoding learned on the training data. Calling `fit_transform`
# here would re-fit the encoder on the test categories, so a ticket type could
# get a different integer code than the one the models were trained on.
ticket_type_codes = {name: code for code, name in enumerate(label_encoder.classes_)}
# Ticket types that never appeared in training have no code; mark them with -1
# instead of failing.
test_data['ticket_type_nm'] = (
    test_data['ticket_type_nm'].map(ticket_type_codes).fillna(-1).astype(int)
)

# Apply the same column drops and time features as for the training frame.
test_data = test_data.drop(columns=['id', 'ticket_id', 'entrance_nm', 'station_nm', 'line_nm'])
test_data['weekday'] = dttm_weekday(test_data['pass_dttm'])
test_data['seconds'] = dttm_seconds(test_data['pass_dttm'])
test_data['rush_hour'] = rush_hour(test_data['pass_dttm'])
test_data['am_pm'] = dttm_am_pm(test_data['pass_dttm'])
test_data = test_data.drop(columns=['pass_dttm'])

In [None]:
import torch

X_test = np.array(test_data)

# Flatten both prediction arrays to 1-D with NumPy; wrapping them in torch
# tensors just to reshape them is unnecessary.
# NOTE(review): the classifier's `predict` presumably returns a column vector
# for the multiclass model, which is why the reshape is kept - confirm.
preds_time = np.asarray(reg_model.predict(X_test)).reshape(-1)
preds_label = np.asarray(clf_model.predict(X_test)).reshape(-1)

# Assemble the submission and write it without the index column.
result = pd.DataFrame({'id': ids, 'time_to_under': preds_time, 'label': preds_label})
result.to_csv('solution.csv', index=False)
result

Unnamed: 0,id,time_to_under,label
0,1091022,628.811748,9010
1,1091023,617.415113,1019
2,1091024,607.316509,2017
3,1091025,629.111442,9010
4,1091026,628.811748,9010
...,...,...,...
561885,1652907,534.374803,8008
561886,1652908,563.619449,7004
561887,1652909,513.640514,6002
561888,1652910,526.018571,9010
