In [64]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import FeatureUnion
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
# Both splits share the same layout: the first CSV column is the row index and
# the two date columns are parsed into datetimes on load.
_csv_kwargs = dict(index_col=0, parse_dates=['birthday', 'date'])
train = pd.read_csv('train.csv', **_csv_kwargs)
test = pd.read_csv('test.csv', **_csv_kwargs)

In [26]:
# Replace missing text with an empty string in both splits so that every row
# carries a string in the text column.
for frame in (train, test):
    frame['service_lemm'] = frame['service_lemm'].fillna('')

In [27]:
from ast import literal_eval


def _parse_target(value):
    """Parse a stringified Python literal into the object it denotes.

    Only ``str`` values are handed to ``literal_eval``; every other value
    (missing-value markers, or values that were already parsed) is returned
    unchanged. Checking the type instead of comparing identity with ``np.nan``
    means any NaN object is skipped, and re-running this cell is a no-op
    instead of an error.

    Raises whatever ``literal_eval`` raises if a string is not a valid literal.
    """
    return literal_eval(value) if isinstance(value, str) else value


train['target'] = train['target'].apply(_parse_target)
test['target'] = test['target'].apply(_parse_target)

In [28]:
# Keep only the training rows whose target is present.
train_sl = train.loc[train['target'].notna()]

In [29]:
# (rows, columns) of the training subset that has a target.
train_sl.shape

(63646, 12)

In [30]:
from itertools import chain

# Flatten the per-row label collections of both splits (training rows first,
# then test rows) into one list and keep the sorted distinct labels.
_label_collections = chain(train_sl['target'], test['target'])
classes = np.unique(list(chain.from_iterable(_label_collections)))

In [31]:
# Show the distinct labels collected from the train and test targets.
classes

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'Z'], dtype='<U1')

In [32]:
from sklearn.preprocessing import MultiLabelBinarizer

# One indicator column per entry of `classes`, in that order; the same fitted
# binarizer encodes both splits so their columns line up.
mb = MultiLabelBinarizer(classes=classes)
mb.fit(train_sl['target'])
y_train = mb.transform(train_sl['target'])
y_test = mb.transform(test['target'])

In [33]:
# Feature columns grouped by the preprocessing branch that consumes them.
num_cols = [
    'num',
    'birthday_year',
]
cat_cols = [
    'service_class',
]
text_cols = [
    'service_lemm',
]

In [34]:
# Column order is numeric, then categorical, then text; the positional
# indices computed from X_train.columns depend on this order.
X_cols = [*num_cols, *cat_cols, *text_cols]
X_train = train_sl.loc[:, X_cols]
X_test = test.loc[:, X_cols]

In [35]:
# Positions of each feature group inside X_train's column order, as 1-D
# integer arrays.
_feature_columns = X_train.columns
num_indices = np.flatnonzero(_feature_columns.isin(num_cols))
cat_indices = np.flatnonzero(_feature_columns.isin(cat_cols))
text_indices = np.flatnonzero(_feature_columns.isin(text_cols))

In [36]:
# Preview the first rows of the selected feature columns.
X_train.head()

Unnamed: 0,num,birthday_year,service_class,service_lemm
129272,1,1968,4111,экг 12 ти отведение снятие расшифровка
51064,1,1989,3100,общий клинический анализ кровь
137191,1,1985,1810,забор материал рост флора чувствительность ант...
82671,1,2017,3100,общеклинический исследование мочь микроскопия
68484,1,1979,4500,эзофагоскопия лечебный диагностический т ч био...


In [74]:
def test_model(pl):
    """Print classification reports for a fitted pipeline on train and test.

    Predictions are made on the notebook-level ``X_train`` and ``X_test``
    frames (passed as their ``.values`` arrays) and compared with ``y_train``
    and ``y_test``. Report rows are named after ``mb.classes_`` and
    ``zero_division=0`` is passed to ``classification_report``.

    Parameters
    ----------
    pl : fitted estimator exposing ``predict``.

    Returns
    -------
    None. The two reports are written to stdout.
    """
    # Both predictions are computed before anything is printed.
    train_pred = pl.predict(X_train.values)
    test_pred = pl.predict(X_test.values)

    sections = (
        ('Train:\n', y_train, train_pred),
        ('\n\nTest:\n', y_test, test_pred),
    )
    for heading, y_true, y_hat in sections:
        print(heading)
        print(classification_report(y_true, y_hat, zero_division=0, target_names=mb.classes_))

# KNN

In [75]:
# KNN pipeline. Three preprocessing branches are concatenated by FeatureUnion,
# reduced to 100 principal components, and fed to a k-nearest-neighbours
# classifier. The `select` steps slice a raw NumPy array by position (the
# pipeline is fitted on `X_train.values`), using num_indices / cat_indices /
# text_indices.
pl = Pipeline([
    ('preprocess', FeatureUnion([
        # Numeric columns: select, then standardise.
        ('num', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, num_indices], validate=False)),
            ('scaler', StandardScaler())
        ])),
        # Categorical column: select, then dense one-hot encoding; categories
        # unseen during fit are ignored at transform time.
        # NOTE(review): newer scikit-learn releases rename `sparse` to
        # `sparse_output` - confirm against the installed version.
        ('cat', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, cat_indices], validate=False)),
            ('onehot', OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'))
        ])),
        # Text column: indexing with the scalar text_indices[0] yields a 1-D
        # array of documents for the vectorizer; the TF-IDF matrix is then
        # converted to a dense array.
        ('text', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, text_indices[0]], validate=False)),
            ('tfidf', TfidfVectorizer(min_df=50)),
            ('toarray', FunctionTransformer(lambda X: X.toarray(), validate=False, accept_sparse=True))
        ]))
    ])),
    # Seeded so the projection is reproducible if PCA selects its randomized
    # SVD solver for this input size.
    ('pca', PCA(100, random_state=42)),
    ('model', KNeighborsClassifier(n_jobs=-1))
])

In [76]:
%%time
# Fit the whole pipeline on the raw training array; the cell magic above
# reports how long the fit takes.
pl = pl.fit(X_train.values, y_train)

CPU times: user 20.9 s, sys: 857 ms, total: 21.8 s
Wall time: 7.26 s


In [77]:
# Print train and test classification reports for the fitted KNN pipeline.
test_model(pl)

Train:

              precision    recall  f1-score   support

           A       0.60      0.03      0.06       279
           B       0.73      0.18      0.28       617
           C       0.00      0.00      0.00        26
           D       0.65      0.15      0.24      1930
           E       0.89      0.64      0.74      3966
           F       0.86      0.08      0.15        71
           G       0.71      0.33      0.45      1947
           H       0.91      0.83      0.87      4056
           I       0.76      0.54      0.63      5388
           J       0.78      0.67      0.72      8583
           K       0.90      0.77      0.83     12368
           L       0.72      0.44      0.55      1732
           M       0.87      0.77      0.82     10557
           N       0.86      0.80      0.83     10739
           O       0.00      0.00      0.00         6
           P       0.00      0.00      0.00         0
           Q       0.00      0.00      0.00        39
           R       

# Random Forest

In [78]:
# Random Forest on PCA-reduced features. Three preprocessing branches are
# concatenated by FeatureUnion, reduced to 100 principal components, and fed
# to a random forest. The `select` steps slice a raw NumPy array by position
# (the pipeline is fitted on `X_train.values`), using num_indices /
# cat_indices / text_indices.
pl = Pipeline([
    ('preprocess', FeatureUnion([
        # Numeric columns: select, then standardise.
        ('num', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, num_indices], validate=False)),
            ('scaler', StandardScaler())
        ])),
        # Categorical column: select, then dense one-hot encoding; categories
        # unseen during fit are ignored at transform time.
        # NOTE(review): newer scikit-learn releases rename `sparse` to
        # `sparse_output` - confirm against the installed version.
        ('cat', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, cat_indices], validate=False)),
            ('onehot', OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'))
        ])),
        # Text column: indexing with the scalar text_indices[0] yields a 1-D
        # array of documents for the vectorizer; the TF-IDF matrix is then
        # converted to a dense array.
        ('text', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, text_indices[0]], validate=False)),
            ('tfidf', TfidfVectorizer(min_df=50)),
            ('toarray', FunctionTransformer(lambda X: X.toarray(), validate=False, accept_sparse=True))
        ]))
    ])),
    # Both stochastic steps are seeded so repeated runs give the same model
    # and the same reported metrics.
    ('pca', PCA(100, random_state=42)),
    ('model', RandomForestClassifier(n_jobs=-1, random_state=42))
])

In [79]:
%%time
# Fit the whole pipeline on the raw training array; the cell magic above
# reports how long the fit takes.
pl = pl.fit(X_train.values, y_train)

CPU times: user 7min 37s, sys: 22.7 s, total: 8min
Wall time: 1min 26s


In [80]:
# Print train and test classification reports for the fitted Random Forest + PCA pipeline.
test_model(pl)

Train:

              precision    recall  f1-score   support

           A       1.00      0.08      0.15       279
           B       0.98      0.26      0.41       617
           C       1.00      0.12      0.21        26
           D       0.99      0.18      0.31      1930
           E       0.94      0.64      0.76      3966
           F       1.00      0.23      0.37        71
           G       0.94      0.26      0.41      1947
           H       0.96      0.83      0.89      4056
           I       0.89      0.53      0.66      5388
           J       0.88      0.64      0.74      8583
           K       0.95      0.78      0.86     12368
           L       0.87      0.47      0.61      1732
           M       0.89      0.79      0.84     10557
           N       0.91      0.82      0.86     10739
           O       0.00      0.00      0.00         6
           P       0.00      0.00      0.00         0
           Q       1.00      0.10      0.19        39
           R       

# Random Forest (no PCA)

In [81]:
# Random Forest on the full preprocessed feature matrix (no PCA step). Three
# preprocessing branches are concatenated by FeatureUnion and fed straight to
# a random forest. The `select` steps slice a raw NumPy array by position
# (the pipeline is fitted on `X_train.values`), using num_indices /
# cat_indices / text_indices.
pl = Pipeline([
    ('preprocess', FeatureUnion([
        # Numeric columns: select, then standardise.
        ('num', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, num_indices], validate=False)),
            ('scaler', StandardScaler())
        ])),
        # Categorical column: select, then dense one-hot encoding; categories
        # unseen during fit are ignored at transform time.
        # NOTE(review): newer scikit-learn releases rename `sparse` to
        # `sparse_output` - confirm against the installed version.
        ('cat', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, cat_indices], validate=False)),
            ('onehot', OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'))
        ])),
        # Text column: indexing with the scalar text_indices[0] yields a 1-D
        # array of documents for the vectorizer; the TF-IDF matrix is then
        # converted to a dense array.
        ('text', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, text_indices[0]], validate=False)),
            ('tfidf', TfidfVectorizer(min_df=50)),
            ('toarray', FunctionTransformer(lambda X: X.toarray(), validate=False, accept_sparse=True))
        ]))
    ])),
    # Seeded so repeated runs grow the same forest and report the same metrics.
    ('model', RandomForestClassifier(n_jobs=-1, random_state=42))
])

In [82]:
%%time
# Fit the whole pipeline on the raw training array; the cell magic above
# reports how long the fit takes.
pl = pl.fit(X_train.values, y_train)

CPU times: user 4min 14s, sys: 21.8 s, total: 4min 36s
Wall time: 50.2 s


In [83]:
# Print train and test classification reports for the fitted Random Forest (no PCA) pipeline.
test_model(pl)

Train:

              precision    recall  f1-score   support

           A       0.98      0.44      0.61       279
           B       0.94      0.58      0.71       617
           C       1.00      0.65      0.79        26
           D       0.92      0.53      0.68      1930
           E       0.93      0.77      0.84      3966
           F       0.98      0.73      0.84        71
           G       0.89      0.58      0.70      1947
           H       0.97      0.91      0.94      4056
           I       0.90      0.73      0.81      5388
           J       0.88      0.79      0.83      8583
           K       0.95      0.87      0.91     12368
           L       0.91      0.70      0.79      1732
           M       0.93      0.87      0.90     10557
           N       0.93      0.89      0.91     10739
           O       1.00      0.67      0.80         6
           P       0.00      0.00      0.00         0
           Q       1.00      0.44      0.61        39
           R       