In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import FeatureUnion
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read both splits with identical options: the first CSV column becomes the
# index, and `birthday` / `date` are parsed as datetimes.
train, test = (
    pd.read_csv(csv_path, index_col=0, parse_dates=['birthday', 'date'])
    for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Missing `service_lemm` values become empty strings in both splits, so the
# column holds only text when it is later passed to TfidfVectorizer.
for split in (train, test):
    split['service_lemm'] = split['service_lemm'].fillna('')

In [4]:
from ast import literal_eval

def _parse_target(raw):
    """Convert the string form of a target value into the Python object it encodes.

    Only strings are parsed with ``literal_eval``. Anything else (a missing
    value, or a value that was already parsed) is returned unchanged, so this
    cell can be re-run without raising.
    """
    return literal_eval(raw) if isinstance(raw, str) else raw


# A type check replaces `x is not np.nan`: that test relied on object identity
# with the np.nan singleton, which a NaN created elsewhere need not satisfy.
train['target'] = train['target'].apply(_parse_target)
test['target'] = test['target'].apply(_parse_target)

In [5]:
# Labelled subset: keep only the training rows whose `target` is present.
train_sl = train[train['target'].notna()]

In [6]:
# (rows, columns) of the labelled training subset.
train_sl.shape

(63646, 12)

In [7]:
from itertools import chain
# Every distinct label that occurs in the labelled training rows or the test
# rows; np.unique returns them sorted.
classes = np.unique([
    label
    for labels in chain(train_sl['target'], test['target'])
    for label in labels
])

In [8]:
# Show the sorted array of distinct labels collected above.
classes

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'Z'], dtype='<U1')

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each row's label collection as a multi-hot indicator row. The column
# order is fixed by `classes`, so both splits share the same layout.
mb = MultiLabelBinarizer(classes=classes).fit(train_sl['target'])
y_train = mb.transform(train_sl['target'])
y_test = mb.transform(test['target'])

In [10]:
# Feature columns, grouped by the preprocessing branch that consumes them.
num_cols = ["num", "birthday_year"]   # numeric -> StandardScaler
cat_cols = ["service_class"]          # categorical -> OneHotEncoder
text_cols = ["service_lemm"]          # text -> TfidfVectorizer

In [28]:
# Model inputs: numeric, categorical and text columns, in that order, taken
# from the labelled training rows and from the test rows.
X_cols = [*num_cols, *cat_cols, *text_cols]
X_train, X_test = (split.loc[:, X_cols] for split in (train_sl, test))

In [12]:
# Integer positions of each feature group among X_train's columns. The
# pipeline selectors use them to slice the raw `.values` array.
num_indices = np.flatnonzero(X_train.columns.isin(num_cols))
cat_indices = np.flatnonzero(X_train.columns.isin(cat_cols))
text_indices = np.flatnonzero(X_train.columns.isin(text_cols))

In [13]:
# Preview the first rows of the training feature frame.
X_train.head()

Unnamed: 0,num,birthday_year,service_class,service_lemm
129272,1,1968,4111,экг 12 ти отведение снятие расшифровка
51064,1,1989,3100,общий клинический анализ кровь
137191,1,1985,1810,забор материал рост флора чувствительность ант...
82671,1,2017,3100,общеклинический исследование мочь микроскопия
68484,1,1979,4500,эзофагоскопия лечебный диагностический т ч био...


In [14]:
def test_model(pl):
    """Print a per-class classification report for a fitted pipeline.

    Predictions are made on the notebook-level `X_test` and compared with
    `y_test`; report rows are named after `mb.classes_`. Metrics that would
    divide by zero are reported as 0.

    Parameters
    ----------
    pl : fitted estimator exposing ``predict``.
    """
    report = classification_report(
        y_test,
        pl.predict(X_test.values),
        target_names=mb.classes_,
        zero_division=0,
    )
    print(report)

# Кластеризация

In [15]:
from sklearn.cluster import KMeans

In [16]:
# Preprocessing used to build the matrix that KMeans clusters. Each branch
# slices its columns out of the raw array it receives and encodes them;
# FeatureUnion concatenates the three results column-wise.
#
# The selector lambdas bind the index arrays as default arguments, which
# freezes them at definition time. A plain closure would look the globals up
# on every call, and `num_indices` / `cat_indices` / `text_indices` are
# recomputed in a later cell once the `cluster` column exists, which would
# silently change what this already-fitted union selects.
#
# NOTE(review): `OneHotEncoder(sparse=...)` was renamed `sparse_output` in
# scikit-learn 1.2 — confirm the pinned version before upgrading.
fu = FeatureUnion([
    ('num', Pipeline([
        ('select', FunctionTransformer(lambda X, idx=num_indices: X[:, idx], validate=False)),
        ('scaler', StandardScaler())
    ])),
    ('cat', Pipeline([
        ('select', FunctionTransformer(lambda X, idx=cat_indices: X[:, idx], validate=False)),
        # Categories unseen during fit are encoded as all-zero rows.
        ('onehot', OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'))
    ])),
    ('text', Pipeline([
        # A scalar column index yields a 1-D array of documents for TF-IDF.
        ('select', FunctionTransformer(lambda X, idx=text_indices[0]: X[:, idx], validate=False)),
        ('tfidf', TfidfVectorizer(min_df=50)),
        # Convert the sparse TF-IDF matrix to a dense array.
        ('toarray', FunctionTransformer(lambda X: X.toarray(), validate=False, accept_sparse=True))
    ]))
])

In [21]:
# Fit the feature union on the training rows only, then apply the fitted
# transformers to the test rows.
train_values, test_values = X_train.values, X_test.values
X_train_transformed = fu.fit_transform(train_values)
X_test_transformed = fu.transform(test_values)

In [18]:
# 20 clusters with a fixed seed. `n_jobs` is no longer passed: it was
# deprecated in scikit-learn 0.23 and removed in 1.0, where passing it raises
# TypeError. `n_init=10` pins the former default so the number of restarts
# does not depend on the installed version.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)

In [19]:
# Fit KMeans on the transformed training matrix and assign a cluster id to
# every training and test row. The argument was previously misspelled
# `X_train_trasnformed`, a name that is never defined and raises NameError on
# a fresh kernel.
cluster_train = kmeans.fit_predict(X_train_transformed)
cluster_test = kmeans.predict(X_test_transformed)

In [30]:
# Attach each row's KMeans cluster id as a new `cluster` column.
for features, cluster_ids in ((X_train, cluster_train), (X_test, cluster_test)):
    features['cluster'] = cluster_ids

In [34]:
# Feature groups for the classifiers: as before, plus the KMeans `cluster`
# id, which is treated as a categorical feature.
num_cols = ["num", "birthday_year"]
cat_cols = ["service_class", "cluster"]
text_cols = ["service_lemm"]

In [35]:
# Recompute the column positions now that X_train also has the `cluster`
# column and `cat_cols` includes it.
num_indices = np.flatnonzero(X_train.columns.isin(num_cols))
cat_indices = np.flatnonzero(X_train.columns.isin(cat_cols))
text_indices = np.flatnonzero(X_train.columns.isin(text_cols))

# KNN

In [37]:
# KNN pipeline: per-group preprocessing -> PCA to 100 components -> KNN.
#
# Changes for reproducibility and robustness:
# * PCA gets a fixed `random_state`. Without it, scikit-learn may select the
#   randomized SVD solver for an input of this size, so the projection (and
#   the reported scores) can differ between runs.
# * The selector lambdas bind the index arrays as default arguments, so the
#   pipeline keeps selecting the same columns even if the globals
#   `num_indices` / `cat_indices` / `text_indices` are reassigned later.
pl = Pipeline([
    ('preprocess', FeatureUnion([
        ('num', Pipeline([
            ('select', FunctionTransformer(lambda X, idx=num_indices: X[:, idx], validate=False)),
            ('scaler', StandardScaler())
        ])),
        ('cat', Pipeline([
            ('select', FunctionTransformer(lambda X, idx=cat_indices: X[:, idx], validate=False)),
            # Categories unseen during fit are encoded as all-zero rows.
            ('onehot', OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'))
        ])),
        ('text', Pipeline([
            # A scalar column index yields a 1-D array of documents for TF-IDF.
            ('select', FunctionTransformer(lambda X, idx=text_indices[0]: X[:, idx], validate=False)),
            ('tfidf', TfidfVectorizer(min_df=50)),
            # Convert the sparse TF-IDF matrix to a dense array before PCA.
            ('toarray', FunctionTransformer(lambda X: X.toarray(), validate=False, accept_sparse=True))
        ]))
    ])),
    ('pca', PCA(100, random_state=42)),
    ('model', KNeighborsClassifier(n_jobs=-1))
])

In [38]:
%%time
# Fit every pipeline step on the training rows; `fit` returns the pipeline
# itself, so `pl` keeps referring to the same (now fitted) object.
pl = pl.fit(X_train.values, y_train)

CPU times: user 9.78 s, sys: 445 ms, total: 10.2 s
Wall time: 2.57 s


In [39]:
# Print the per-class report for the fitted KNN pipeline on the test rows.
test_model(pl)

              precision    recall  f1-score   support

           A       1.00      0.03      0.05       119
           B       0.34      0.09      0.14       254
           C       0.00      0.00      0.00        15
           D       0.32      0.06      0.10       844
           E       0.85      0.60      0.71      1737
           F       1.00      0.10      0.18        30
           G       0.51      0.21      0.30       811
           H       0.86      0.80      0.83      1702
           I       0.65      0.44      0.52      2391
           J       0.69      0.57      0.62      3641
           K       0.85      0.74      0.79      5363
           L       0.58      0.30      0.40       758
           M       0.79      0.73      0.76      4427
           N       0.82      0.76      0.79      4486
           O       0.00      0.00      0.00         2
           P       0.00      0.00      0.00         1
           Q       0.00      0.00      0.00        14
           R       0.53    

# Random Forest

In [40]:
# Random Forest pipeline: per-group preprocessing -> PCA to 100 components ->
# Random Forest.
#
# Changes for reproducibility and robustness:
# * PCA and RandomForestClassifier both get a fixed `random_state`. The
#   forest's bootstrap and feature sampling are random, and PCA may use the
#   randomized SVD solver, so unseeded runs give different scores each time.
# * The selector lambdas bind the index arrays as default arguments, so the
#   pipeline keeps selecting the same columns even if the globals
#   `num_indices` / `cat_indices` / `text_indices` are reassigned later.
pl = Pipeline([
    ('preprocess', FeatureUnion([
        ('num', Pipeline([
            ('select', FunctionTransformer(lambda X, idx=num_indices: X[:, idx], validate=False)),
            ('scaler', StandardScaler())
        ])),
        ('cat', Pipeline([
            ('select', FunctionTransformer(lambda X, idx=cat_indices: X[:, idx], validate=False)),
            # Categories unseen during fit are encoded as all-zero rows.
            ('onehot', OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'))
        ])),
        ('text', Pipeline([
            # A scalar column index yields a 1-D array of documents for TF-IDF.
            ('select', FunctionTransformer(lambda X, idx=text_indices[0]: X[:, idx], validate=False)),
            ('tfidf', TfidfVectorizer(min_df=50)),
            # Convert the sparse TF-IDF matrix to a dense array before PCA.
            ('toarray', FunctionTransformer(lambda X: X.toarray(), validate=False, accept_sparse=True))
        ]))
    ])),
    ('pca', PCA(100, random_state=42)),
    ('model', RandomForestClassifier(n_jobs=-1, random_state=42))
])

In [41]:
%%time
# Fit every pipeline step on the training rows; `fit` returns the pipeline
# itself, so `pl` keeps referring to the same (now fitted) object.
pl = pl.fit(X_train.values, y_train)

CPU times: user 6min 54s, sys: 31.9 s, total: 7min 26s
Wall time: 39.7 s


In [42]:
# Print the per-class report for the fitted Random Forest pipeline on the test rows.
test_model(pl)

              precision    recall  f1-score   support

           A       0.67      0.03      0.06       119
           B       0.36      0.04      0.06       254
           C       0.00      0.00      0.00        15
           D       0.43      0.06      0.10       844
           E       0.90      0.60      0.72      1737
           F       0.00      0.00      0.00        30
           G       0.55      0.15      0.24       811
           H       0.90      0.80      0.85      1702
           I       0.75      0.41      0.53      2391
           J       0.77      0.56      0.65      3641
           K       0.90      0.74      0.81      5363
           L       0.68      0.34      0.45       758
           M       0.84      0.74      0.79      4427
           N       0.85      0.76      0.81      4486
           O       0.00      0.00      0.00         2
           P       0.00      0.00      0.00         1
           Q       0.00      0.00      0.00        14
           R       0.57    