In [1]:
!pip install pymystem3

Collecting pymystem3
  Downloading pymystem3-0.2.0-py3-none-any.whl (10 kB)
Installing collected packages: pymystem3
Successfully installed pymystem3-0.2.0


In [2]:
# Disabled install commands for pymorphy2, pymorphy2-dicts-ru and DAWG-Python.
# They stay commented out because the `from pymorphy2 import MorphAnalyzer`
# import in this notebook is commented out as well.
# !pip install pymorphy2
# !pip install pymorphy2-dicts-ru
# !pip install DAWG-Python

In [3]:
import numpy as np
import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC

from pymystem3 import Mystem
# from pymorphy2 import MorphAnalyzer
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from string import punctuation
from tqdm import tqdm

In [4]:
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

/kaggle/input/texts-classification-iad-hse-intro-2020/test.csv
/kaggle/input/texts-classification-iad-hse-intro-2020/sample_submission.csv
/kaggle/input/texts-classification-iad-hse-intro-2020/train.csv
/kaggle/input/preprocesseddata/df_test_preprocessed.csv
/kaggle/input/preprocesseddata/df_train_preprocessed.csv


In [5]:
# Root of the Kaggle input mount; every dataset path below is joined onto it.
DATA_DIR = '/kaggle/input/'

# Train/test CSVs from the `preprocesseddata` dataset. Compact unsigned dtypes
# are requested for the label and id columns to reduce memory usage.
train_csv_path = os.path.join(DATA_DIR, 'preprocesseddata/df_train_preprocessed.csv')
test_csv_path = os.path.join(DATA_DIR, 'preprocesseddata/df_test_preprocessed.csv')

df_train = pd.read_csv(train_csv_path, dtype={'Category': 'uint8'})
df_test = pd.read_csv(test_csv_path, dtype={'itemid': 'uint32'})
# sample_submission.csv is not read here; it is loaded later, right before the
# submission frame is filled in.

In [6]:
# Show the first five training rows.
df_train.head(n=5)

Unnamed: 0,Category_name,Category,title&description
0,Запчасти и аксессуары,10,эбу renault nissan dci delphi год комплект эбу...
1,Бытовая техника,21,утюг утп ватт ссср год продавать фото утюг утп...
2,Предложение услуг,114,возвма машина выкуп взять машина аренда послед...
3,"Одежда, обувь, аксессуары",27,полусапожки полусапожки отличный состояние оде...
4,"Одежда, обувь, аксессуары",27,босоножка кожаный кожаный натура босоножка kar...


In [7]:
# (rows, columns) of the train and test frames, shown as a pair.
(df_train.shape, df_test.shape)

((4234042, 3), (1411348, 2))

In [8]:
# Peek at a single training text (row label 7).
train_text_column = df_train['title&description']
train_text_column[7]

'свитер продавать свитер идеально состояние фирма benetton размер\n'

In [9]:
# Peek at a single test text (row label 418424).
test_text_column = df_test['title&description']
test_text_column[418424]

'бортик кроватка балдахин конверт одеяло бортик кроватка шт балдахин конверт одеяло выписка\n'

In [10]:
# Index labels of the training rows that have a missing value in any column,
# followed by how many such rows there are.
row_has_null = df_train.isnull().any(axis='columns')
rows = df_train.index[row_has_null]
len(rows)

216

In [11]:
# Remove every training row that has a missing value in any column (the same
# rows that the `rows` index computed earlier identifies).
# `dropna` replaces the former `drop(rows, 0, inplace=True)` call: pandas >= 2.0
# rejects a positional `axis` argument to `drop`, and dropping by label raises
# KeyError if the cell is re-run, whereas this form can be re-run safely.
df_train = df_train.dropna(axis=0, how='any')

In [12]:
# Shape of the training frame after the rows with missing values were removed.
df_train.shape

(4233826, 3)

In [13]:
# Number of distinct category labels present in the training data.
category_frequencies = df_train['Category'].value_counts()
len(category_frequencies)

50

In [14]:
# Per-column count of missing values remaining in the training frame.
df_train.isnull().sum(axis=0)

Category_name        0
Category             0
title&description    0
dtype: int64

In [15]:
# Category names of the training rows as a NumPy array (the repr is truncated).
category_names = df_train['Category_name']
category_names.values

array(['Запчасти и аксессуары', 'Бытовая техника', 'Предложение услуг',
       ..., 'Детская одежда и обувь', 'Детская одежда и обувь',
       'Дома, дачи, коттеджи'], dtype=object)

In [16]:
%%time

# Fit a unigram TF-IDF vocabulary (capped at 30000 features) on the training
# texts and vectorize them in the same pass.
tfidf_vectorizer = TfidfVectorizer(max_features=30000, ngram_range=(1, 1))
train_texts = df_train['title&description']
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)

CPU times: user 2min 29s, sys: 4.12 s, total: 2min 33s
Wall time: 2min 33s


In [17]:
# Scale each row (sample) of the training matrix to unit L2 norm.
# NOTE(review): the vectorizer was created without a `norm` argument, so its
# output is presumably already L2-normalized and this step may be redundant —
# TODO confirm before removing.
X_train_tfidf = normalize(X_train_tfidf, norm='l2', axis=1)

In [18]:
# Target labels, taken from the same frame the TF-IDF matrix was built from.
y_train = df_train['Category']

In [19]:
# Drop the notebook-level name for the training frame; features and labels have
# already been extracted into X_train_tfidf and y_train.
del df_train

In [20]:
# Disabled alternative: logistic regression fitted on the same TF-IDF features
# and labels. Kept for reference only; the LinearSVC fitted later in this
# notebook is the model used for the predictions.
# %%time

# lr = LogisticRegression(random_state=13)
# lr.fit(X_train_tfidf, y_train)

In [21]:
%%time

# Train a linear SVM on the TF-IDF features with a fixed random seed.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the trailing bare name displays the fitted model's repr as the cell output.
svm = LinearSVC(random_state=13).fit(X_train_tfidf, y_train)
svm

CPU times: user 17min 37s, sys: 5.37 s, total: 17min 42s
Wall time: 17min 42s


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=13, tol=0.0001,
          verbose=0)

In [22]:
# Per-column count of missing values in the test frame.
df_test.isnull().sum(axis=0)

itemid                0
title&description    93
dtype: int64

In [23]:
# Replace missing test texts with an empty string. Test rows cannot be dropped
# because the prediction count is later compared with the submission length.
# The fill is limited to the text column so no other column can be silently
# turned into object dtype, and the frame is rebound rather than mutated in place.
df_test = df_test.fillna({'title&description': ''})

In [24]:
# Re-check: per-column count of missing values in the test frame after filling.
df_test.isnull().sum(axis=0)

itemid               0
title&description    0
dtype: int64

In [25]:
%%time

# Vectorize the test texts with the vectorizer fitted on the training texts.
test_texts = df_test['title&description']
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

CPU times: user 52.1 s, sys: 591 ms, total: 52.7 s
Wall time: 52.7 s


In [26]:
# Scale each row (sample) of the test matrix to unit L2 norm, mirroring the
# treatment of the training matrix.
X_test_tfidf = normalize(X_test_tfidf, norm='l2', axis=1)

In [27]:
# Drop the notebook-level name for the test frame; its texts have already been
# vectorized into X_test_tfidf.
del df_test

In [28]:
# Predict a category label for every test row with the fitted linear SVM.
y_test_pred = svm.predict(X=X_test_tfidf)

In [29]:
# Load the submission template. Its columns are `Id` and `Category` (see the
# displayed head of `sub`), so the dtype mapping must name `Id`; the previous
# `itemid` key matched no column and the intended uint32 dtype was never applied.
# NOTE(review): assumes every Id fits in uint32, as the test `itemid` column
# does — confirm.
sample_submission_path = os.path.join(
    DATA_DIR, 'texts-classification-iad-hse-intro-2020/sample_submission.csv'
)
sub = pd.read_csv(sample_submission_path, dtype={'Id': 'uint32', 'Category': 'uint8'})

In [30]:
# Sanity check: the template must have one row per prediction.
(sub.shape, y_test_pred.shape)

((1411348, 2), (1411348,))

In [31]:
# Overwrite the template's Category column with the predictions, then preview.
# NOTE(review): the array is assigned by position, so this relies on the rows of
# sample_submission.csv being in the same order as the test CSV — TODO confirm.
sub['Category'] = y_test_pred
sub.head()

Unnamed: 0,Id,Category
0,1778449823,27
1,1677656962,27
2,1758182804,32
3,1689811299,10
4,1804706240,30


In [32]:
# Number of distinct categories that appear among the predictions.
predicted_frequencies = sub['Category'].value_counts()
len(predicted_frequencies)

50

In [33]:
# Write the submission file to the working directory without the index column.
submission_path = 'submission.csv'
sub.to_csv(submission_path, index=False)