In [62]:
%reset -f
import pandas as pd
import re
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk import tokenize
from nltk.corpus import stopwords
from nltk import stem

from pymystem3 import Mystem
from string import punctuation
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

### Functions

In [63]:
# Preprocessing contents

def clean_tags(text):
    """Return `text` with every ``<...>`` span (markup tag) removed.

    A span starts at ``<`` and runs to the next ``>``; it may cross line
    breaks. A ``<`` that is never closed is left untouched.
    """
    tag_pattern = re.compile('<[^>]*>')
    return tag_pattern.sub('', text)


def preprocess_text(text, mode):
    """Normalise a document into a space-separated string of word forms.

    The text is lower-cased and every character that is not a Cyrillic or
    Latin letter is replaced by a space. Words are then reduced either with
    the notebook-level ``mystem`` lemmatizer (``mode='lemm'``) or with the
    notebook-level ``stemmer_rus`` stemmer (``mode='stem'``).

    A word is dropped when it is listed in ``russian_stopwords``, is shorter
    than three characters, or is only whitespace/punctuation. In 'stem' mode
    this filter is applied to the word *before* it is stemmed.

    ``mystem``, ``stemmer_rus`` and ``russian_stopwords`` are looked up as
    globals at call time and must be defined by the notebook beforehand.

    Raises:
        AssertionError: if `mode` is neither 'lemm' nor 'stem'.
    """
    assert mode in ('lemm', 'stem')

    letters_only = re.sub("[^а-яА-Яa-zA-Z]", " ", text.lower())

    def is_kept(word):
        # Same three checks, in the same short-circuit order, for both modes.
        return (word not in russian_stopwords
                and len(word) > 2
                and word.strip() not in punctuation)

    if mode == 'lemm':
        # The lemmatizer output is filtered as-is; blank items fail the
        # punctuation check because '' is a substring of any string.
        kept = [lemma for lemma in mystem.lemmatize(letters_only) if is_kept(lemma)]
    else:
        kept = [stemmer_rus.stem(word) for word in letters_only.split() if is_kept(word)]

    return " ".join(kept)


def preprocess_contents(X, mode, verbose=True):
    """Clean and normalise the 'content' column of `X` in place.

    For every row: 'ё'/'Ё' are folded to 'е'/'Е', markup tags are stripped
    with ``clean_tags`` and the text is normalised with ``preprocess_text``.
    The result is written back into ``X['content']``.

    Args:
        X: DataFrame with a 'content' column; it is modified in place.
        mode: 'lemm' or 'stem', forwarded to ``preprocess_text``.
        verbose: when True (the default, matching the previous behaviour)
            the index label of each processed row is printed as a progress
            indicator. Pass False to keep the cell output short.

    Returns:
        The same DataFrame object `X`.
    """
    for num, row in X.iterrows():
        content = row['content']

        # A missing cell is NaN (a float) and has no .replace(); treat it as
        # empty text instead of aborting the whole, slow preprocessing run.
        if pd.isna(content):
            content = ''

        content = content.replace('ё', 'е').replace('Ё', 'Е')

        content = clean_tags(content)
        content = preprocess_text(content, mode)

        X.at[num, 'content'] = content
        if verbose:
            print(num)

    return X

In [64]:
# Prepare contents for VW

def _write_vw_examples(path, row_ids, W, label_for=None):
    """Write one Vowpal Wabbit example line per id in `row_ids` to `path`.

    Each line has the form ``[<label> ]| <idx>:<value> <idx>:<value> ...``.
    The features come from the stored entries of sparse row ``W[num]``, and
    only strictly positive values are written.

    Args:
        path: output file path (overwritten).
        row_ids: iterable of row ids; each must be usable as ``W[num]``.
        W: sparse matrix whose row slices expose ``.indices`` and ``.data``.
        label_for: optional callable ``num -> label``. When given, the label
            is written before the ``|``; when None the line is unlabeled.
    """
    with open(path, "w") as f_out:
        for num in row_ids:
            # Slice the sparse row once; the previous code sliced it twice
            # per example (once for .indices, once for .data).
            row = W[num]
            features = ' '.join([str(idx) + ':' + str(tf)
                                 for idx, tf in zip(row.indices, row.data) if tf > 0])

            if label_for is None:
                prefix = "| "
            else:
                prefix = str(label_for(num)) + " | "

            f_out.write(prefix + features + "\n")


def prepare_X_train_vectorized():
    """Write labeled training examples to 'in/train_input.vw'.

    Reads the notebook globals ``X_train`` (for its index), ``W_train``,
    ``y_train`` and ``hub_to_label``.
    """
    _write_vw_examples('in/train_input.vw', X_train.index, W_train,
                       label_for=lambda num: hub_to_label[y_train[num]])


def prepare_X_test_vectorized():
    """Write unlabeled examples for the local test split to 'in/test_input.vw'.

    Reads the notebook globals ``X_test`` and ``W_test``. NOTE(review): neither
    is defined in the visible part of the notebook; they have to be created
    (e.g. by a train/test split) before this can be called.
    """
    _write_vw_examples('in/test_input.vw', X_test.index, W_test)


def prepare_X_test_glob_vectorized():
    """Write unlabeled examples for the global test set to 'in/test_glob_input.vw'.

    Reads the notebook globals ``X_test_glob`` and ``W_test_glob``.
    """
    _write_vw_examples('in/test_glob_input.vw', X_test_glob.index, W_test_glob)


def make_vw_input(dtype):
    """Write the VW input file for one data split.

    Args:
        dtype: 'train', 'test' or 'test_glob'.

    Raises:
        AssertionError: if `dtype` is not one of the three known splits.
    """
    assert dtype in ('train', 'test', 'test_glob')

    writers = {
        'train': prepare_X_train_vectorized,
        'test': prepare_X_test_vectorized,
        'test_glob': prepare_X_test_glob_vectorized,
    }
    writers[dtype]()

In [65]:
# Prepare tags for VW

#return tags array
def extract_tags(tags_str):
    """Turn the textual tag list of one article into VW-safe tag names.

    `tags_str` is expected to look like ``"['first tag', 'second']"``: the
    tags are whatever sits between pairs of apostrophes. Inside each tag every
    whitespace-separated word is reduced to its Cyrillic/Latin letters, words
    shorter than three letters are dropped, and the rest are joined with '_'.
    Tags whose final name is shorter than three characters are dropped.

    Args:
        tags_str: the raw tag string. Any non-string value (e.g. the NaN that
            pandas uses for an empty cell, or None) is treated as "no tags".

    Returns:
        list[str]: the cleaned tag names, in their original order.
    """
    # Previously a missing value crashed with AttributeError on .split().
    if not isinstance(tags_str, str):
        return []

    res = []
    # After splitting on the apostrophe, quoted items sit at odd positions.
    for tag in tags_str.split("'")[1::2]:
        # A single letters-only filter is enough: it also removes ':' (which
        # would otherwise be parsed by VW as a name/value separator).
        words = [re.sub("[^а-яА-Яa-zA-Z]", "", word) for word in tag.split()]
        new_tag = '_'.join(word for word in words if len(word) > 2)

        if len(new_tag) > 2:
            res.append(new_tag)

    return res


def get_add_str(X, i):
    """Build the tag-feature suffix for the VW line of row `i`.

    The 'tags' value of row `i` is parsed with ``extract_tags`` and every tag
    is emitted as ``<tag>:<tag_weight>``, where ``tag_weight`` is a notebook
    global read at call time.

    Returns:
        str: a leading space followed by the space-separated tag features
        (just ``' '`` when the row has no usable tags).
    """
    tags = extract_tags(X['tags'][i])
    weighted = ['{}:{}'.format(tag, tag_weight) for tag in tags]
    return ' ' + ' '.join(weighted)


def add_tags(dtype):
    """Append tag features to every line of a split's VW input file.

    Reads ``in/<split>_input.vw``, appends the output of ``get_add_str`` for
    the row at the same position, and writes the result to the same path with
    a ``_tagged`` suffix. The source file is left untouched.

    Args:
        dtype: 'train', 'test' or 'test_glob'; selects both the file and the
            notebook-level DataFrame (``X_train`` / ``X_test`` /
            ``X_test_glob``) that supplies the tags.

    Raises:
        AssertionError: if `dtype` is not one of the three known splits.
    """
    assert dtype in ('train', 'test', 'test_glob')

    # Kept as an if/elif chain on purpose: only the selected frame is looked
    # up, so a split whose frame was never created does not break the others.
    if dtype == 'train':
        file_name = 'in/train_input.vw'
        X = X_train
    elif dtype == 'test':
        file_name = 'in/test_input.vw'
        X = X_test
    else:
        file_name = 'in/test_glob_input.vw'
        X = X_test_glob

    # Tag names may be Cyrillic, so fix the encoding explicitly instead of
    # relying on the platform default; that way train and test files always
    # carry identical bytes for identical tags, on any machine.
    with open(file_name, 'rt', encoding='utf-8') as fp:
        lines = [''.join([line.strip(), get_add_str(X, i), '\n']) for i, line in enumerate(fp)]

    with open(file_name + '_tagged', 'wt', encoding='utf-8') as fp:
        fp.writelines(lines)

### Preprocess data

In [None]:
# this takes a lot of time, use prediction_file

In [320]:
X_train = pd.read_csv('data/train_set.csv')
X_test_glob = pd.read_csv('data/test_set.csv')

# Globals that preprocess_text() looks up by name in 'lemm' mode.
mystem = Mystem()
russian_stopwords = stopwords.words("russian")

# preprocess_contents() rewrites the 'content' column in place and returns
# the very same frame, so tmp_train is X_train after this call.
tmp_train = preprocess_contents(X_train, 'lemm')
tmp_train.to_csv('data/train_set_preproc.csv', encoding='utf8')

# Fixed: this used to pass X_test, a name that is not defined in this
# notebook; the frame loaded above for the test set is X_test_glob.
tmp_test_glob = preprocess_contents(X_test_glob, 'lemm')
tmp_test_glob.to_csv('data/test_set_preproc.csv', encoding='utf8')

### OR Load data

In [66]:
# Load data

# Read the preprocessed training set and shuffle it right away; the fixed
# seed keeps the row order identical between runs.
X_train = (
    pd.read_csv('data/train_set_preproc.csv')
    .sample(frac=1, random_state=2281488)
    .reset_index(drop=True)
)

X_test_glob = pd.read_csv('data/test_set_preproc.csv')

### Clean data

In [67]:
# Keep only the first row for each distinct title, then renumber rows from 0.
X_train = X_train.drop_duplicates(subset=['title'])
X_train = X_train.reset_index(drop=True)

In [68]:
# Keep only the first row for each distinct text, then renumber rows from 0.
X_train = X_train.drop_duplicates(subset=['content'])
X_train = X_train.reset_index(drop=True)

In [69]:
# Training rows without any text are removed; rows are renumbered from 0 so
# that row labels keep matching row positions.
X_train = X_train.dropna(subset=['content'])
X_train = X_train.reset_index(drop=True)

In [70]:
# Test rows cannot be dropped, so a missing text becomes an empty string.
# Assign the result back: calling fillna(..., inplace=True) on the column
# selection works on a temporary under pandas Copy-on-Write and would leave
# X_test_glob unchanged.
X_test_glob['content'] = X_test_glob['content'].fillna('')

In [71]:
# Separate the target column from the features.
y_train = X_train.hub.values
# `columns=` instead of the positional axis argument (`drop("hub", 1)`),
# which pandas 2.0 no longer accepts.
X_train = X_train.drop(columns="hub").reset_index(drop=True)

### Mapping

In [72]:
# Two-way mapping between hub names and the 1-based integer class labels
# written to the VW files.
# The hubs are sorted first: iterating a bare set of strings yields a
# different order in every kernel session (string hashing is randomised per
# process), which would silently change the meaning of the labels stored in
# a previously written model or prediction file.
hub_to_label = {}
label_to_hub = {}
for label, hub in enumerate(sorted(set(y_train)), start=1):
    hub_to_label[hub] = label
    label_to_hub[label] = hub

### Vectorize

In [73]:
# Raw arrays of the preprocessed documents, one entry per row.
texts_train, texts_test_glob = (
    frame['content'].values for frame in (X_train, X_test_glob)
)

In [74]:
%%time
vectorizer = TfidfVectorizer(
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words=None,
)

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely projected onto them.
W_train = vectorizer.fit_transform(texts_train)
W_test_glob = vectorizer.transform(texts_test_glob)

CPU times: user 27.9 s, sys: 396 ms, total: 28.3 s
Wall time: 28.5 s


### Make files for VW

In [75]:
%%time
# Write the VW input file for each split used below.
for _dtype in ('train', 'test_glob'):
    make_vw_input(_dtype)

CPU times: user 1min 2s, sys: 580 ms, total: 1min 3s
Wall time: 1min 4s


In [76]:
# Value attached to every tag feature; read as a global by get_add_str().
tag_weight = 0.5

# Produce the '*_tagged' variant of each VW input file.
for _dtype in ('train', 'test_glob'):
    add_tags(_dtype)

### Train

In [77]:
# Remove the cache left by a previous training run. -f keeps this quiet when
# there is nothing to delete (e.g. on the first run), instead of printing
# "rm: cannot remove 'cache.q'".
!rm -f cache.q

!vw --oaa 20 -d in/train_input.vw_tagged -f in/vw.model --loss_function logistic \
\
--learning_rate=0.14 --quiet \
\
--passes 10 --cache_file cache.q --ngram 1 --holdout_off

rm: cannot remove 'cache.q': No such file or directory


### Predict

In [78]:
# Apply the model saved by the training cell (in/vw.model) to the tagged
# global test file. Predictions go to out/predict.vw, which the next cell
# parses as one integer label per line.
!vw -t -i in/vw.model in/test_glob_input.vw_tagged -p out/predict.vw --quiet

In [90]:
# Parse VW's output: every line holds one integer class label.
y_pred = []
with open("out/predict.vw") as f_in:
    y_pred.extend(int(line.strip()) for line in f_in)

In [91]:
# Translate each predicted label back to its hub name, one name per line.
with open('out/sample_submission.csv', "w") as f_out:
    f_out.writelines(label_to_hub[label_num] + "\n" for label_num in y_pred)

### Testing

In [97]:
# Re-read the submission file so that y_pred holds hub names (strings),
# exactly as they were written to disk.
y_pred = []
with open('out/sample_submission.csv') as f_in:
    y_pred.extend(line.strip() for line in f_in)

In [341]:
# provide this

# Ground-truth hub names for the test set, one per line, in the same order
# as the rows of the test set.
y_test = []
with open("data/label_test.csv") as f_in:
    y_test.extend(line.strip() for line in f_in)

### Accuracy

In [102]:
# Share of test articles whose predicted hub equals the true hub.
accuracy_score(y_true=y_test, y_pred=y_pred)

Score ~ 0.67...0.71