## HW5: Vacancy Classification SF01

### [0] Import libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import pickle

import bs4
import re
from itertools import combinations

from tqdm import tqdm
tqdm.pandas();

import gensim
import pymorphy2

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline

import xgboost as xgb

In [3]:
RSTATE = 42

***

### [1] Load data

In [4]:
# train = pd.read_csv('train.csv', sep='\t', index_col='id')
# test = pd.read_csv('test.csv', sep='\t', index_col='id')

In [5]:
# other = pd.read_csv('other.csv', sep='\t')
# other.index = other.index.set_names('id')

***

### [2] Preprocess data

In [6]:
morph = pymorphy2.MorphAnalyzer()    # MorphAnalyzer instance

In [7]:
# RegExp for finding russian and english words, optionally hyphenated
# (e.g. "что-то"). Group 0 of every `findall` tuple is the whole word.
# The Latin range is spelled `A-Za-z` on purpose: `A-z` would also match the
# ASCII punctuation located between 'Z' and 'a' ( [ \ ] ^ _ ` ), gluing
# things like "foo_bar" into a single token.
template = re.compile("([А-ЯЁа-яёA-Za-z]+(-[А-ЯЁа-яёA-Za-z]+)*)", re.S)

#### Tokenization

In [8]:
def process(sentence):
    '''
    Process one sentence (string).

    Strips HTML markup, extracts russian/english words with `template`,
    and returns the list of their normal forms, skipping prepositions
    and conjunctions.

    Parameters
    ----------
    sentence : str
        Raw text, possibly containing HTML tags.

    Returns
    -------
    list of str
        Normalized words in order of appearance.
    '''

    # strip html tags
    text = bs4.BeautifulSoup(sentence, 'lxml').get_text()

    # tokenize sentence and normalize words + filter out prepositions and conjunctions
    words = []
    for match in template.findall(text):
        # match[0] is the full word; analyse it once and reuse the result
        # for both the part-of-speech filter and the normal form
        parsed = morph.parse(match[0])[0]
        if parsed.tag.POS not in ('PREP', 'CONJ'):
            words.append(parsed.normal_form)

    return words

In [9]:
def tokenize_text(df):
    '''
    Return a copy of `df` whose 'name' and 'description' columns hold
    token lists produced by `process` (html stripped, words normalized).

    The input frame is left untouched.
    '''

    result = df.copy()

    # one pass over the rows: (name tokens, description tokens) per record
    tokenized = [(process(record.name), process(record.description))
                 for record in tqdm(result.itertuples())]

    result['name'] = [pair[0] for pair in tokenized]
    result['description'] = [pair[1] for pair in tokenized]

    return result

Preprocess the train set, the test set and the additional data:

In [10]:
# %%time
# other_norm = other.pipe(tokenize_text)

In [11]:
# %%time
# train_norm = train.pipe(tokenize_text)

In [12]:
# %%time
# test_norm = test.pipe(tokenize_text)

Save to disk:

In [13]:
# other_norm.to_csv('other_norm.csv', sep='\t', index_label='id')
# train_norm.to_csv('train_norm.csv', sep='\t', index_label='id')
# test_norm.to_csv('test_norm.csv', sep='\t', index_label='id')

Load from disk:

In [14]:
def conv(s):
    '''
    Parse a token list that was stored in CSV as its string repr,
    e.g. "['a', 'b']" -> ['a', 'b'].

    An empty stored list ("[]") yields [] rather than [''], so no bogus
    empty-string token leaks into the corpus.
    '''
    inner = s.strip("[]").replace("'", '')
    # "".split(", ") would return [''], hence the explicit empty check
    return inner.split(", ") if inner else []
# Reload the cached, already tokenized frames. The 'name' and 'description'
# columns are passed through `conv` to turn the stored text back into lists.
other_norm = pd.read_csv('other_norm.csv', sep='\t', index_col='id', converters={'name': conv, 'description': conv})
train_norm = pd.read_csv('train_norm.csv', sep='\t', index_col='id', converters={'name': conv, 'description': conv})

Split into train and validation:

In [15]:
# Hold out 10% of the labelled data for validation; seeded with RSTATE.
train_df, valid_df = train_test_split(train_norm, test_size=0.1, random_state=RSTATE)

In [16]:
del train_norm

***

### [3] Train Word2Vec

#### Word2Vec will be trained on 90% of train data and additional data (file 'other.csv')

In [17]:
# Each column holds one token list per row, so the element-wise `+` joins the
# name tokens and the description tokens of a row into a single document.
# The training split and the additional data are then stacked into one corpus.
corpus = np.concatenate([train_df.name.values+train_df.description.values, 
                         other_norm.name.values+other_norm.description.values])

In [18]:
del other_norm

In [19]:
%%time
# CBOW (sg=0) with negative sampling (hs=0, negative=5), 100-dim vectors,
# every token kept (min_count=1).
# NOTE(review): gensim documents that a fixed `seed` gives fully reproducible
# vectors only with a single worker thread; with workers=4 results may vary
# between runs — confirm whether exact reproducibility is required.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in 4.x) — confirm installed version.
w2v_model = gensim.models.Word2Vec(sentences=corpus, size=100, sg=0, window=10, sample=1e-5, workers=4, 
                                   seed=RSTATE, min_count=1, hs=0, negative=5)

Wall time: 4min 8s


In [20]:
del corpus

In [21]:
# w2v_model.save('w2v_model_cbow_100.vec')

***

### [4] Identify keywords

Count word frequency by target class:

In [22]:
# Re-join the token lists into whitespace-separated strings (name followed by
# description), one string per training row, as input for CountVectorizer.
# NOTE(review): CountVectorizer's default token pattern re-tokenizes this text
# (presumably splitting hyphenated words and dropping 1-char tokens) — confirm this is intended.
X = train_df['name'].map(lambda x: ' '.join(x)).values + ' ' + train_df['description'].map(lambda x: ' '.join(x)).values

In [23]:
counter_pos = CountVectorizer()
counter_neg = CountVectorizer()

In [24]:
%%time
# Fit one vectorizer per target class: rows with target == 1 feed the
# positive counter, rows with target == 0 the negative one.
counts_pos = counter_pos.fit_transform(X[train_df.target==1])
counts_neg = counter_neg.fit_transform(X[train_df.target==0])

Wall time: 24.8 s


In [25]:
del X, counts_pos, counts_neg

Select most frequent words from positive class as keywords:

In [26]:
def make_keywords(top, k=1):
    '''Create keywords from top positive class words.

    Takes the first `top` entries of the positive-class vocabulary and drops
    every word that is also among the first `top*k` entries of the
    negative-class vocabulary. Order of the surviving words is preserved.

    NOTE(review): `vocabulary_` maps term -> column index and stores no counts,
    so these slices follow the dict's key order, which is presumably
    first-appearance order rather than frequency. Ranking by frequency would
    need the count matrices from `fit_transform` — confirm the intent.

    Parameters
    ----------
    top : int
        Number of positive-class vocabulary entries to consider.
    k : int, default 1
        Multiplier for the number of negative-class entries to exclude.

    Returns
    -------
    list of str
    '''

    toppos = list(counter_pos.vocabulary_.keys())[:top]
    # exclude top*k negative class words; a set makes each lookup O(1)
    topneg = set(list(counter_neg.vocabulary_.keys())[:top*k])

    return [w for w in toppos if w not in topneg]

In [27]:
print(make_keywords(20, k=5))

['продавец', 'обязанность', 'консультирование', 'продажа', 'оборудование', 'приём', 'входящая', 'звонок', 'постоянный', 'поиск', 'клиент', 'потенциальный', 'исходящая', 'соблюдение', 'порядок', 'торговый', 'зал']


***

### [5] Generate features

#### Helper functions

In [28]:
def embed_doc(model, tokens, method='mean'):
    '''
    Create embedding for a single document, represented as a list of tokens.

    Parameters
    ----------
    model : object
        Either a model exposing its word vectors as `model.wv` (as
        `cosine_to_word` expects) or an object that is itself indexable by
        token. Must provide `vector_size` and raise KeyError for unknown tokens.
    tokens : iterable of str
        Document tokens; tokens missing from the vocabulary are skipped.
    method : {'mean', 'sum', 'mean_conv'}
        How the token vectors are aggregated. 'mean_conv' averages the
        pairwise convolutions (mode='same') of all token-vector pairs.

    Returns
    -------
    numpy.ndarray
        Document vector. All zeros (length `model.vector_size`) when no token
        is in the vocabulary, or when 'mean_conv' has fewer than two vectors
        and therefore no pair to convolve.

    Raises
    ------
    KeyError
        If `method` is unknown and at least one token was found.
    '''

    def _mean_conv(vecs):
        pairs = [np.convolve(vx, vy, mode='same') for vx, vy in combinations(vecs, 2)]
        if not pairs:
            # a single vector yields no pairs; np.mean([]) would give scalar NaN
            return np.zeros(model.vector_size)
        return np.mean(pairs, axis=0)

    methods = {
        'mean': lambda x: np.mean(x, axis=0),
        'sum': lambda x: np.sum(x, axis=0),
        'mean_conv': _mean_conv
    }

    # look vectors up on `.wv` when the model has it, else index the object itself
    lookup = getattr(model, 'wv', model)

    vectors = []

    for token in tokens:
        try:
            vectors.append(lookup[token])
        except KeyError:
            continue    # token not in dictionary

    if not vectors:
        return np.zeros(model.vector_size)    # return all zeros if tokens not in dictionary

    return methods[method](vectors)

In [29]:
def cosine_to_word(model, sentence_vec, word):
    '''
    Cosine similarity between a sentence embedding and the embedding of `word`.

    The word vector is read from `model.wv` and reshaped to a single row;
    the result is whatever `cosine_similarity` returns for the two inputs.
    '''
    target_row = model.wv[word].reshape(1, -1)
    similarity = cosine_similarity(sentence_vec, target_row)
    return similarity

#### Feature generation

In [30]:
class Word2VecEmbedding(BaseEstimator, TransformerMixin):
    '''
    Create word2vec embedding for an array of documents.

    Parameters
    ----------
    model : object
        Word-vector model handed to `embed_doc`; must expose `vector_size`.
    method : str, default 'mean'
        Aggregation method forwarded to `embed_doc`.
    '''
    
    def __init__(self, model=None, method='mean'):
        self.model = model
        self.method = method
        
    def fit(self, X, y=None):
        '''Stateless transformer: nothing to learn.'''
        return self
        
    def transform(self, df):
        '''
        Process array of documents.

        `df` must have 'name' and 'description' columns holding token lists.
        Returns an array of shape (len(df), model.vector_size) where each row
        is the sum of the name embedding and the description embedding.
        '''
        
        embeddings = np.zeros((df.shape[0], self.model.vector_size))
        
        for i, row in enumerate(df.itertuples()):
            # use the configured model and method, not notebook-level globals
            embeddings[i,:] = (embed_doc(self.model, row.name, method=self.method)
                               + embed_doc(self.model, row.description, method=self.method))
            
        return embeddings

In [31]:
class KeywordsDistanceFeatures(BaseEstimator, TransformerMixin):
    '''
    Add keyword distance features to an array of word2vec-embedded documents.

    Parameters
    ----------
    keywords : sequence of str
        Words whose cosine similarity to each document is appended.
    model : object
        Model exposing word vectors as `model.wv[word]`.
    '''

    def __init__(self, keywords, model=None):
        self.model = model
        self.keywords = keywords
    
    def fit(self, X, y=None):
        '''Stateless transformer: nothing to learn.'''
        return self
    
    def transform(self, emb_docs):
        '''
        Keywords' "distances" from emb_docs.

        Returns `emb_docs` with one extra column per keyword holding the
        cosine similarity between the document row and that keyword's vector.
        With no keywords, `emb_docs` is returned unchanged.

        Raises
        ------
        ValueError
            If no model was supplied.
        KeyError
            If a keyword is missing from the model vocabulary (raised by the
            `model.wv` lookup).
        '''

        if self.model is None:
            raise ValueError("KeywordsDistanceFeatures requires a word2vec model (model=...)")

        if len(self.keywords) == 0:
            return emb_docs

        # one row per keyword, taken from the configured model (not a global)
        keyword_vecs = np.vstack([self.model.wv[kw] for kw in self.keywords])

        # similarity between each document and each of keywords, computed in a
        # single pairwise call: result has shape (n_docs, n_keywords)
        distances = cosine_similarity(emb_docs, keyword_vecs)
            
        return np.hstack([emb_docs, distances])

In [32]:
class KeywordsIndicatorFeatures(BaseEstimator, TransformerMixin):
    '''
    Binary keyword-presence features for documents given as token lists.
    '''
    
    def __init__(self, keywords):
        self.keywords = keywords
        
    def fit(self, X, y=None):
        # stateless: nothing to fit
        return self
        
    def transform(self, df):
        '''
        For every row of `df` build a 0/1 vector, one entry per keyword:
        1 when the keyword occurs in the row's description or name, else 0.
        The per-row vectors are stacked vertically.
        '''

        def row_flags(record):
            return np.array([int(kw in record.description or kw in record.name)
                             for kw in self.keywords])

        return np.vstack([row_flags(record) for record in df.itertuples()])

In [33]:
def make_feature_pipe(model, kw_dist, kw_ind, method='mean'):
    '''
    Assemble the feature-generation pipeline.

    Two branches are concatenated by a FeatureUnion:
    'w2v_feats' (document embedding followed by keyword distances for
    `kw_dist`) and 'kw_ind' (keyword indicators for `kw_ind`).
    '''

    embedding_branch = Pipeline([
        ('embed', Word2VecEmbedding(model=model, method=method)),
        ('kw_dist', KeywordsDistanceFeatures(keywords=kw_dist, model=model)),
    ])
    indicator_branch = KeywordsIndicatorFeatures(keywords=kw_ind)

    union = FeatureUnion(transformer_list=[
        ('w2v_feats', embedding_branch),
        ('kw_ind', indicator_branch),
    ])

    return Pipeline([('union', union)])

***

### [6] Validation

Train classifier:

In [34]:
custom_kw = ['продажа', 'товар', 'торговый', 'кассовый', 'клиент', 'кассир', 'клиентский', 'продавец', 'касса', 'продукция']

In [35]:
pipe = make_feature_pipe(model=w2v_model, kw_dist=custom_kw, kw_ind=custom_kw, method='mean')

In [36]:
%%time
X_train = pipe.fit_transform(train_df)

Wall time: 10min 14s


In [37]:
y_train = train_df.target.values

In [38]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=RSTATE)

In [39]:
%%time
clf.fit(X_train, y_train)

Wall time: 26min 10s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

Validation score:

In [40]:
%%time
# Held-out data is only transformed: `pipe` was already fitted on train_df,
# and refitting a feature pipeline on validation rows is a leakage hazard
# as soon as any step learns state.
X_valid = pipe.transform(valid_df)

Wall time: 1min 14s


In [41]:
# ROC AUC on the held-out split, scored with the predicted probability
# taken from column 1 of predict_proba.
val_score = roc_auc_score(valid_df.target.values, clf.predict_proba(X_valid)[:,1])
val_score

0.9929885839020474

***

### [7] Test prediction

Refit classifier on the whole training set:

In [42]:
# Stack the train and validation feature matrices (and their targets) and
# refit the same classifier instance on all labelled rows.
clf.fit(np.vstack([X_train, X_valid]), np.concatenate([y_train, valid_df.target.values]))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

Make test prediction:

In [43]:
test_norm = pd.read_csv('test_norm.csv', sep='\t', index_col='id', converters={'name': conv, 'description': conv})

In [44]:
%%time
# Test rows are only transformed with the pipeline fitted on train_df;
# the feature pipeline must never be refitted on test data.
X_test = pipe.transform(test_norm)

Wall time: 9min 38s


In [45]:
test_pred = clf.predict_proba(X_test)[:,1]

In [46]:
sub = pd.DataFrame({'target': test_pred}, index = test_norm.index.values)
sub.index = sub.index.set_names('id')

In [47]:
sub.to_csv('w2v_cbow_100_ns5_rf_1000_ckw2.csv')