# Team 2

- Michael Tomilov
- Roman Romanov
- Yura Khaltin

# Imports

In [1]:
import bs4
import os
import pickle
import time
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pymorphy2
from gensim.models import KeyedVectors

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

import texts2counters as t2c

FOLDER_PATH = '/home/mtomilov/Downloads/sfml/week5/all/{}.{}'
RND_SEED = 123

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

morph = pymorphy2.MorphAnalyzer()

INFO:pymorphy2.opencorpora_dict.wrapper:Loading dictionaries from /home/mtomilov/.local/lib/python3.6/site-packages/pymorphy2_dicts/data
INFO:pymorphy2.opencorpora_dict.wrapper:format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


# Data Read

In [12]:
df_test = pd.read_csv(FOLDER_PATH.format('test.csv'), sep='\t', encoding='utf8')

In [2]:
df_train = pd.read_csv(FOLDER_PATH.format('train.csv'), sep='\t', encoding='utf8')

In [4]:
df_train.head()

Unnamed: 0,id,name,description,target
0,0,Заведующий отделом/секцией в магазин YORK (Уру...,<p><strong>В НОВЫЙ МАГАЗИН YORK (хозтовары) пр...,1
1,1,Наладчик станков и манипуляторов с ПУ,Обязанности:работа на токарных станках с ЧПУ T...,0
2,2,Разработчик С++ (Криптограф),<strong>Требования:</strong> <ul> <li>Опыт про...,0
3,3,Фрезеровщик,<p>Условия:</p> <ul> <li>На работу вахтовым ме...,0
4,4,Мерчендайзер/продавец-консультант,<p><strong>Компания Палладиум Стандарт - призн...,1


# Data Preprocessing

In [6]:
# model_w2v = KeyedVectors.load_word2vec_format('~/Downloads/sfml/news_upos_cbow_600_2_2018.vec.gz')

train.npy and test.npy это посчитанные word2vec вектора  
Считаются скриптом word_vectors.py

In [11]:
X_final_test = np.load(FOLDER_PATH.format('test.npy'))

In [13]:
np.nansum(X_final_test)

10316.856

In [3]:
X = np.load(FOLDER_PATH.format('train.npy'))

In [4]:
np.nansum(X)

12133.296

In [5]:
y = df_train['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Logistic Regression

In [7]:
logistic_model_basic = LogisticRegression()
logistic_model_basic.fit(X_train, y_train)
y_hat = logistic_model_basic.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_hat)



0.9735641506832877

In [8]:
logistic_model_basic = LogisticRegression(C=1000.)
logistic_model_basic.fit(X_train, y_train)
y_hat = logistic_model_basic.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_hat)



0.9806271077489068

In [102]:
# explodes
logistic_model = LogisticRegression()
logistic_params = {
    'penalty': ['l1','l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000]
#     'C': [0.1, 1, 10]
}

cv = StratifiedKFold(n_splits=5, random_state=RND_SEED, shuffle=True)
logistic_search = GridSearchCV(logistic_model, param_grid=logistic_params, n_jobs=-1,
                                               cv=cv, scoring='roc_auc', verbose=True)
logistic_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.9s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=True)

In [103]:
logistic_search.best_params_

{'C': 1000, 'penalty': 'l2'}

In [104]:
logistic_search.best_score_

0.9694638098256991

# Stochastic Gradient Descent (Logistic Elastic Net)

In [9]:
sgd = SGDClassifier(loss='log', penalty='elasticnet', alpha=0.0001, l1_ratio=0.5)

In [10]:
sgd.fit(X_train, y_train)
y_hat = sgd.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_hat)



0.9629603191068422

In [108]:
# explodes
sgd_params = {
    'l1_ratio': [0.001, 0.1, 0.3, 0.6, 0.9, 0.99],
    'alpha': np.logspace(-3, 5, 20),
}

cv = StratifiedKFold(n_splits=5, random_state=RND_SEED, shuffle=True)
sgd_search = GridSearchCV(sgd, param_grid=sgd_params, n_jobs=-1,
                          cv=cv, scoring='roc_auc', verbose=True)
sgd_search.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   29.8s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
       error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.5, learning_rate='optimal', loss='log', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'l1_ratio': [0.001, 0.1, 0.3, 0.6, 0.9, 0.99], 'alpha': array([1.00000e-03, 2.63665e-03, 6.95193e-03, 1.83298e-02, 4.83293e-02,
       1.27427e-01, 3.35982e-01, 8.85867e-01, 2.33572e+00, 6.15848e+00,
       1.62378e+01, 4.28133e+01, 1.12884e+02, 2.97635e+02, 7.84760e+02,
       2.06914e+03, 5.45559e+03, 1.43845e+04, 3.79269e+04, 1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, re

In [109]:
sgd_search.best_params_

{'alpha': 0.001, 'l1_ratio': 0.001}

In [110]:
sgd_search.best_score_

0.935010511710573

# Decision Tree

In [29]:
tree_params = {
    'class_weight': 'balanced',
    'criterion': 'entropy',
    'max_depth': 6,
    'max_features': None,
    'min_samples_leaf': 3
}

In [30]:
tree = DecisionTreeClassifier(**tree_params)
tree.fit(X_train, y_train)

NameError: name 'best_tree' is not defined

In [31]:
y_hat = tree.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_hat)

0.9428033169037054

# Data Save

In [18]:
final_ids = df_test.id

In [23]:
y_final_hat = logistic_model_basic.predict_proba(X_final_test)[:, 1]

In [26]:
df_predict = pd.DataFrame({'id': final_ids, 'target': y_final_hat})

In [27]:
df_predict.to_csv(FOLDER_PATH.format('submission.csv'), sep=',', encoding='utf8', index=False)