## Задание
Необходимо по тексту определить, где его лучше публиковать — в блоге на Хабрахабр или на Geektimes; другими словами, нужно научить алгоритм отличать статьи одного блога от статей другого. Подразумевается, что текст технический и релевантен тематике данных блогов.

В качестве исходных данных используются два JSON-файла, по 1000 текстов с каждого из этих двух сайтов.

In [None]:
import os
import json
import re
from os.path import join
import pandas as pd
import numpy as np

from string import punctuation
from nltk.stem.snowball import RussianStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
RND_SEED = 0  # seed passed to train_test_split
PATH_TO_DATA = "data/habr/"

# Characters removed from every text: ASCII punctuation plus a few
# typographic marks. A single string is used because it is spliced straight
# into a regex character class below.
specSymb = punctuation + "«»—“-№"
pattern = re.compile("[" + re.escape(specSymb) + "]")

# Russian stop-word list and the two Snowball stemmers used by preprocess().
stopRus = stopwords.words('russian')
stemmer_rus = SnowballStemmer("russian")
stemmer_eng = SnowballStemmer("english")

## Load data

In [None]:
def _load_labeled_frame(file_name, label):
    """Load one site's JSON dump and return ``(raw_json, labelled_frame)``.

    Args:
        file_name: name of the JSON file inside ``PATH_TO_DATA``.
        label: class label written to every row of the ``label`` column.
    """
    # Explicit UTF-8: the articles are Cyrillic text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(join(PATH_TO_DATA, file_name), encoding="utf-8") as f_in:
        records = json.load(f_in)

    frame = pd.DataFrame.from_dict(records)
    # A scalar is broadcast to every row, so it does not depend on how the
    # frame's index happens to be laid out.
    frame["label"] = label
    return records, frame


jsonka_geek, df_geek = _load_labeled_frame("geektimes.json", 0)
print("Size of geektimes sample: ", df_geek.shape[0])

jsonka_habr, df_habr = _load_labeled_frame("habrahabr.json", 1)
print("Size of habrahabr sample: ", df_habr.shape[0])


In [None]:
# Stack the Habrahabr rows above the Geektimes rows, renumber the rows and
# keep only the columns used for modelling.
df_all = (
    pd.concat([df_habr, df_geek])
    .reset_index()
    [["title", "text", "label"]]
)

In [None]:
def preprocess(text):
    """Normalise a raw text into a space-separated string of stemmed tokens.

    Steps: lowercase, delete every character matched by ``pattern``, replace
    'ё' with 'е', tokenize, drop purely numeric tokens and Russian stop
    words, then run each remaining token through the Russian and afterwards
    the English Snowball stemmer.
    """
    # NOTE: matched characters are deleted, not replaced by a space, so
    # tokens separated only by punctuation are glued together.
    cleaned = pattern.sub('', text.lower()).replace('ё', 'е')

    stems = []
    for token in word_tokenize(cleaned):
        if token.isdigit() or token in stopRus:
            continue  # skip numbers and stop words
        stems.append(stemmer_eng.stem(stemmer_rus.stem(token)))

    return " ".join(stems)

In [None]:
%%time
# Normalise both text columns in place with preprocess(): title first, then body.
for column in ("title", "text"):
    df_all[column] = df_all[column].apply(preprocess)

In [None]:
# Model input: title and body joined by a single space; target: the site label.
X = df_all["title"].str.cat(df_all["text"], sep=' ')
y = df_all["label"]

In [None]:
# Hold out 20% of the documents for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = RND_SEED)

In [None]:
# Number of documents in the train and test parts.
len(X_train), len(X_test)

## Define vectorizer and model

In [None]:
# Bag-of-words vectorizer created with binary=True.
vect = CountVectorizer(binary=True)
# Try different vectorizer types: binary, CountVectorizer, TfidfVectorizer, HashingVectorizer

In [None]:
logreg = LogisticRegression(C=1, solver="liblinear")  # see how the regularization coefficient and penalty type affect the result

In [None]:
def plot_roc_auc(y_score, y_test):
    """Draw the ROC curve for *y_score* against the true labels *y_test*.

    The area under the curve is shown in the legend. The figure is displayed
    and then closed; nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, y_score)
    area = metrics.auc(false_pos_rate, true_pos_rate)

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate,
             linewidth=3, label='ROC curve (area = %0.3f)' % area)
    # Dashed diagonal for visual comparison with the curve.
    plt.plot((0, 1), (0, 1), 'k--')

    plt.title('Receiver operating characteristic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.0)
    plt.legend(loc="lower right")

    plt.show()
    plt.close()

In [None]:
def fit_model(X_train, X_test, y_train, y_test, model, vectorizer):
    """Fit *vectorizer* and *model* on the train part and report test quality.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``. Precision and recall of the test predictions are printed and
    the ROC curve of the model's decision scores is plotted. Both *model*
    and *vectorizer* are fitted in place; nothing is returned.
    """
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    model.fit(train_features, y_train)
    predicted = model.predict(test_features)

    print("precision: %s;\nrecall %s" % (
        metrics.precision_score(y_test, predicted),
        metrics.recall_score(y_test, predicted),
    ))

    plot_roc_auc(model.decision_function(test_features), y_test)

In [None]:
%%time
# Train the logistic regression on the vectorized texts and report test quality.
fit_model(X_train, X_test, y_train, y_test, model=logreg, vectorizer=vect)

## Get prediction for new text

In [None]:
def getPrediction(inputTextFile, vectorizer, model):
    """Print which site the text in *inputTextFile* should be published on.

    Args:
        inputTextFile: path to a UTF-8 encoded text file.
        vectorizer: fitted vectorizer exposing ``transform``.
        model: fitted classifier exposing ``predict_proba``. The first
            probability column is reported as GeekTimes, any other winner
            as Habrahabr.

    Returns:
        None. The verdict and its probability (in percent) are printed.
    """
    # The context manager closes the file handle as soon as it has been read.
    with open(inputTextFile, encoding='utf8') as f_in:
        raw_text = f_in.read()

    # Collapse every run of whitespace (newlines included) into one space
    # before the usual preprocessing.
    proc = preprocess(" ".join(raw_text.split()))

    # Query the model once and reuse the probabilities.
    probabilities = model.predict_proba(vectorizer.transform([proc]))[0]
    best = max(probabilities)

    blog = u"GeekTimes" if probabilities[0] == best else u"Habrahabr"
    print ("C вероятностью {0} % данный текст опубликован на".format(round(best*100,2)), blog)

In [None]:
# Classify a sample text with the fitted vectorizer and model.
getPrediction("data/habr/input.txt", vectorizer=vect, model=logreg)

## Попробуем Vowpal Wabbit

Prepare sample for vw

In [None]:
def _write_vw_sample(file_name, indices):
    """Write the ``df_all`` rows selected by *indices* as a Vowpal Wabbit file.

    Every row becomes one line of the form
    ``<label> |title <title> |text <text>``.

    Args:
        file_name: name of the output file inside ``PATH_TO_DATA``.
        indices: iterable of ``df_all`` index labels to export, in order.
    """
    # Explicit UTF-8: the texts are Cyrillic and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(join(PATH_TO_DATA, file_name), "w", encoding="utf-8") as f_out:
        for idx in indices:
            row = df_all.loc[idx]
            # Class 0 is written as -1, the negative label VW expects.
            label = -1 if row.label == 0 else row.label
            f_out.write(
                str(label) + " |title " + row.title + " |text " + row.text + "\n"
            )


# Export exactly the rows of the earlier train/test split.
_write_vw_sample("geek_habr_sample_train.vw", X_train.index)
_write_vw_sample("geek_habr_sample_test.vw", X_test.index)


## Learn with vowpal wabbit

In [None]:
# Train Vowpal Wabbit on the training sample with logistic loss and --ngram 2,
# saving the model to data/habr/vw_trained.model.
!vw -d data/habr/geek_habr_sample_train.vw --loss_function logistic -f data/habr/vw_trained.model \
                    --ngram 2

In [None]:
# Load the saved model, run it on the test sample in test-only mode (-t)
# and write one prediction per line to data/habr/vw_predictions.txt.
!vw -i data/habr/vw_trained.model -t -d data/habr/geek_habr_sample_test.vw \
                                        -p data/habr/vw_predictions.txt

In [None]:
# Each line of the prediction file is parsed as one float score.
with open('data/habr/vw_predictions.txt') as pred_file:
    y_score_vw = list(map(float, pred_file))

# ROC curve of the VW scores against the true test labels.
plot_roc_auc(y_score_vw, y_test)

In [None]:
# precision_score / recall_score need hard class labels; the values read from
# the VW prediction file are continuous scores, which scikit-learn rejects
# with a ValueError. A positive score is taken as class 1 here
# (presumably the scores are raw margins, since no --link option is passed
# to vw - confirm).
y_pred_vw = (np.array(y_score_vw) > 0).astype(int)
precision = metrics.precision_score(y_test, y_pred_vw)
recall = metrics.recall_score(y_test, y_pred_vw)

In [None]:
# Peek at the first five VW scores.
y_score_vw[:5]

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of *x*, computed with NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Squash the VW scores with the sigmoid, then mark as positive every
# document whose squashed score exceeds 0.55.
y_score_vw_normalized = sigmoid(np.array(y_score_vw)) > 0.55

In [None]:
# Precision and recall of the thresholded VW predictions on the test split.
precision, recall = (
    metrics.precision_score(y_test, y_score_vw_normalized),
    metrics.recall_score(y_test, y_score_vw_normalized),
)
print("precision: %s;\nrecall %s" % (precision, recall))