# Preprocessing and exploring word embeddings

#### Dependencies

In [22]:
import pandas as pd
import eli5
import matplotlib.pyplot as plit
%matplotlib inline
from eli5.sklearn import InvertableHashingVectorizer
import numpy as np


from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBClassifier
from gensim.models import KeyedVectors
import operator


import re

import sys
sys.path.append('../src')

%load_ext autoreload
%autoreload 1

from datatasks.sample_data import sample_data
%aimport datatasks.sample_data

from models.models import evaluate_model

from models.plot import plot_confusion_matrix

from models.pipeline import make_features_pipeline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load data

In [36]:
# Data locations, given relative to the notebook's working directory.
DATA_PATH = '../data/'
DATA_INTERIM_PATH = DATA_PATH + 'interim/'
# Read the training and validation CSV files from the interim data folder.
train = pd.read_csv(DATA_INTERIM_PATH + 'train_p.csv')
val = pd.read_csv(DATA_INTERIM_PATH + 'val_p.csv')

KeyboardInterrupt: 

#### Sample data

In [None]:
# Build the working subsets with the project helper `sample_data`.
# presumably 70000 / 10000 are the number of rows to keep and 'train' / 'val'
# label the split — TODO confirm against datatasks.sample_data
train_s = sample_data(train, 70000, 'train')
val_s = sample_data(val, 10000, 'val')

In [None]:
# Target labels are taken from the 'hyperpartisan' column of each sampled frame.
# Note: `y_test` holds the labels of the validation sample (`val_s`).
y_train = train_s['hyperpartisan']
y_test = val_s['hyperpartisan']

#### Mask numbers

In [None]:
def clean_numbers(x):
    """Mask runs of digits in ``x`` with ``#`` placeholders.

    Runs of five or more digits become ``#####``; runs of exactly four,
    three or two digits become the same number of ``#`` characters.
    Single digits are left unchanged.

    Parameters
    ----------
    x : str
        Text to mask.

    Returns
    -------
    str
        ``x`` with the digit runs replaced.
    """
    # Longest pattern first: each pass only sees the digit runs that the
    # previous, longer patterns left behind ('#' is never matched again).
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    masked = x
    for digit_pattern, placeholder in masks:
        masked = re.sub(digit_pattern, placeholder, masked)
    return masked

# Mask digit runs in the text column of both sampled frames.
for sampled_frame in (train_s, val_s):
    sampled_frame['preprocessed_text'] = sampled_frame['preprocessed_text'].apply(clean_numbers)

#### Custom stopwords

In [None]:
# scikit-learn's built-in English stop words extended with calendar words
# and a few extra tokens; the result is a single frozenset.
my_stop_words = text.ENGLISH_STOP_WORDS.union(
    # abbreviated month names
    ('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'),
    # full month names
    ('january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september',
     'october', 'november', 'december'),
    # full weekday names
    ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'),
    # abbreviated weekday names
    ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'),
    # other tokens
    ('advertisement', 'said', 'image', 'year'),
)

In [None]:
# Hash uni- and bi-grams of the text, dropping the custom stop words.
# scikit-learn's parameter validation (1.2+) accepts only 'english', a list
# or None for `stop_words`, so the frozenset is passed as a sorted list
# (sorted only to make the argument deterministic).
vec = HashingVectorizer(norm='l2', ngram_range=(1, 2), stop_words=sorted(my_stop_words))
# SGD shuffles the training data; a fixed seed keeps repeated runs comparable.
clf = SGDClassifier(random_state=42)
# Project helper that wraps the vectorizer for the 'preprocessed_text' column.
feats = make_features_pipeline(vec, 'preprocessed_text')

# Feature extraction followed by the linear classifier.
pipeline = Pipeline([
    ('feats', feats),
    ('clf', clf)
])

In [None]:
# Fit the feature step and the classifier on the sampled training frame and its labels.
pipeline.fit(train_s, y_train)

In [None]:
# Predict labels for the sampled validation frame and pass them, with the true labels,
# to the project helper `evaluate_model`.
# NOTE(review): evaluate_model is called with (predicted, y_test) while
# plot_confusion_matrix is called with (y_test, predicted) — confirm both
# argument orders match the helpers' signatures.
predicted = pipeline.predict(val_s)
evaluate_model(predicted, y_test)

In [None]:
# Pass the validation labels and the pipeline's predictions to the project plotting helper.
plot_confusion_matrix(y_test, predicted)

## Word embeddings

In [21]:
# Path to the word2vec file (binary format, gzip-compressed per its extension).
news_path = '../data/external/GoogleNews-vectors-negative300.bin.gz'
# Load the vectors into a gensim KeyedVectors object; binary=True selects the binary word2vec format.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

#### Check vocab coverage