# Converting the project to Gensim best practices

#### Dependencies

In [20]:
import logging
import gensim
from gensim import corpora
from pprint import pprint
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from gensim.sklearn_api import TfIdfTransformer
from smart_open import smart_open
import os
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
import pickle
import glob
import itertools
from gensim.sklearn_api import TfIdfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

import sys
sys.path.append('../src')

%load_ext autoreload
%autoreload 1

from models.models import evaluate_model

from models.plot import plot_LSA, plot_confusion_matrix

# Route INFO-level (and above) log records to the notebook output.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers, so the level is
# forced explicitly. Use the public setLevel() rather than assigning to `.level`: on
# Python 3.7+ it also invalidates the loggers' cached isEnabledFor() results.
logging.root.setLevel(logging.INFO)  # ipython sometimes messes up the logging setup; restore

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load data and get token lists from dataframe

This only needs to be run once. The token lists are pickled to disk so that later cells can stream them into Gensim instead of re-parsing the CSV files.

In [4]:
# Root of the data tree, relative to the notebook's working directory.
DATA_PATH = '../data/'
# interim/ holds the *_p.csv inputs; processed/ holds the pickled tokens,
# the saved dictionary and the serialized .mm corpora.
DATA_INTERIM_PATH, DATA_PROCESSED_PATH = (
    DATA_PATH + subdir for subdir in ('interim/', 'processed/')
)

In [21]:
#train = pd.read_csv(DATA_INTERIM_PATH + 'train_p.csv', usecols=['tokens'])
#train_tokens1 = [ast.literal_eval(train['tokens'].values[i]) for i in tqdm(range(0, 149999))]

#with open(DATA_PROCESSED_PATH + "train_tokens1.txt", "wb") as internal_filename:
#    pickle.dump(train_tokens1, internal_filename)

100%|██████████| 149999/149999 [05:17<00:00, 471.75it/s]


In [3]:
#val = pd.read_csv(DATA_INTERIM_PATH + 'val_p.csv', usecols=['tokens'])
#val_tokens = [ast.literal_eval(val['tokens'].values[i]) for i in tqdm(range(0,val.shape[0]))]
#with open(DATA_PROCESSED_PATH + "val_tokens.txt", "wb") as internal_filename:
#    pickle.dump(val_tokens, internal_filename)

100%|██████████| 150000/150000 [10:40<00:00, 234.33it/s]


#### Create Gensim Dictionary and Bag-of-Words Corpus

In [47]:
#train_dictionary = corpora.Dictionary()
#fnames = list(reversed(glob.glob(DATA_PROCESSED_PATH + 'train_tokens*')))
#for fname in fnames:
#    with open(fname, "rb") as internal_filename:
#        train_tokens = pickle.load(internal_filename)
#    train_dictionary.add_documents(train_tokens)

#train_dictionary.save(DATA_PROCESSED_PATH + 'train_dict.dict')

In [5]:
class MyCorpus(object):
    """Re-iterable bag-of-words stream over pickled token-list files.

    Parameters
    ----------
    filepath : str
        Prefix of the files to read. It is concatenated as-is with the file
        pattern, so a directory must end with a path separator.
    dictionary : object
        Any object exposing ``doc2bow(tokens)``; it is called once per document.
    trainorval : str
        Split name. Files matching ``<filepath><trainorval>_tokens*`` are read.

    Iterating yields ``dictionary.doc2bow(doc)`` for every document of every
    matching file; each file is expected to be a pickle of an iterable of documents.
    """

    def __init__(self, filepath, dictionary, trainorval):
        self.filepath, self.dictionary, self.trainorval = filepath, dictionary, trainorval
        pattern = filepath + trainorval + '_tokens*'
        # The file list is resolved once, at construction time, in reversed glob order.
        # NOTE(review): glob order is filesystem-dependent, so the document order is
        # only stable on a given machine -- confirm it lines up with the target labels.
        self.fnames = glob.glob(pattern)[::-1]

    def __iter__(self):
        for path in self.fnames:
            with open(path, "rb") as handle:
                # NOTE: pickle.load can execute arbitrary code; only read trusted files.
                documents = pickle.load(handle)
                for tokens in documents:
                    yield self.dictionary.doc2bow(tokens)

#### Serialize train and val BoW

In [6]:
# Reload the dictionary previously saved under processed/.
train_dictionary = corpora.Dictionary.load(DATA_PROCESSED_PATH + 'train_dict.dict')
# Stream the pickled training token lists through the dictionary ...
train_corpus = MyCorpus(
    filepath=DATA_PROCESSED_PATH,
    dictionary=train_dictionary,
    trainorval='train',
)
# ... and write the resulting bag-of-words vectors in Matrix Market format.
corpora.MmCorpus.serialize(
    fname=DATA_PROCESSED_PATH + 'train_bow.mm',
    corpus=train_corpus,
)

In [15]:
# The validation split is encoded with the *training* dictionary so that both
# corpora share the same term ids.
val_corpus = MyCorpus(DATA_PROCESSED_PATH, train_dictionary, 'val')
# Serialize the validation corpus (the original wrote `train_corpus` here, which made
# val_bow.mm a copy of the training data).
gensim.corpora.MmCorpus.serialize(DATA_PROCESSED_PATH + 'val_bow.mm', val_corpus)

#### Load BoW

In [17]:
# Open the serialized bag-of-words corpora, training split first.
train_mm_corpus, val_mm_corpus = (
    gensim.corpora.MmCorpus(DATA_PROCESSED_PATH + mm_name)
    for mm_name in ('train_bow.mm', 'val_bow.mm')
)

#### Load targets

In [18]:
# Read only the label column of each split. Both results are single-column
# DataFrames, and `y_test` holds the labels of the *validation* file.
y_train, y_test = (
    pd.read_csv(DATA_INTERIM_PATH + csv_name, usecols=['hyperpartisan'])
    for csv_name in ('train_p.csv', 'val_p.csv')
)

#### TF-IDF

In [21]:
# Fit the TF-IDF weights on the training bag-of-words corpus.
tfidf = TfIdfTransformer(dictionary=train_dictionary)
# gensim's transformer returns a list of sparse [(term_id, weight), ...] documents,
# which scikit-learn estimators cannot consume. Convert it to a SciPy sparse matrix:
# corpus2csc produces terms x documents, so transpose to documents x terms.
# num_terms is pinned to the dictionary size so every split gets the same width.
X_train = gensim.matutils.corpus2csc(
    tfidf.fit_transform(train_mm_corpus),
    num_terms=len(train_dictionary),
).T



#### Fit Classifier

In [None]:
# Logistic regression with scikit-learn's default hyper-parameters.
clf = LogisticRegression()
# The labels are passed as a plain Python list taken from the 'hyperpartisan' column.
clf.fit(X=X_train, y=y_train['hyperpartisan'].tolist())

#### Predict and Evaluate

In [None]:
# Apply the already-fitted TF-IDF weights to the validation corpus. The transformer
# returns a list of sparse [(term_id, weight), ...] documents, so convert it to a
# documents x terms SciPy sparse matrix (corpus2csc yields terms x documents).
# num_terms is the training dictionary size so the width matches the training features.
X_test = gensim.matutils.corpus2csc(
    tfidf.transform(val_mm_corpus),
    num_terms=len(train_dictionary),
).T

In [None]:
# predict() takes only the feature matrix; passing the true labels as a second
# positional argument (as the original did) raises a TypeError.
preds = clf.predict(X_test)

In [None]:
# Score the predictions with the project helper from models.models.
# NOTE(review): y_test is a single-column DataFrame here, not a 1-D label vector --
# confirm evaluate_model accepts that, or pass y_test['hyperpartisan'] instead.
evaluate_model(y_test, preds)

In [None]:
# Plot the confusion matrix with the project helper from models.plot.
# NOTE(review): y_test is a single-column DataFrame here, not a 1-D label vector --
# confirm plot_confusion_matrix accepts that, or pass y_test['hyperpartisan'] instead.
plot_confusion_matrix(y_test, preds)