In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import pandas as pd
import numpy as np
import pickle

In [2]:
# Training split; later cells read its 'review' and 'label' columns.
train_csv_path = '../data/train.csv'
train = pd.read_csv(train_csv_path)

In [3]:
# Development split; later cells read its 'review' and 'label' columns.
dev_csv_path = '../data/dev.csv'
dev = pd.read_csv(dev_csv_path)

In [None]:
# Load the pickled row selection used to subset `train` in the next cell.
# pickle.load can execute arbitrary code, so only point this at a trusted file.
idx_path = '../data/idx_train.pckl'
with open(idx_path, 'rb') as idx_file:
    indices = pickle.load(idx_file)

In [None]:
# Restrict `train` to the rows selected by `indices` (via .loc; presumably a
# collection of index labels -- TODO confirm against how idx_train.pckl is built).
# NOTE(review): this cell and the one loading `indices` have no execution count
# in the saved notebook, so the saved outputs may have been produced without
# this subsetting -- confirm before relying on the persisted matrices.
train = train.loc[indices]

In [4]:
from nltk.stem import PorterStemmer


class StemmedDict(dict):
    """Memoising word -> stem mapping.

    Looking up a word that is not stored yet stems it with the module-level
    `port` stemmer, stores the result under that word and returns it.
    """

    def __missing__(self, key):
        # `port` is resolved at lookup time, so it only has to exist before
        # the first missing key is requested.
        stem = port.stem(key)
        self[key] = stem
        return stem


port = PorterStemmer()
stemmed = StemmedDict()

# Word n-gram analyzer (unigrams to trigrams) with English stop words removed.
analyzer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
).build_analyzer()


def stem_words(doc):
    """Return the n-grams of `doc` with every word replaced by its stem.

    Each n-gram produced by `analyzer` is split on whitespace, every word is
    looked up in the `stemmed` cache, and the stems are re-joined with single
    spaces. The result is a list with one string per n-gram.
    """
    stemmed_ngrams = []
    for ngram in analyzer(doc):
        stems = [stemmed[word] for word in ngram.split()]
        stemmed_ngrams.append(' '.join(stems))
    return stemmed_ngrams

In [5]:
# Number of hash buckets for the hashing vectorizers. Per the original note the
# corpus has at least 13m ngrams, so 2^20 buckets would give a lot of collisions.
n_feature_hashes = 2 ** 23

# Vocabulary-based vectorizers, all sharing the stemming analyzer.
cv = CountVectorizer(analyzer=stem_words)
bn = CountVectorizer(binary=True, analyzer=stem_words)  # binary version
tf = TfidfVectorizer(analyzer=stem_words)

# Hashing vectorizers.
# NOTE(review): HashingVectorizer normalises rows by default (norm='l2' per the
# scikit-learn docs), so 'hashing_binary' is presumably not strictly 0/1 --
# confirm this is intended.
hs = HashingVectorizer(n_features=n_feature_hashes, analyzer=stem_words)
hs_bn = HashingVectorizer(
    n_features=n_feature_hashes,
    analyzer=stem_words,
    binary=True,
)

# Insertion order is the order in which the later cells process the vectorizers.
vectorizers = dict(
    count=cv,
    tfidf=tf,
    hashing=hs,
    binary=bn,
    hashing_binary=hs_bn,
)

In [6]:
# Fit every vectorizer on the training reviews and pickle each resulting matrix
# to ../data/train_<name>_vectorized.pckl.
train_reviews = train['review']
for name, vectorizer in vectorizers.items():
    train_transformed = vectorizer.fit_transform(train_reviews)
    print(f'finished {name}')

    out_path = f'../data/train_{name}_vectorized.pckl'
    with open(out_path, 'wb') as f:
        pickle.dump(train_transformed, f)

    # Drop the reference before the next vectorizer is fitted.
    del train_transformed

finished count
finished tfidf
finished hashing
finished binary
finished hashing_binary


In [7]:
# Apply every vectorizer to the dev reviews (transform only, no fitting) and
# pickle each resulting matrix to ../data/dev_<name>_vectorized.pckl.
dev_reviews = dev['review']
for name, vectorizer in vectorizers.items():
    dev_transformed = vectorizer.transform(dev_reviews)
    print(f'finished {name}')

    out_path = f'../data/dev_{name}_vectorized.pckl'
    with open(out_path, 'wb') as f:
        pickle.dump(dev_transformed, f)

    # Drop the reference before the next vectorizer runs.
    del dev_transformed

finished count
finished tfidf
finished hashing
finished binary
finished hashing_binary


In [8]:
# Persist the 'label' column of each split next to the vectorized matrices.
label_targets = (
    ('../data/train_labels.pckl', train),
    ('../data/dev_labels.pckl', dev),
)
for labels_path, frame in label_targets:
    with open(labels_path, 'wb') as f:
        pickle.dump(frame['label'], f)

In [9]:
# Persisting the fitted vectorizers is opt-in: the pickle ends up being a ~2gb file.
# The file is only opened when something is actually written. Opening it in
# 'wb' mode and writing nothing would create (or truncate an existing)
# ../data/vectorizers.pckl as a 0-byte file, which pickle.load cannot read.
SAVE_VECTORIZERS = False

if SAVE_VECTORIZERS:
    with open('../data/vectorizers.pckl', 'wb') as f:
        pickle.dump(vectorizers, f)