In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the training data from ../data/train.csv into a DataFrame.
train = pd.read_csv('../data/train.csv')

In [3]:
# Load the dev data from ../data/dev.csv into a DataFrame.
# NOTE(review): presumably the held-out development/validation split — confirm.
dev = pd.read_csv('../data/dev.csv')

In [5]:
# Show the full text of the first five reviews as a plain list
# (the DataFrame repr truncates long strings).
train['review'].head().tolist()

["The food at snack is a selection of popular Greek dishes. The appetizer tray is good as is the Greek salad. We were underwhelmed with the main courses. There are 4-5 tables here so it's sometimes hard to get seated.",
 "This little place in Soho is wonderful. I had a lamb sandwich and a glass of wine. The price shocked me for how small the serving was, but then again, this is Soho. The staff can be a little snotty and rude, but the food is great, just don't expect world-class service.",
 'ordered lunch for 15 from Snack last Friday. \xa0On time, nothing missing and the food was great. \xa0I have added it to the regular company lunch list, as everyone enjoyed their meal.',
 "This is a beautiful quaint little restaurant on a pretty street. \xa0If you're strolling through soho around lunchtime, this would be a great place to stop for a bite. I heard the reviews about the lamb sandwich, so I had to try it. I'm very happy that i did. Every single ingredient - from the bread to the onions 

In [6]:
# Preview the first five rows to check which columns the training frame has.
train.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...


In [4]:
# Read the pickled row selection for the training frame.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# artifacts produced by this project.
with open('../data/idx_train.pckl', 'rb') as index_file:
    indices = pickle.load(index_file)

In [5]:
# Keep only the rows selected by `indices` (loaded from idx_train.pckl).
# NOTE(review): `.loc` selects by index label (or boolean mask); presumably
# `indices` holds row labels of the raw CSV — confirm. This rebinds `train`,
# so the unfiltered frame is no longer available after this cell.
train = train.loc[indices]

In [6]:
from nltk.stem import PorterStemmer

class StemmedDict(dict):
    """Dictionary that lazily caches stems: looking up a word that is not yet
    stored stems it with the module-level `port` stemmer and remembers the
    result, so each distinct word is stemmed only once."""

    def __missing__(self, key):
        """Stem `key`, store the stem under `key`, and return it."""
        stem = port.stem(key)
        self[key] = stem
        return stem

# Shared stemmer and the word -> stem cache that StemmedDict fills on demand.
port = PorterStemmer()
stemmed = StemmedDict()

# Reuse scikit-learn's own preprocessing/tokenisation: English stop-word
# removal plus unigram and bigram generation.
analyzer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
).build_analyzer()

def stem_words(doc):
    """Return the n-grams of `doc` with every word replaced by its stem.

    `doc` is run through the module-level `analyzer`; each n-gram it yields
    is split on whitespace, every word is looked up in the `stemmed` cache,
    and the stems are joined back together with single spaces.
    """
    stemmed_ngrams = []
    for ngram in analyzer(doc):
        stems = [stemmed[token] for token in ngram.split()]
        stemmed_ngrams.append(' '.join(stems))
    return stemmed_ngrams

In [7]:
# Size of the hashed feature space. Per the original note: there are at least
# 13m n-grams, so 2^20 buckets would give a lot of collisions.
n_feature_hashes = 2 ** 23

# Vocabulary-based vectorizers, all sharing the stemming analyzer.
cv = CountVectorizer(analyzer=stem_words)
bn = CountVectorizer(binary=True, analyzer=stem_words)  # binary version
tf = TfidfVectorizer(analyzer=stem_words)

# Hashing variants are constructed but not registered in `vectorizers`,
# so the loops that iterate over `vectorizers` skip them.
hs = HashingVectorizer(n_features=n_feature_hashes,
                       analyzer=stem_words)
hs_bn = HashingVectorizer(n_features=n_feature_hashes,
                          analyzer=stem_words,
                          binary=True)

# Insertion order (count, tfidf, binary) is the processing order.
vectorizers = dict(count=cv, tfidf=tf, binary=bn)

In [None]:
# Fit every registered vectorizer on the training reviews and pickle the
# transformed output to ../data/train_<name>_vectorized.pckl.
for name, vectorizer in vectorizers.items():
    train_features = vectorizer.fit_transform(train['review'])
    print(f'finished {name}')

    train_out_path = f'../data/train_{name}_vectorized.pckl'
    with open(train_out_path, 'wb') as train_out_file:
        pickle.dump(train_features, train_out_file)

    # Drop the reference before the next iteration starts.
    del train_features

finished count
finished tfidf


In [None]:
# Apply every registered vectorizer to the dev reviews (transform only, no
# refitting) and pickle the output to ../data/dev_<name>_vectorized.pckl.
for name, vectorizer in vectorizers.items():
    dev_features = vectorizer.transform(dev['review'])
    print(f'finished {name}')

    dev_out_path = f'../data/dev_{name}_vectorized.pckl'
    with open(dev_out_path, 'wb') as dev_out_file:
        pickle.dump(dev_features, dev_out_file)

    # Drop the reference before the next iteration starts.
    del dev_features

In [None]:
# Pickle the `label` column of each split next to the vectorized features.
label_targets = (
    ('../data/train_labels.pckl', train),
    ('../data/dev_labels.pckl', dev),
)
for label_path, frame in label_targets:
    with open(label_path, 'wb') as label_file:
        pickle.dump(frame['label'], label_file)

In [None]:
# Persisting the fitted vectorizers is opt-in: the pickle ends up being a
# ~2 GB file. The output file is only opened when saving is enabled, because
# opening it in 'wb' mode unconditionally would create it (or truncate an
# existing good copy) to zero bytes, which pickle.load cannot read back.
SAVE_VECTORIZERS = False

if SAVE_VECTORIZERS:
    with open('../data/vectorizers.pckl', 'wb') as f:
        pickle.dump(vectorizers, f)