In [2]:
import pandas as pd
import numpy as np
import pickle

# Read the three dataset splits from disk in one step.
train, dev, test = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/dev.csv'),
    pd.read_csv('../data/test_no_label.csv'),
)

In [None]:
# NOTE: the code below is adapted from the other notebooks and modified to process the 'test' split instead of 'dev'.

In [3]:
# Number of '!' characters in each test review.
num_exclam_test = pd.Series(test['review'].str.count('!'))

# Number of all-caps words (two or more letters) in each test review.
# `extractall` yields one row per match, so counting rows per original index
# label gives the per-review total; reviews with no match are filled with 0.
_caps_matches = test['review'].str.extractall(r'(\b[A-Z]{2,}\b)')
_caps_per_review = _caps_matches.groupby(level=0).size()
num_caps_test = _caps_per_review.reindex(test['review'].index, fill_value=0)

In [4]:
# Remember which ex_id values belong to which split so the combined frame
# can be split back apart later.
train_ex, dev_ex, test_ex = (list(split.ex_id) for split in (train, dev, test))

# Stack the three splits row-wise (no `ignore_index`, so each split's own
# index labels are carried over unchanged).
combined = pd.concat([train, dev, test])

def engineered_df(df):
    """Add a ``reviewsToDate`` column ranking each row's date within its user.

    For every row the value is the position of that row's ``date`` in the
    sorted list of all dates belonging to the same ``user_id`` (position of the
    first occurrence when dates tie), i.e. the number of that user's rows with
    a strictly smaller date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``date`` columns. Dates are compared with
        their natural ordering -- assumes that ordering is chronological
        (e.g. ISO-formatted strings or datetimes); TODO confirm.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new column.
    """
    from bisect import bisect_left
    from collections import defaultdict

    users = df['user_id'].tolist()
    dates = df['date'].tolist()

    # One pass to bucket dates per user instead of re-filtering the whole
    # frame with a boolean mask for every distinct user.
    dates_by_user = defaultdict(list)
    for user, date in zip(users, dates):
        dates_by_user[user].append(date)
    for user_dates in dates_by_user.values():
        user_dates.sort()

    # bisect_left on a sorted list returns the index of the first occurrence
    # of a value that is present -- the same answer as list.index(), but in
    # O(log n) per row.
    df['reviewsToDate'] = [bisect_left(dates_by_user[user], date)
                           for user, date in zip(users, dates)]
    return df

# Compute `reviewsToDate` over all splits at once.
combined = engineered_df(combined)

# Split the enriched frame back into train / dev / test by ex_id membership.
train, dev, test = (
    combined.loc[combined['ex_id'].isin(ex_ids)]
    for ex_ids in (train_ex, dev_ex, test_ex)
)

In [10]:
# Rating column of the test split; appended as an extra feature column below.
test_rating = test.loc[:, 'rating']

In [8]:
# Load the pickled index labels used to subsample the training split.
with open('../data/idx_train.pckl', 'rb') as idx_file:
    indices = pickle.load(idx_file)

In [9]:
# Restrict the training split to the rows whose index labels are listed in
# `indices` (loaded from '../data/idx_train.pckl').
train = train.loc[indices]

In [12]:
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class StemmedDict(dict):
    """Dictionary that memoizes word stems.

    Looking up a word that is not stored yet stems it with the module-level
    ``port`` stemmer, stores the result under that word and returns it.
    """

    def __missing__(self, key):
        stem = port.stem(key)
        self[key] = stem
        return stem

# Shared stemmer and the memo cache that `StemmedDict` fills from it.
port = PorterStemmer()
stemmed = StemmedDict()

# Analyzer callable from a CountVectorizer configured with English stop words
# and an n-gram range of (1, 2).
analyzer = CountVectorizer(ngram_range=(1, 2), stop_words='english').build_analyzer()

def stem_words(doc):
    """Return the n-grams of ``doc`` with every word replaced by its stem.

    ``doc`` is split into n-grams by the module-level ``analyzer``; each n-gram
    is split on whitespace, every word is looked up in ``stemmed`` and the
    stems are joined back with single spaces.
    """
    stemmed_ngrams = []
    for ngram in analyzer(doc):
        stems = (stemmed[token] for token in ngram.split())
        stemmed_ngrams.append(' '.join(stems))
    return stemmed_ngrams

In [13]:
# Registry of vectorizers to fit and apply; the key is reused in log messages
# and output file names.
vectorizers = {'tfidf': TfidfVectorizer(analyzer=stem_words)}
tf = vectorizers['tfidf']

In [14]:
# Fit every registered vectorizer on the training reviews.
# Only the fitted state is needed here (the transformed training matrix was
# discarded immediately), so `fit` is used instead of `fit_transform`.
for name, vectorizer in vectorizers.items():
    vectorizer.fit(train['review'])
    print(f'finished {name}')

finished tfidf


In [15]:
# Apply each fitted vectorizer to the test reviews and persist the result.
# `test_transformed` is deliberately left bound after the loop; it is read
# again further down when the final feature matrix is assembled.
for name, vectorizer in vectorizers.items():
    test_transformed = vectorizer.transform(test['review'])
    print(f'finished {name}')

    out_path = f'../data/test_{name}_vectorized.pckl'
    with open(out_path, 'wb') as out_file:
        pickle.dump(test_transformed, out_file)

finished tfidf


In [19]:
# Load the pickled training feature matrix; it is used below to decide which
# columns of the test matrix to keep.
with open(f'../data/train_tfidf_subsampled_data.pckl', 'rb') as train_file:
    train_vectorized = pickle.load(train_file)

In [22]:
from scipy import sparse

# Assemble the final test feature matrix: the vectorized text columns followed
# by four single-column features (all-caps word count, '!' count,
# reviewsToDate, rating), each reshaped to a column vector.
vectorized_data = test_transformed
caps = num_caps_test.values.reshape(-1, 1)
exclam = num_exclam_test.values.reshape(-1, 1)
rev_counts = test['reviewsToDate'].values.reshape(-1, 1)
ratings = test_rating.values.reshape(-1, 1)

# `sparse.hstack` returns a COO matrix unless `format` is given, so request
# CSR directly instead of converting afterwards (CSR is needed for the column
# slicing below).
full_data = sparse.hstack((vectorized_data,
                           sparse.csr_matrix(caps),
                           sparse.csr_matrix(exclam),
                           sparse.csr_matrix(rev_counts),
                           sparse.csr_matrix(ratings),
                           ),
                          format='csr')

# Column selection: keep a column only if the number of training rows with a
# positive value in it is above the mean of that count over all columns.
# NOTE(review): this assumes `train_vectorized` has exactly the same column
# layout as `full_data` (same vocabulary plus the same four extra columns) --
# TODO confirm against the notebook that produced the training pickle.
sums = (train_vectorized > 0).sum(axis=0)
lim = sums.mean()
keep_cols = np.asarray(sums > lim).ravel()  # flat boolean mask, computed once
print(keep_cols.sum())
sliced_data = full_data[:, keep_cols]

with open('../data/test_tfidf_subsampled_data_sliced.pckl', 'wb') as f:
    pickle.dump(sliced_data, f)

83235
