In [1]:
import pandas as pd
import numpy as np
import pickle

train = pd.read_csv('../data/train.csv')
dev = pd.read_csv('../data/dev.csv')
test = pd.read_csv('../data/test_no_label.csv')
train_dev = train.append(dev)

In [2]:
# all of this code is taken from other notebooks and modified for combining train and dev

In [3]:
train_ex = list(train_dev.ex_id)
test_ex = list(test.ex_id)
combined = pd.concat([train_dev, test], axis=0)

def engineered_df(df):
    rolling_rev = []
    user_dict = {}
    for index,row in df.iterrows():
        curr_date = row['date']
        curr_user = row['user_id']
        
        if(curr_user not in user_dict):
            dates = df.loc[df.user_id == curr_user,'date'].tolist()
            dates.sort()
            user_dict[curr_user] = dates
        index = user_dict[curr_user].index(curr_date)
        
        rolling_rev.append(index)
        
    df['reviewsToDate'] = rolling_rev
    return df

combined = engineered_df(combined)

train_dev = combined[combined['ex_id'].isin(train_ex)]
test = combined[combined['ex_id'].isin(test_ex)]

In [4]:
def downsample(df, pct_pos):
    ''' 
    Borrowed from earlier project: https://github.com/kelseymarkey/
    cook-county-mental-health-prediction/blob/master/Final_Data_Prep.py

    takes in df and a percentage from 1 to 50
    samples all label==1 cases, then samples from label==0 cases 
    until downsampled_df has pct_pos % positive cases, returns indices.
    '''
    # split into df by label
    label_1 = df[df['label'] == 1]
    label_0 = df[df['label'] == 0]

    #count number of pos
    count_label_1 = len(label_1)

    #compute number of negative cases to sample
    num_label_0 = count_label_1 * int(round((100 - pct_pos) / pct_pos))

    #sample from negative cases
    label_0_sample = label_0.sample(n=num_label_0, random_state=22)

    #append sampled negative cases to all positive cases
    downsampled_df = label_1.append(label_0_sample)

    return list(downsampled_df.index)

In [5]:
downsampled_idx_train = downsample(train_dev, 50)

In [6]:
train_dev = train_dev.loc[downsampled_idx_train]

In [7]:
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class StemmedDict(dict):
    def __missing__(self, key):
        res = self[key] = port.stem(key)
        return res

stemmed = StemmedDict()
port = PorterStemmer()
analyzer_2 = CountVectorizer(stop_words='english',
                             ngram_range=(1, 2)).build_analyzer()
analyzer_3 = CountVectorizer(stop_words='english',
                             ngram_range=(1, 3)).build_analyzer()

def stem_words_2(doc):
    return [' '.join([stemmed[word] for word in ngram.split()])
            for ngram in analyzer_2(doc)]

def stem_words_3(doc):
    return [' '.join([stemmed[word] for word in ngram.split()])
            for ngram in analyzer_3(doc)]

In [8]:
tf_2 = TfidfVectorizer(analyzer=stem_words_2)
tf_3 = TfidfVectorizer(analyzer=stem_words_3)

In [9]:
tf_2.fit(train_dev['review'])
tf_3.fit(train_dev['review'])

TfidfVectorizer(analyzer=<function stem_words_3 at 0x000001890D8CA950>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [10]:
traindev_transformed_2 = tf_2.transform(train_dev['review'])
print(f'finished tf_2')

with open(f'../data/traindev_tfidf_vectorized_2.pckl', 'wb') as f:
    pickle.dump(traindev_transformed_2, f)
    
traindev_transformed_3 = tf_3.transform(train_dev['review'])
print(f'finished tf_3')

with open(f'../data/traindev_tfidf_vectorized_3.pckl', 'wb') as f:
    pickle.dump(traindev_transformed_3, f)

finished tf_2
finished tf_3


In [11]:
num_exclam = pd.Series(train_dev['review'].str.count('!'))

num_caps = (train_dev['review'].str.extractall(r'(\b[A-Z]{2,}\b)') # extract all capitalized words with len >= 2
                               .groupby(level=0).size()                         # count by each index
                               .reindex(train_dev['review'].index, fill_value=0))     # fill the missing with 0

In [12]:
train_dev_rating = train_dev['rating']

In [13]:
traindev_transformed_3.shape, train_dev_rating.shape

((73482, 4047741), (73482,))

In [14]:
from scipy import sparse

caps = num_caps.copy()
exclam = num_exclam.copy()
rev_counts = train_dev['reviewsToDate'].copy()
ratings = train_dev_rating.copy()

caps = caps.values.reshape(-1, 1)
exclam = exclam.values.reshape(-1, 1)
rev_counts = rev_counts.values.reshape(-1, 1)
ratings = ratings.values.reshape(-1, 1)

In [15]:
traindev_transformed_2.shape, caps.shape, exclam.shape, rev_counts.shape, ratings.shape, traindev_transformed_3.shape

((73482, 1240492),
 (73482, 1),
 (73482, 1),
 (73482, 1),
 (73482, 1),
 (73482, 4047741))

In [16]:
full_data_2 = sparse.hstack((traindev_transformed_2,
                             sparse.csr_matrix(caps),
                             sparse.csr_matrix(exclam),
                             sparse.csr_matrix(rev_counts),
                             sparse.csr_matrix(ratings),
                             ))

full_data_3 = sparse.hstack((traindev_transformed_3,
                             sparse.csr_matrix(caps),
                             sparse.csr_matrix(exclam),
                             sparse.csr_matrix(rev_counts),
                             sparse.csr_matrix(ratings),
                             ))

# for some reason, this isn't an actual CSR matrix...
full_data_2 = full_data_2.tocsr()
full_data_3 = full_data_3.tocsr()

sums = (traindev_transformed_2 > 0).sum(axis=0)
lim = sums.mean()
print(np.asarray(sums > lim)[0].sum())
sliced_data = full_data_2[:, np.asarray(sums > lim)[0]]

with open(f'../data/traindev_tfidf_subsampled_data_3.pckl', 'wb') as f:
    pickle.dump(full_data_3, f)

with open(f'../data/traindev_tfidf_subsampled_data_sliced_2.pckl', 'wb') as f:
    pickle.dump(sliced_data, f)

98489


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB

with open('../data/best-lr-tfidf.pckl', 'rb') as f:
    lr = eval(pickle.load(f).__str__())

with open('../data/best_svm_tfidf.pckl', 'rb') as f:
    svm = eval(pickle.load(f).__str__())

with open('../data/best_mnb_tfidf.pckl', 'rb') as f:
    mnb = eval(pickle.load(f).__str__())

In [18]:
full_data_3.shape, sliced_data.shape

((73482, 4047745), (73482, 98489))

In [19]:
lr.fit(sliced_data, train_dev['label'])

LogisticRegression(C=0.49500509740190507, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
svm.fit(full_data_3, train_dev['label'])



CalibratedClassifierCV(base_estimator=LinearSVC(C=0.020329421573461483,
                                                class_weight=None, dual=True,
                                                fit_intercept=True,
                                                intercept_scaling=1,
                                                loss='squared_hinge',
                                                max_iter=1000,
                                                multi_class='ovr', penalty='l2',
                                                random_state=None, tol=0.0001,
                                                verbose=0),
                       cv=None, method='sigmoid')

In [21]:
mnb.fit(full_data_3, train_dev['label'])

MultinomialNB(alpha=0.17693089816649998, class_prior=None, fit_prior=True)

In [22]:
with open('../data/best_lr_retrained_tfidf.pckl', 'wb') as f:
    pickle.dump(lr, f)

with open('../data/best_svm_retrained_tfidf.pckl', 'wb') as f:
    pickle.dump(svm, f)

with open('../data/best_mnb_retrained_tfidf.pckl', 'wb') as f:
    pickle.dump(mnb, f)

In [23]:
num_exclam_test = pd.Series(test['review'].str.count('!'))

num_caps_test = (test['review'].str.extractall(r'(\b[A-Z]{2,}\b)') # extract all capitalized words with len >= 2
                               .groupby(level=0).size()                         # count by each index
                               .reindex(test['review'].index, fill_value=0))     # fill the missing with 0

In [24]:
test_rating = test['rating']

In [26]:
test_transformed_2 = tf_2.transform(test['review'])
print(f'finished tf_2')

with open(f'../data/test_tfidf_vectorized_2.pckl', 'wb') as f:
    pickle.dump(test_transformed_2, f)
    
test_transformed_3 = tf_3.transform(test['review'])
print(f'finished tf_3')

with open(f'../data/test_tfidf_vectorized_3.pckl', 'wb') as f:
    pickle.dump(test_transformed_3, f)

finished tf_2
finished tf_3


In [27]:
caps = num_caps_test.copy()
exclam = num_exclam_test.copy()
rev_counts = test['reviewsToDate'].copy()
ratings = test_rating.copy()

caps = caps.values.reshape(-1, 1)
exclam = exclam.values.reshape(-1, 1)
rev_counts = rev_counts.values.reshape(-1, 1)
ratings = ratings.values.reshape(-1, 1)

In [28]:
test_transformed_2.shape, caps.shape, exclam.shape, rev_counts.shape, ratings.shape, test_transformed_3.shape

((72165, 1240492),
 (72165, 1),
 (72165, 1),
 (72165, 1),
 (72165, 1),
 (72165, 4047741))

In [29]:
full_data_2 = sparse.hstack((test_transformed_2,
                             sparse.csr_matrix(caps),
                             sparse.csr_matrix(exclam),
                             sparse.csr_matrix(rev_counts),
                             sparse.csr_matrix(ratings),
                             ))

full_data_3 = sparse.hstack((test_transformed_3,
                             sparse.csr_matrix(caps),
                             sparse.csr_matrix(exclam),
                             sparse.csr_matrix(rev_counts),
                             sparse.csr_matrix(ratings),
                             ))

# for some reason, this isn't an actual CSR matrix...
full_data_2 = full_data_2.tocsr()
full_data_3 = full_data_3.tocsr()

sums = (traindev_transformed_2 > 0).sum(axis=0)
lim = sums.mean()
print(np.asarray(sums > lim)[0].sum())
sliced_data = full_data_2[:, np.asarray(sums > lim)[0]]

with open(f'../data/test_tfidf_subsampled_data_3.pckl', 'wb') as f:
    pickle.dump(full_data_3, f)

with open(f'../data/test_tfidf_subsampled_data_sliced_2.pckl', 'wb') as f:
    pickle.dump(sliced_data, f)

98489
