After observing poor model performance across multiple models, we sought to investigate whether there were differences between the training and dev set that caused this poor performance. To test this we took the training set, split it 80/20, vectorized train and dev separately, and downsampled train (adjusting the size of dev appropriately). 

This notebook outlines those steps, editing and executing all pre-processing steps from each of the individual notebooks in order to get a train and "dev" set that each come from the given training set. The last cell fits a Naive Bayes model and shows that, even when the dev set is split off from the training set, there are no significant differences in model performance. This means that the poor model results are not caused by differences between the training and dev sets; more likely, this is simply a difficult classification problem.

In [1]:
# Pre_vectorization_feature_engineering.ipynb

import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split

# Load the raw training reviews.
train = pd.read_csv('../data/train.csv')

# Per-review count of '!' characters.
num_exclam_train = pd.Series(train['review'].str.count('!'))

# Per-review count of fully capitalized words of length >= 2.
# extractall() yields one row per regex match, keyed at level 0 by the review's index.
caps_matches = train['review'].str.extractall(r'(\b[A-Z]{2,}\b)')
caps_per_review = caps_matches.groupby(level=0).size()
# Reviews with no match are absent from the grouped counts; reindex fills them with 0.
num_caps_train = caps_per_review.reindex(train['review'].index, fill_value=0)

# Persist both engineered features (exclamation counts first, then caps counts).
for path, feature in (('../data/train_num_exclam.pckl', num_exclam_train),
                      ('../data/train_num_caps.pckl', num_caps_train)):
    with open(path, 'wb') as f:
        pickle.dump(feature, f)

In [2]:
# FeatureTransformation.ipynb

# Re-read the raw training CSV into a fresh DataFrame for this cell.
train = pd.read_csv('../data/train.csv')

def engineered_df(df):
    '''
    Adds a 'reviewsToDate' column giving each review's rank among its author's dates.

    For every row, the value is the zero-based position of the row's 'date' in the
    sorted list of all dates that share the row's 'user_id'. Rows of one user with
    the same date all receive the position of the first occurrence, i.e. the number
    of that user's reviews with a strictly earlier date.

    The column is added to df in place, and df is also returned.

    @param df: DataFrame with 'user_id' and 'date' columns; dates must be mutually
               comparable so they can be sorted
    @return: the same DataFrame object, now carrying 'reviewsToDate'
    '''
    from bisect import bisect_left
    from collections import defaultdict

    # Collect every user's dates in a single pass. The previous version filtered
    # the whole frame once per distinct user, which is quadratic for many users.
    dates_by_user = defaultdict(list)
    for user, date in zip(df['user_id'], df['date']):
        dates_by_user[user].append(date)
    for dates in dates_by_user.values():
        dates.sort()

    # On a sorted list, bisect_left returns the index of the first equal element,
    # which is exactly what list.index() returns for a value known to be present.
    df['reviewsToDate'] = [bisect_left(dates_by_user[user], date)
                           for user, date in zip(df['user_id'], df['date'])]
    return df

# Compute the per-user review-order feature on the full training data.
train_engineered = engineered_df(train)

# Persist only the newly added column.
reviews_to_date_path = '../data/train_reviewsToDate.pckl'
with open(reviews_to_date_path, 'wb') as f:
    pickle.dump(train_engineered['reviewsToDate'], f)

In [3]:
# ratings.ipynb
# Re-read the raw training CSV and persist its 'rating' column as a pickled Series.
train = pd.read_csv('../data/train.csv')

with open('../data/train_ratings.pckl', 'wb') as f:
    pickle.dump(train['rating'], f)

In [4]:
# Splitting
# Re-read the raw training CSV; 20% of its rows become the stand-in "dev" set.
train = pd.read_csv('../data/train.csv')

# Seeded random 80/20 row split.
# NOTE(review): no `stratify=` is passed, so the label proportions of the two
# parts are not forced to match — confirm this is intended.
train_split, dev = train_test_split(train, train_size=0.8, random_state=22)

In [5]:
# Sanity check: row counts of the 80% and 20% partitions.
print(len(train_split))
print(len(dev))

200699
50175


In [6]:
# downsample.ipynb

def downsample(df, pct_pos):
    '''
    Borrowed from earlier project: https://github.com/kelseymarkey/
    cook-county-mental-health-prediction/blob/master/Final_Data_Prep.py

    Keeps every label==1 row and randomly samples label==0 rows (fixed seed 22)
    so that positives make up approximately pct_pos percent of the result.

    @param df: DataFrame with a binary 'label' column
    @param pct_pos: target percentage of positive rows, 0 < pct_pos <= 100
    @return: list of index labels of the kept rows: all positives first,
             followed by the sampled negatives
    @raises ValueError: if pct_pos is outside (0, 100], or if df holds fewer
                        label==0 rows than the target proportion requires
    '''
    if not 0 < pct_pos <= 100:
        raise ValueError(f'pct_pos must be in (0, 100], got {pct_pos}')

    # split into df by label
    label_1 = df[df['label'] == 1]
    label_0 = df[df['label'] == 0]

    # count number of pos
    count_label_1 = len(label_1)

    # compute number of negative cases to sample.
    # Round the final count, not the ratio: rounding (100 - pct_pos) / pct_pos
    # first turns e.g. pct_pos=30 (ratio 2.33) into a 2:1 sample, i.e. 33% positives.
    num_label_0 = int(round(count_label_1 * (100 - pct_pos) / pct_pos))

    # sample from negative cases (without replacement)
    label_0_sample = label_0.sample(n=num_label_0, random_state=22)

    # append sampled negative cases to all positive cases.
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    downsampled_df = pd.concat([label_1, label_0_sample])

    return list(downsampled_df.index)

# Index labels of the 80% training partition after downsampling to 50% positives.
downsampled_idx_train = downsample(train_split, 50)

# Number of rows kept after downsampling.
print(len(downsampled_idx_train))

41308


In [7]:
# vectorize-count.ipynb

# Select the downsampled rows from the training partition by index label.
train_ds = train_split.loc[downsampled_idx_train]
print(len(train_ds))
# Class counts of the downsampled training set (last expression: shown as cell output).
train_ds['label'].value_counts()

41308


1    20654
0    20654
Name: label, dtype: int64

In [8]:
# Shrink dev so that it is 20% of the combined data now that train has been
# downsampled (and is much smaller): 20% of the total means 1/4 the size of train_ds.

n = int(0.25 * len(train_ds))
print(n)
# Seeded uniform row sample. Unlike train_ds, dev is not passed through
# downsample(), so no class balancing is applied here.
dev_resized = dev.sample(n = n, random_state = 22)

10327


In [9]:
# vectorize-count.ipynb

from nltk.stem import PorterStemmer

class StemmedDict(dict):
    '''
    dict that fills itself on demand: looking up a word that is not yet stored
    stems it with the module-level `port` stemmer, stores the stem under that
    word, and returns it.
    '''
    def __missing__(self, key):
        # `port` is resolved at call time, so it only has to exist before the first lookup.
        stem = port.stem(key)
        self[key] = stem
        return stem

# Shared word -> stem cache, and the stemmer that StemmedDict.__missing__ calls.
stemmed = StemmedDict()
port = PorterStemmer()
# Analyzer callable of a CountVectorizer configured with English stop words
# and 1- to 3-grams; it is used only to split documents, never fitted.
analyzer = CountVectorizer(stop_words='english',
                           ngram_range=(1, 3)).build_analyzer()

def stem_words(doc):
    '''
    Splits doc into n-grams with the module-level `analyzer` and returns them
    as a list with every word replaced by its stem from the `stemmed` cache.

    @param doc: raw document text
    @return: list of n-gram strings whose words are stemmed and joined by single spaces
    '''
    stemmed_ngrams = []
    for ngram in analyzer(doc):
        stems = [stemmed[token] for token in ngram.split()]
        stemmed_ngrams.append(' '.join(stems))
    return stemmed_ngrams

In [10]:
# vectorize-count.ipynb

# NOTE(review): n_feature_hashes is not referenced by any cell visible in this
# notebook (no HashingVectorizer is instantiated) — confirm it is still needed.
n_feature_hashes = 2 ** 23  # we have at least 13m ngrams so we have a lot of collisions if taking 2^20

# All three vectorizers use stem_words as their analyzer; they differ only in
# how term occurrences are weighted (raw counts, tf-idf, or binary presence).
cv = CountVectorizer(analyzer=stem_words)
tf = TfidfVectorizer(analyzer=stem_words)
bn = CountVectorizer(analyzer=stem_words, binary=True)  # binary version

# Keyed by the short name that is embedded in the output file names below.
vectorizers = {'count': cv,
               'tfidf': tf,
               'binary': bn,
               }

In [11]:
# vectorize-count.ipynb

# Fit each vectorizer on the downsampled training reviews and pickle the
# resulting matrix under a name that includes the vectorizer's key.
for name, vectorizer in vectorizers.items():
    train_transformed = vectorizer.fit_transform(train_ds['review'])
    print(f'finished {name}')

    with open(f'../data/train_{name}_vectorized_80_20.pckl', 'wb') as f:
        pickle.dump(train_transformed, f)
    # Drop the reference before the next iteration builds another matrix.
    del train_transformed

finished count
finished tfidf
finished binary


In [12]:
# vectorize-count.ipynb
# Transform the resized dev reviews with the same vectorizer objects.
# Only transform() is called here, so nothing is re-fitted on dev.
for name, vectorizer in vectorizers.items():
    dev_transformed = vectorizer.transform(dev_resized['review'])
    print(f'finished {name}')

    with open(f'../data/dev_{name}_vectorized_80_20.pckl', 'wb') as f:
        pickle.dump(dev_transformed, f)
    # Drop the reference before the next iteration builds another matrix.
    del dev_transformed

finished count
finished tfidf
finished binary


In [13]:
# vectorize-count.ipynb
# Persist the label Series of both splits, taken from the same frames
# (train_ds / dev_resized) whose reviews were vectorized above.
with open('../data/train_labels_80_20.pckl', 'wb') as f:
    pickle.dump(train_ds['label'], f)
    
with open('../data/dev_labels_80_20.pckl', 'wb') as f:
    pickle.dump(dev_resized['label'], f)

In [14]:
# concat-features.ipynb 
from scipy import sparse

# Index labels of the rows in each split; used to pull the matching rows out of
# the engineered features, which were computed on the full train.csv.
train_indices = list(train_ds.index)
dev_indices = list(dev_resized.index)
indices_by_dataset = {'train': train_indices, 'dev': dev_indices}

vectorizers = ['count', 'tfidf', 'binary']

# The engineered features depend on neither the vectorizer nor the split, so
# load them once here instead of re-reading four pickles on every iteration.
engineered = {}
for feature in ('num_caps', 'num_exclam', 'reviewsToDate', 'ratings'):
    with open(f'../data/train_{feature}.pckl', 'rb') as f:
        engineered[feature] = pickle.load(f)

for vectorizer in vectorizers:
    for dataset, indices in indices_by_dataset.items():
        with open(f'../data/{dataset}_{vectorizer}_vectorized_80_20.pckl', 'rb') as f:
            vectorized_data = pickle.load(f)

        # Select rows by index *label* (.loc) rather than by position, so the
        # rows stay aligned with the split even if the index is not 0..n-1,
        # then shape each feature as a single column.
        caps = engineered['num_caps'].loc[indices].values.reshape(-1, 1)
        exclam = engineered['num_exclam'].loc[indices].values.reshape(-1, 1)
        rev_counts = engineered['reviewsToDate'].loc[indices].values.reshape(-1, 1)
        ratings = engineered['ratings'].loc[indices].values.reshape(-1, 1)

        # more features

        # Column order: vectorized text, caps, exclamations, reviewsToDate, rating.
        full_data = sparse.hstack((vectorized_data,
                                   sparse.csr_matrix(caps),
                                   sparse.csr_matrix(exclam),
                                   sparse.csr_matrix(rev_counts),
                                   sparse.csr_matrix(ratings),
                                   # more features
                                   ))

        # hstack was not given a `format=`, and its result is not CSR here,
        # so convert explicitly before saving.
        full_data = full_data.tocsr()

        with open(f'../data/{dataset}_{vectorizer}_subsampled_data_80_20.pckl', 'wb') as f:
            pickle.dump(full_data, f)

## Modeling- Naive Bayes

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from sklearn.metrics import average_precision_score, roc_auc_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB

# Importing labels written earlier in this notebook (train_ds / dev_resized labels).
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('../data/train_labels_80_20.pckl', 'rb') as f:
    train_labels = pickle.load(f)

with open('../data/dev_labels_80_20.pckl', 'rb') as f:
    dev_labels = pickle.load(f)
    
def get_data(dataset, vectorizer):
    '''
    Loads the pickled feature matrix saved for one dataset/vectorizer pair.
    @param dataset: string specifying dataset, "train","dev",etc
    @param vectorizer: string specifying vectorizer "binary","count",etc
    @return: the object unpickled from
             ../data/{dataset}_{vectorizer}_subsampled_data_80_20.pckl
    '''
    path = f'../data/{dataset}_{vectorizer}_subsampled_data_80_20.pckl'
    with open(path, 'rb') as f:
        features = pickle.load(f)
    return features

In [16]:
# Sanity check: number of labels loaded for each split.
print(len(train_labels))
print(len(dev_labels))

41308
10327


In [17]:
# specify parameters and distributions to sample from:
# the smoothing parameter alpha is drawn log-uniformly from [1e-4, 1]
param_dist = {'alpha': loguniform(1e-4, 1e0)}

vectorizers = ['count', 'tfidf', 'binary']

for vectorizer in vectorizers:
    print('----- ', vectorizer, ' -----')
    # Use dedicated names for the feature matrices so the `train` / `dev`
    # DataFrames from the earlier cells are not overwritten (re-running the
    # dev-resizing cell after this one would otherwise fail).
    X_train = get_data('train', vectorizer)
    X_dev = get_data('dev', vectorizer)

    nb_multi = MultinomialNB()

    # run randomized search; seeded with 22 like every other random step in
    # this notebook, so the sampled alphas (and the scores) are reproducible
    random_search = RandomizedSearchCV(nb_multi,
                                       param_distributions=param_dist,
                                       random_state=22)

    random_search.fit(X_train, train_labels)

    # hard predictions feed precision/recall; column 1 of predict_proba is
    # used as the score for AUC and average precision
    nb_train = random_search.predict(X_train)
    nb_dev = random_search.predict(X_dev)
    nb_train_proba = random_search.predict_proba(X_train)
    nb_dev_proba = random_search.predict_proba(X_dev)

    nb_train_auc = roc_auc_score(train_labels, nb_train_proba[:, 1])
    nb_train_ap = average_precision_score(train_labels, nb_train_proba[:, 1])
    nb_train_recall = recall_score(train_labels, nb_train)
    nb_train_prec = precision_score(train_labels, nb_train)
    nb_dev_auc = roc_auc_score(dev_labels, nb_dev_proba[:, 1])
    nb_dev_ap = average_precision_score(dev_labels, nb_dev_proba[:, 1])
    nb_dev_recall = recall_score(dev_labels, nb_dev)
    nb_dev_prec = precision_score(dev_labels, nb_dev)

    print(f'Train AUC:        {nb_train_auc:.4f}\n'
          f'Train AP:         {nb_train_ap:.4f}\n'
          f'Train Precision:  {nb_train_prec:.4f}\n'
          f'Train Recall:     {nb_train_recall:.4f}\n'
          f'Dev   AUC:        {nb_dev_auc:.4f}\n'
          f'Dev   AP:         {nb_dev_ap:.4f}\n'
          f'Dev   Precision:  {nb_dev_prec:.4f}\n'
          f'Dev   Recall:     {nb_dev_recall:.4f}')

-----  count  -----
Train AUC:        0.9983
Train AP:         0.9984
Train Precision:  0.9829
Train Recall:     0.9851
Dev   AUC:        0.7380
Dev   AP:         0.2198
Dev   Precision:  0.2242
Dev   Recall:     0.4598
-----  tfidf  -----
Train AUC:        0.9632
Train AP:         0.9779
Train Precision:  0.9861
Train Recall:     0.9290
Dev   AUC:        0.7506
Dev   AP:         0.2299
Dev   Precision:  0.2048
Dev   Recall:     0.6405
-----  binary  -----
Train AUC:        0.9998
Train AP:         0.9998
Train Precision:  0.9916
Train Recall:     0.9968
Dev   AUC:        0.6819
Dev   AP:         0.1692
Dev   Precision:  0.1738
Dev   Recall:     0.4866
