In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV as LGCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

# Q1. Read in, split 70/30

In [2]:
#reading in dataset
amazon = pd.read_csv('amazon_need.csv')

#mapping __label__1 to 1 and __label__2 to 0 as instructions
amazon['label'] = amazon.label.map(lambda x: 1 if x == '__label__1' else 0)

x = amazon['review']
y = amazon['label']

#splitting data
x_train, x_val, y_train, y_val = train_test_split(x, y, train_size = .7, random_state = 13)

# Q2. Train random forests on: bag of words, TF-IDF, Word2Vec (avg), Word2Vec (sum)

### Bag of words

In [3]:
count_vector = CountVectorizer(ngram_range = (1,1), stop_words = "english", lowercase = True)

In [4]:
#using fit_transform to fit training reviews & create dict matrix 
train_counts = count_vector.fit_transform(x_train)

#creating DF from dict matrix created above
train_bag = pd.DataFrame(train_counts.toarray(), columns=count_vector.get_feature_names())

#finding word occurence in df
word_counts = train_bag.sum()

#creating variable of only words that have an occurence of at least 20
freq_train_bag = train_bag[word_counts[word_counts >= 20].index]

#standardizing 
freq_train_bag = (freq_train_bag - freq_train_bag.mean()) / freq_train_bag.std()

In [5]:
#using transform on validation reviews, not fit_transform as it was fit on the training data
val_counts = count_vector.transform(x_val)

#similar as above, but for val set 
val_bag = pd.DataFrame(val_counts.toarray(), columns=count_vector.get_feature_names())

#reducing
freq_val_train = val_bag[freq_train_bag.columns]

In [6]:
#setting parameters - smaller than usual ranges b/c computation time was excessive
parameters = {"n_estimators":[100,200,300,400], "min_samples_leaf": [5,10,15,20,25,30], "min_samples_split": [10,20,30,40,50], 'max_depth': [2,4,6,8]}

In [7]:
#creating rforest to plug into randomizedsearchcv. using verbose=10 b/c I like validation that the model is running.
rforest = RandomForestClassifier(random_state=13, verbose=10)

In [8]:
#creating randomizedsearchcv with given params, using rforest.
rforest_bow = RandomizedSearchCV(rforest, parameters, n_jobs=4, n_iter=13, scoring = "roc_auc")

In [9]:
#WILL RUN FOR ~3 MIN!
#training RFmodel on the "frequent" bag of words and their corresponding label.
rforest_bow.fit(freq_train_bag, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.2s remaining:    0.0s


building tree 1 of 300
building tree 2 of 300
building tree 3 of 300
building tree 4 of 300
building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
b

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    6.8s finished


RandomizedSearchCV(estimator=RandomForestClassifier(random_state=13,
                                                    verbose=10),
                   n_iter=13, n_jobs=4,
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'min_samples_leaf': [5, 10, 15, 20, 25,
                                                             30],
                                        'min_samples_split': [10, 20, 30, 40,
                                                              50],
                                        'n_estimators': [100, 200, 300, 400]},
                   scoring='roc_auc')

In [10]:
params = rforest_bow.best_params_
for x, y in params.items():
    print('The best', x, 'parameter for the model above is', y)

The best n_estimators parameter for the model above is 300
The best min_samples_split parameter for the model above is 30
The best min_samples_leaf parameter for the model above is 5
The best max_depth parameter for the model above is 6


In [11]:
#finding how the model's predicted probabilities of the words' labels
y_train_prob = rforest_bow.predict_proba(freq_train_bag)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.2s finished


In [12]:
#Finding AUC of the training set. 
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
tbow_auc = metrics.auc(fpr, tpr)

In [13]:
#Finding predicted probabilities of the validation dataset using model trained on training data.
y_val_prob = rforest_bow.predict_proba(freq_val_train)

#Finding AUC of the validation set. 
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
vbow_auc = metrics.auc(fpr, tpr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.2s finished


In [14]:
print('The BOW training data set had an AUC score  of', tbow_auc, 
      'while the BOW validation data set had an AUC score of', vbow_auc, 'for a difference of:', tbow_auc - vbow_auc)

The BOW training data set had an AUC score  of 0.902240804129517 while the BOW validation data set had an AUC score of 0.7256553455865971 for a difference of: 0.17658545854291996


In [22]:
#Top 30 features
feats = pd.DataFrame()
feats["var"] = freq_train_bag.columns
feats["importance"] = rforest_bow.best_estimator_.feature_importances_
feats = feats.sort_values("importance", ascending = False).reset_index(drop = True)
feats[:30]

Unnamed: 0,var,importance
0,great,0.052642
1,waste,0.044548
2,money,0.037662
3,worst,0.033966
4,love,0.030147
5,boring,0.030025
6,best,0.030007
7,excellent,0.022225
8,bad,0.02167
9,disappointed,0.020243


### TF-IDF

In [23]:
#creating TF-IDF vectorizer using unigram
tf = TfidfVectorizer(ngram_range = (1,1), stop_words = "english", lowercase = True)

In [24]:
#fitting TF-IDF vectorizer on training data reviews
train_tf = tf.fit_transform(x_train)

#taking dict matrix of words/value into an array
train_tf = pd.DataFrame(train_tf.toarray(), columns = tf.get_feature_names())

#again selecting only words that were used at least 20 times
train_tf = train_tf[word_counts[word_counts >= 20].index]

In [25]:
#Same as above, but for validation data, and using transform, not fit_transform. 
val_tf = tf.transform(x_val)
val_tf = pd.DataFrame(val_tf.toarray(), columns = tf.get_feature_names())
val_tf = val_tf[word_counts[word_counts >= 20].index]

In [26]:
#same as done previously, using randomizedsearch on rforest, saved under new variable for my ease to keep best params
#per text mining model
rforest_tf = RandomizedSearchCV(rforest, parameters, n_jobs=4, n_iter= 13, scoring = "roc_auc")

In [27]:
#WILL RUN FOR ~5 MIN!
#fitting rforest model on training frequent words TF with their labels
rforest_tf.fit(train_tf, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


building tree 1 of 400
building tree 2 of 400
building tree 3 of 400


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s


building tree 4 of 400
building tree 5 of 400
building tree 6 of 400


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s


building tree 7 of 400
building tree 8 of 400
building tree 9 of 400


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.8s remaining:    0.0s


building tree 10 of 400
building tree 11 of 400
building tree 12 of 400
building tree 13 of 400
building tree 14 of 400
building tree 15 of 400
building tree 16 of 400
building tree 17 of 400
building tree 18 of 400
building tree 19 of 400
building tree 20 of 400
building tree 21 of 400
building tree 22 of 400
building tree 23 of 400
building tree 24 of 400
building tree 25 of 400
building tree 26 of 400
building tree 27 of 400
building tree 28 of 400
building tree 29 of 400
building tree 30 of 400
building tree 31 of 400
building tree 32 of 400
building tree 33 of 400
building tree 34 of 400
building tree 35 of 400
building tree 36 of 400
building tree 37 of 400
building tree 38 of 400
building tree 39 of 400
building tree 40 of 400
building tree 41 of 400
building tree 42 of 400
building tree 43 of 400
building tree 44 of 400
building tree 45 of 400
building tree 46 of 400
building tree 47 of 400
building tree 48 of 400
building tree 49 of 400
building tree 50 of 400
building tree 51

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   33.2s finished


RandomizedSearchCV(estimator=RandomForestClassifier(random_state=13,
                                                    verbose=10),
                   n_iter=13, n_jobs=4,
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'min_samples_leaf': [5, 10, 15, 20, 25,
                                                             30],
                                        'min_samples_split': [10, 20, 30, 40,
                                                              50],
                                        'n_estimators': [100, 200, 300, 400]},
                   scoring='roc_auc')

In [28]:
params = rforest_tf.best_params_
for x, y in params.items():
    print('The best', x, 'parameter for the model above is', y)

The best n_estimators parameter for the model above is 400
The best min_samples_split parameter for the model above is 10
The best min_samples_leaf parameter for the model above is 5
The best max_depth parameter for the model above is 8


In [29]:
#getting predicted probs for training dataset
y_train_prob = rforest_tf.predict_proba(train_tf)
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
ttf_auc = metrics.auc(fpr, tpr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished


In [30]:
#same as above, for validation set
y_val_prob = rforest_tf.predict_proba(val_tf)
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
vtf_auc = metrics.auc(fpr, tpr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.3s finished


In [31]:
print('The TF-IDF training data set had an AUC score  of', ttf_auc, 
      'while the TF-IDF validation data set had an AUC score of', vtf_auc, 'for a difference of:', ttf_auc - vtf_auc)

The TF-IDF training data set had an AUC score  of 0.9143265353116066 while the TF-IDF validation data set had an AUC score of 0.8990273107611724 for a difference of: 0.015299224550434198


In [32]:
#Top 30 features
feats["var"] = freq_train_bag.columns
feats["importance"] = rforest_tf.best_estimator_.feature_importances_
feats = feats.sort_values("importance", ascending = False).reset_index(drop = True)
feats[:30]

Unnamed: 0,var,importance
0,great,0.060761
1,waste,0.042106
2,money,0.036572
3,best,0.036047
4,worst,0.0294
5,love,0.02837
6,bad,0.025616
7,don,0.023704
8,boring,0.023455
9,terrible,0.020934


### Word2Vec - Average

In [33]:
#Installations needed for using word2vec
#conda install -c anaconda gensim 
#nltk.download("wordnet")
#nltk.download("stopwords")

#was necessary for me b/c I'm using an older python version 
import smart_open
smart_open.open = smart_open.smart_open
from gensim.models import Word2Vec
from tqdm import tqdm
import numpy as np

In [34]:
#using wordnet's lemmatizer: "dogs" -> "dog", "cacti" -> "cactus"
lemmatizer = WordNetLemmatizer()

#Creating function to clean text, using from Professor's notebook with some minimal changes 
def text_cleaner(x):
    x = x.lower()
    #using word_tokenize to separate words & punct into their own tokens
    tokens = nltk.tokenize.word_tokenize(x)
    #for loop - takes all tokens that were made from above & only have alpha characters to lemmatize them.
    lem_alpha_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]
    #for loop - runs through the lemmatized alpha-only characters & only keeps tokens that are not a stop word 
    lem_alpha_tokens = [token for token in lem_alpha_tokens if token not in nltk.corpus.stopwords.words("english")]
    return lem_alpha_tokens

In [36]:
#using map function to take each row of reviews & clean them as per the function above
x_train_clean = x_train.map(text_cleaner)

#same as above, but for validation set reviews
x_val_clean = x_val.map(text_cleaner)

In [37]:
#creating word embedding vectors for the train and val sets

word2vec_train = Word2Vec(x_train_clean, min_count = 5, size = 300, window = 4)
word2vec_val = Word2Vec(x_val_clean, min_count = 5, size = 300, window = 4)

In [38]:
def train_avg_embedding(word2vec):
    #for loop - finding avg embedding of each word; using Prof code
    train_avg_embeddings = []

    #setting for loop to go through the # of rows of cleaned words
    for index in tqdm(range(x_train_clean.shape[0])):
        doc = x_train_clean.iloc[index]

        #assigns words(aliased by index) to their embeddings - skips words not counted for my word2vec model (those with min_count <5)
        embeddings = [word2vec.wv[word] for word in doc if word in word2vec.wv]

        #if no words gave embeddings from model above, create an empty list with same size used in word2vec model
        if embeddings == []:
            train_avg_embeddings.append([np.nan] * 300)

        #if there are embeddings, find the mean of each word(axis=0) and add it to list
        else:
            avg = np.mean(embeddings, axis = 0)
            train_avg_embeddings.append(avg)
    return train_avg_embeddings

            
#same as above, but for validation dataset            
def val_avg_embedding(word2vec):
    val_avg_embeddings = []
    for index in tqdm(range(x_val_clean.shape[0])):
        doc = x_val_clean.iloc[index]
        embeddings = [word2vec.wv[word] for word in doc if word in word2vec.wv]
        if embeddings == []:
            val_avg_embeddings.append([np.nan] * 300)
        else:
            avg = np.mean(embeddings, axis = 0)
            val_avg_embeddings.append(avg)
    return val_avg_embeddings

train_avg_embeddings = train_avg_embedding(word2vec_train)
val_avg_embeddings = val_avg_embedding(word2vec_val)

word2vec_features_train = pd.DataFrame(train_avg_embeddings)
word2vec_features_val = pd.DataFrame(val_avg_embeddings)

100%|██████████| 17500/17500 [00:01<00:00, 8963.92it/s]
100%|██████████| 7500/7500 [00:00<00:00, 9606.61it/s]


In [39]:
#Creating RForest using randomized search as previously done
rforest_w2v = RandomizedSearchCV(rforest, parameters, n_jobs=4, n_iter=13, scoring = "roc_auc")

In [40]:
#Training/fitting model using training word embeddings and their corresponding labels
rforest_w2v.fit(word2vec_features_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 200
building tree 2 of 200


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s


building tree 3 of 200
building tree 4 of 200


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s


building tree 5 of 200
building tree 6 of 200


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.9s remaining:    0.0s


building tree 7 of 200
building tree 8 of 200


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.1s remaining:    0.0s


building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200
building tree 34 of 200
building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
building tree 43 of 200
building tree 44 of 200
building tree 45 of 200
building tree 46 of 200
building tree 47 of 200
building tree 48 of 200
building tree 49 of 200
building tree 50 

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   24.8s finished


RandomizedSearchCV(estimator=RandomForestClassifier(random_state=13,
                                                    verbose=10),
                   n_iter=13, n_jobs=4,
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'min_samples_leaf': [5, 10, 15, 20, 25,
                                                             30],
                                        'min_samples_split': [10, 20, 30, 40,
                                                              50],
                                        'n_estimators': [100, 200, 300, 400]},
                   scoring='roc_auc')

In [41]:
params = rforest_w2v.best_params_
for x, y in params.items():
    print('The best', x, 'parameter for the model above is', y)

The best n_estimators parameter for the model above is 200
The best min_samples_split parameter for the model above is 50
The best min_samples_leaf parameter for the model above is 20
The best max_depth parameter for the model above is 8


In [42]:
#finding how model above would predict labels for training data
y_train_prob = rforest_w2v.predict_proba(word2vec_features_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished


In [43]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
tw2v_avg_auc = metrics.auc(fpr, tpr)

In [44]:
# get predicted probabilities on validation set
y_val_prob = rforest_w2v.predict_proba(word2vec_features_val)

# Get AUC on the validation set
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
vw2v_avg_auc = metrics.auc(fpr, tpr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.1s finished


In [45]:
print('The W2V training data set had an AUC score  of', tw2v_avg_auc, 
      'while the W2V validation data set had an AUC score of', vw2v_avg_auc, 'for a difference of:', tw2v_avg_auc - vw2v_avg_auc)

The W2V training data set had an AUC score  of 0.8829909503399664 while the W2V validation data set had an AUC score of 0.6735071616800817 for a difference of: 0.2094837886598847


In [68]:
#Top 30 features
blah = pd.DataFrame()
blah['var'] = word2vec_features_train.columns
blah['importance'] = rforest_w2v.best_estimator_.feature_importances_
blah = blah.sort_values("importance", ascending = False).reset_index(drop = True)
blah[:30]

Unnamed: 0,var,importance
0,262,0.043867
1,155,0.03897
2,222,0.032065
3,196,0.032027
4,76,0.029327
5,111,0.026743
6,231,0.025766
7,217,0.024426
8,30,0.022051
9,128,0.019337


### Word2Vec - Sum

In [69]:
#Next few cells repeat same process as modeling above, however, instead of finding the avg of the embeddings/word,
#the sum is found for each. 

train_sum_embeddings = []

for index in tqdm(range(x_train_clean.shape[0])):
    doc = x_train_clean.iloc[index]
    embeddings = [word2vec_train.wv[word] for word in doc if word in word2vec_train.wv]
    if embeddings == []:
        train_sum_embeddings.append([np.nan] * 300)
    else:
        summ = np.sum(embeddings, axis = 0)
        train_sum_embeddings.append(summ)

100%|██████████| 17500/17500 [00:01<00:00, 9074.45it/s]


In [70]:
val_sum_embeddings = []

for index in tqdm(range(x_val_clean.shape[0])):
    doc = x_val_clean.iloc[index]
    embeddings = [word2vec_val.wv[word] for word in doc if word in word2vec_val.wv]
    if embeddings == []:
        val_sum_embeddings.append([np.nan] * 300)
    else:
        # otherwise, just get the mean componentwise across all the word embeddings and append to our list
        summ = np.sum(embeddings, axis = 0)
        val_sum_embeddings.append(summ)

100%|██████████| 7500/7500 [00:00<00:00, 9604.95it/s]


In [71]:
word2vec_features_train_sum = pd.DataFrame(train_sum_embeddings)
word2vec_features_val_sum = pd.DataFrame(val_sum_embeddings)

In [72]:
rforest_w2v_sum = RandomizedSearchCV(rforest, parameters, n_jobs=4, n_iter=13, scoring = "roc_auc")

In [73]:
rforest_w2v_sum.fit(word2vec_features_train_sum, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 300
building tree 2 of 300


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s remaining:    0.0s


building tree 3 of 300
building tree 4 of 300


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s remaining:    0.0s


building tree 5 of 300
building tree 6 of 300


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.1s remaining:    0.0s


building tree 7 of 300
building tree 8 of 300


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.3s remaining:    0.0s


building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300
building tree 47 of 300
building tree 48 of 300
building tree 49 of 300
building tree 50 

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   43.7s finished


RandomizedSearchCV(estimator=RandomForestClassifier(random_state=13,
                                                    verbose=10),
                   n_iter=13, n_jobs=4,
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'min_samples_leaf': [5, 10, 15, 20, 25,
                                                             30],
                                        'min_samples_split': [10, 20, 30, 40,
                                                              50],
                                        'n_estimators': [100, 200, 300, 400]},
                   scoring='roc_auc')

In [74]:
params = rforest_w2v_sum.best_params_

for x, y in params.items():
    print('The best', x, 'parameter for the model above is', y)

The best n_estimators parameter for the model above is 300
The best min_samples_split parameter for the model above is 30
The best min_samples_leaf parameter for the model above is 20
The best max_depth parameter for the model above is 8


In [75]:
#finding predicted probs for training data and using that to find AUC
y_train_prob = rforest_w2v_sum.predict_proba(word2vec_features_train_sum)
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
tw2v_sum_auc = metrics.auc(fpr, tpr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished


In [76]:
#same as above, for validation data
y_val_prob = rforest_w2v_sum.predict_proba(word2vec_features_val_sum)
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
vw2v_sum_auc = metrics.auc(fpr, tpr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished


In [77]:
print('The W2V training data set had an AUC score  of', tw2v_sum_auc, 
      'while the W2V validation data set had an AUC score of', vw2v_sum_auc, 'for a difference of:', tw2v_sum_auc - vw2v_sum_auc)

The W2V training data set had an AUC score  of 0.8784761916406341 while the W2V validation data set had an AUC score of 0.6844633072202403 for a difference of: 0.19401288442039377


In [78]:
#Top 30 features
blah = pd.DataFrame()
blah['var'] = word2vec_features_train_sum.columns
blah['importance'] = rforest_w2v_sum.best_estimator_.feature_importances_
blah = blah.sort_values("importance", ascending = False).reset_index(drop = True)
blah[:30]

Unnamed: 0,var,importance
0,222,0.04864
1,196,0.047732
2,76,0.044951
3,111,0.038716
4,231,0.032922
5,99,0.02554
6,208,0.022918
7,17,0.019517
8,144,0.018726
9,217,0.016289


## Q2. 4) Which set of features performed the best? How else do you think you could improve this result?

### From running random forests on the text mining and NLP methods above, the best performance came from using TF-IDF. Using TF-IDF, a validation AUC of .89 was acheived, which is quite higher than the other methods used. TF-IDF also had a small difference between training and validation AUC of 0.00766, showing that the model generalized quite well. 

### The performance of all models ran above could be improved by the expansion of the RandomizedSearchGrid; by allowing for increased n_estimators, min_samples_split, etc., the performance could be improved by getting closer to the optimal parameters. Additionally, I used a unigram for the words; testing out bigrams/trigrams/(skip gram for word2vec) on the models could see increased performance. For the Word2Vec methods, tuning the parameter of window_size parameter could give performance improvement. Specifically for the Word2Vec methods, I believe that a larger training set could also lead to higher performance. Lastly, for the word2vec models, the embeddings were used to find embedding averages and summations, but other features could be created by using min/max.

# Q3. Use window_size values 2-5 on W2V models; report AUC, give intuition on results.

In [79]:
word2vec_train2 = Word2Vec(x_train_clean, min_count = 5, size = 300, window = 2)
word2vec_val2 = Word2Vec(x_val_clean, min_count = 5, size = 300, window = 2)

word2vec_train3 = Word2Vec(x_train_clean, min_count = 5, size = 300, window = 3)
word2vec_val3 = Word2Vec(x_val_clean, min_count = 5, size = 300, window = 3)

word2vec_train4 = Word2Vec(x_train_clean, min_count = 5, size = 300, window = 4)
word2vec_val4 = Word2Vec(x_val_clean, min_count = 5, size = 300, window = 4)

word2vec_train5 = Word2Vec(x_train_clean, min_count = 5, size = 300, window = 5)
word2vec_val5 = Word2Vec(x_val_clean, min_count = 5, size = 300, window = 5)

In [80]:
tx2 = train_avg_embedding(word2vec_train2)
tx2 = pd.DataFrame(tx2)
vx2 = val_avg_embedding(word2vec_val2)
vx2 = pd.DataFrame(vx2)

for x in (tx2, vx2):
    x = (x - x.mean()) / x.std()

100%|██████████| 17500/17500 [00:01<00:00, 8924.34it/s]
100%|██████████| 7500/7500 [00:00<00:00, 9523.06it/s]


In [81]:
log = LGCV(random_state = 13, cv=5, penalty='l2', solver='liblinear', max_iter=1000, dual=False).fit(tx2, y_train)

y_train_predprob = log.predict_proba(tx2)
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_predprob[:,1], pos_label = 1)
print("The training dataset AUC is ", metrics.auc(fpr, tpr))

y_val_predprob = log.predict_proba(vx2)
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_predprob[:,1], pos_label = 1)
print("The validation dataset AUC is ", metrics.auc(fpr, tpr))

The training dataset AUC is  0.898111720719728
The validation dataset AUC is  0.5578340132639829


In [82]:
tx3 = train_avg_embedding(word2vec_train3)
tx3 = pd.DataFrame(tx3)
vx3 = val_avg_embedding(word2vec_val3)
vx3 = pd.DataFrame(vx3)

for x in (tx3, vx3):
    x = (x - x.mean()) / x.std()

log = log.fit(tx3, y_train)

y_train_predprob = log.predict_proba(tx3)
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_predprob[:,1], pos_label = 1)
print("The training dataset AUC is ", metrics.auc(fpr, tpr))

y_val_predprob = log.predict_proba(vx3)
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_predprob[:,1], pos_label = 1)
print("The validation dataset AUC is ", metrics.auc(fpr, tpr))

100%|██████████| 17500/17500 [00:02<00:00, 7948.21it/s]
100%|██████████| 7500/7500 [00:00<00:00, 8569.49it/s]


The training dataset AUC is  0.9026759085569309
The validation dataset AUC is  0.5304178265609167


In [83]:
tx4 = train_avg_embedding(word2vec_train4)
tx4 = pd.DataFrame(tx4)
vx4 = val_avg_embedding(word2vec_val4)
vx4 = pd.DataFrame(vx4)

for x in (tx4, vx4):
    x = (x - x.mean()) / x.std()

log = log.fit(tx4, y_train)

y_train_predprob = log.predict_proba(tx4)
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_predprob[:,1], pos_label = 1)
print("The training dataset AUC is ", metrics.auc(fpr, tpr))

y_val_predprob = log.predict_proba(vx4)
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_predprob[:,1], pos_label = 1)
print("The validation dataset AUC is ", metrics.auc(fpr, tpr))

100%|██████████| 17500/17500 [00:02<00:00, 7426.41it/s]
100%|██████████| 7500/7500 [00:01<00:00, 5357.35it/s]


The training dataset AUC is  0.9054479897049451
The validation dataset AUC is  0.6020738240500534


In [84]:
tx5 = train_avg_embedding(word2vec_train5)
tx5 = pd.DataFrame(tx5)
vx5 = val_avg_embedding(word2vec_val5)
vx5 = pd.DataFrame(vx5)

for x in (tx5, vx5):
    x = (x - x.mean()) / x.std()

log = log.fit(tx5, y_train)

y_train_predprob = log.predict_proba(tx5)
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_predprob[:,1], pos_label = 1)
print("The training dataset AUC is ", metrics.auc(fpr, tpr))

y_val_predprob = log.predict_proba(vx5)
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_predprob[:,1], pos_label = 1)
print("The validation dataset AUC is ", metrics.auc(fpr, tpr))

100%|██████████| 17500/17500 [00:02<00:00, 8608.43it/s]
100%|██████████| 7500/7500 [00:00<00:00, 8082.64it/s]


The training dataset AUC is  0.9070457369484739
The validation dataset AUC is  0.5820775459554299


### The best performing model of varying window_size is the model that uses window_size = 4. I would expect the model with the largest window_size to be the best performer, given it looks at more surrounding words to determine a word's context. Grading off of training AUC's, this logic is supported as the window_size=5 model has the highest training AUC, however, it does not have the best AUC for it's validation data. This could be because at a range of 4 words from the word predicted, those surrounding words could be more related, therefore allowing the model to generalize more accurately. At windowsize=5, a word 5 words away may be unrelated to the word we're trying to predict and lead to lower performance.