## NLP Classification

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist # looks at how frequent words are used
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
# from matplotlib import cm
from sklearn.ensemble import RandomForestClassifier 
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import string, re

### Import Dataset

In [2]:
df = pd.read_csv('../data/dataframe.csv', index_col=0)
df.head(2)

Unnamed: 0,tweet,product_,emotion,lemmatizer_tweets,textblob_polarity,textblob_subjectivity,vs_neg,vs_neu,vs_pos,vs_compound,nrc_sentiment,gi_sentiment,henry_sentiment,huliu_sentiment,jockers_sentiment,lm_sentiment,senticnet_sentiment,sentiword_sentiment,socal_sentiment
0,"['wesley83', 'have', '3G', 'iPhone', '3', 'hrs...",iPhone,0,wesley83 have 3G iPhone 3 hr tweeting RISE Aus...,-0.2,0.4,0.223,0.777,0.0,-0.6486,0.0,-0.333333,0.0,-1.0,-1.0,0.0,-0.0952,-0.221875,-1.192154
1,"['jessedee', 'Know', 'fludapp', 'Awesome', 'iP...",iPad or iPhone App,1,jessedee Know fludapp Awesome iPad iPhone app ...,0.466667,0.933333,0.0,0.528,0.472,0.91,1.0,1.0,0.0,1.0,0.416667,0.0,0.475,0.175,2.17719


### Modeling

In [3]:
## setting stopwords and punctuations
stop_words=stopwords.words("english")
stop_words += list(string.punctuation)
stop_words += ['...','u','w','2',"i'm",'via',"we're",'6','3','hey']
sw_set = set(stop_words)

In [4]:
neg = df[df.emotion==0]
pos = df[df.emotion==1]
idk = df[df.emotion==4]
neu = df[df.emotion==3]

In [5]:
# upsample negative
neg_upsampled = resample(neg,
                          replace=True, # sample with replacement
                          n_samples=len(neu), # match number in majority class
                          random_state=23) # reproducible results

# upsample positive
pos_upsampled = resample(pos,
                          replace=True, # sample with replacement
                          n_samples=len(neu), # match number in majority class
                          random_state=23) # reproducible results

# upsample unclear
idk_upsampled = resample(idk,
                          replace=True, # sample with replacement
                          n_samples=len(neu), # match number in majority class
                          random_state=23) # reproducible results

upsampled = pd.concat([neu, neg_upsampled, pos_upsampled, idk_upsampled])

In [6]:
# creating a list with all lemmatized outputs
data = upsampled['lemmatizer_tweets']
lemmatized_output = []

for listy in data:
    lemmed = ''.join([w for w in listy])
    lemmatized_output.append(lemmed)

In [7]:
# define x and y
X_lem = lemmatized_output
y_lem = upsampled['emotion']

In [8]:
# train test split the lemmatized words
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X_lem, y_lem, test_size=0.20, random_state=1)
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words)

# fit and transform
tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

In [9]:
# Average number of non-zero elements in vectorized tweets
non_zero_cols = tfidf_data_train_lem.nnz / float(tfidf_data_train_lem.shape[0])
print(non_zero_cols)

# Percentage of columns containing zero
percent_sparse = 1 - (non_zero_cols / float(tfidf_data_train_lem.shape[1]))
print(percent_sparse)

22.46952032944725
0.9994795107637376


In [10]:
rf_lem = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)


In [11]:
rf_lem.fit(tfidf_data_train_lem, y_train_lem)

RandomForestClassifier(n_jobs=-1, random_state=0)

In [12]:
rf_test_preds_lem = rf_lem.predict(tfidf_data_test_lem)

In [13]:
rf_acc_score_lem = metrics.accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = metrics.f1_score(y_test_lem, rf_test_preds_lem, average='weighted')
rf_precision_score_lem = metrics.precision_score(y_test_lem, rf_test_preds_lem, average='weighted')
rf_recall_score_lem = metrics.recall_score(y_test_lem, rf_test_preds_lem, average='weighted')
print('Accuracy:', rf_acc_score_lem)
print('Precision:',rf_precision_score_lem)
print('Recall:',rf_recall_score_lem)
print('F1:',rf_f1_score_lem)

Accuracy: 0.9220598469032707
Precision: 0.9210386351303297
Recall: 0.9220598469032707
F1: 0.9212052860626472


### Random Forest using Grid Search

#### Iteration 1

In [14]:
rfc=RandomForestClassifier(random_state=42)

In [15]:
param_grid = { 
    'n_estimators': [150, 200],
    'max_features': ['auto'],
    'max_depth' : [8],
    'criterion' :['gini']
}

In [16]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [8],
                         'max_features': ['auto'], 'n_estimators': [150, 200]})

In [17]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6670137414994359
{'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 200}
RandomForestClassifier(max_depth=8, n_estimators=200, random_state=42)


#### Iteration 2

In [18]:
param_grid = { 
    'n_estimators': [100, 150],
    'max_features': ['auto'],
    'max_depth' : [8,12],
    'criterion' :['gini']
}

In [19]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [8, 12],
                         'max_features': ['auto'], 'n_estimators': [100, 150]})

In [20]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.7007704576934436
{'criterion': 'gini', 'max_depth': 12, 'max_features': 'auto', 'n_estimators': 150}
RandomForestClassifier(max_depth=12, n_estimators=150, random_state=42)


#### Iteration 3

In [21]:
param_grid = { 
    'n_estimators': [140, 150, 160],
    'max_features': ['auto'],
    'max_depth' : [12],
    'criterion' :['gini']
}

In [22]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [12],
                         'max_features': ['auto'],
                         'n_estimators': [140, 150, 160]})

In [23]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.7007704576934436
{'criterion': 'gini', 'max_depth': 12, 'max_features': 'auto', 'n_estimators': 150}
RandomForestClassifier(max_depth=12, n_estimators=150, random_state=42)


#### Iteration 4

In [24]:
param_grid = { 
    'n_estimators': [138, 140, 142],
    'max_features': ['auto'],
    'max_depth' : [12],
    'criterion' :['gini']
}

In [25]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [12],
                         'max_features': ['auto'],
                         'n_estimators': [138, 140, 142]})

In [26]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6987404298229623
{'criterion': 'gini', 'max_depth': 12, 'max_features': 'auto', 'n_estimators': 142}
RandomForestClassifier(max_depth=12, n_estimators=142, random_state=42)


#### Iteration 5

In [27]:
param_grid = { 
    'n_estimators': list(range(135,140)),
    'max_features': ['auto'],
    'max_depth' : list(range(10,20)),
    'criterion' :['gini']
}

In [28]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'],
                         'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                         'max_features': ['auto'],
                         'n_estimators': [135, 136, 137, 138, 139]})

In [29]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.7514052628994315
{'criterion': 'gini', 'max_depth': 19, 'max_features': 'auto', 'n_estimators': 139}
RandomForestClassifier(max_depth=19, n_estimators=139, random_state=42)


#### Iteration 6

In [30]:
param_grid = { 
    'n_estimators': list(range(137,139)),
    'max_features': ['auto'],
    'max_depth' : list(range(17,25)),
    'criterion' :['gini', 'entropy']
}

In [31]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [17, 18, 19, 20, 21, 22, 23, 24],
                         'max_features': ['auto'], 'n_estimators': [137, 138]})

In [32]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.7950220616083616
{'criterion': 'entropy', 'max_depth': 24, 'max_features': 'auto', 'n_estimators': 137}
RandomForestClassifier(criterion='entropy', max_depth=24, n_estimators=137,
                       random_state=42)
