## NLP Classification

In [3]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist # looks at how frequent words are used
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
# from matplotlib import cm
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import string, re

### Import Dataset

In [27]:
df = pd.read_csv('../data/dataframe.csv', index_col=0)
df.head(2)

Unnamed: 0,tweet,product_,emotion,lemmatizer_tweets,textblob_polarity,textblob_subjectivity,vs_neg,vs_neu,vs_pos,vs_compound,nrc_sentiment,gi_sentiment,henry_sentiment,huliu_sentiment,jockers_sentiment,lm_sentiment,senticnet_sentiment,sentiword_sentiment,socal_sentiment
0,"['wesley83', 'have', '3G', 'iPhone', '3', 'hrs...",iPhone,0,wesley83 have 3G iPhone 3 hr tweeting RISE Aus...,-0.2,0.4,0.223,0.777,0.0,-0.6486,0.0,-0.333333,0.0,-1.0,-1.0,0.0,-0.0952,-0.221875,-1.192154
1,"['jessedee', 'Know', 'fludapp', 'Awesome', 'iP...",iPad or iPhone App,1,jessedee Know fludapp Awesome iPad iPhone app ...,0.466667,0.933333,0.0,0.528,0.472,0.91,1.0,1.0,0.0,1.0,0.416667,0.0,0.475,0.175,2.17719


In [26]:
## setting stopwords and punctuations
stop_words=stopwords.words("english")
stop_words += list(string.punctuation)
stop_words += ['...','u','w','2',"i'm",'via',"we're",'6','3','hey']
sw_set = set(stop_words)

### Modeling

In [12]:
data = df['lemmatizer_tweets']
target = df['emotion']

In [13]:
# creating a list with all lemmatized outputs
lemmatized_output = []

for listy in data:
    lemmed = ''.join([w for w in listy])
    lemmatized_output.append(lemmed)

In [14]:
X_lem = lemmatized_output
y_lem = target

In [15]:
# train test split the lemmatized words
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X_lem, y_lem, test_size=0.20, random_state=1)
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words)

tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

In [16]:
# Average Number of Non-Zero Elements in Vectorized Tweets
non_zero_cols = tfidf_data_train_lem.nnz / float(tfidf_data_train_lem.shape[0])
print(non_zero_cols)

# Percentage of columns containing ZERO
percent_sparse = 1 - (non_zero_cols / float(tfidf_data_train_lem.shape[1]))
print(percent_sparse)

22.60415234428709
0.9994728017458652


In [17]:
rf_lem = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)


In [18]:
rf_lem.fit(tfidf_data_train_lem, y_train_lem)

RandomForestClassifier(n_jobs=-1, random_state=0)

In [19]:
rf_test_preds_lem = rf_lem.predict(tfidf_data_test_lem)

In [20]:
rf_acc_score_lem = metrics.accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = metrics.f1_score(y_test_lem, rf_test_preds_lem, average='weighted')
rf_precision_score_lem = metrics.precision_score(y_test_lem, rf_test_preds_lem, average='weighted')
rf_recall_score_lem = metrics.recall_score(y_test_lem, rf_test_preds_lem, average='weighted')
print('Accuracy:', rf_acc_score_lem)
print('Precision:',rf_precision_score_lem)
print('Recall:',rf_recall_score_lem)
print('F1:',rf_f1_score_lem)

Accuracy: 0.6547553600879604
Precision: 0.6493556607054446
Recall: 0.6547553600879604
F1: 0.6133479327245666


  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest using Grid Search

#### Iteration 1

In [21]:
rfc=RandomForestClassifier(random_state=42)

In [22]:
param_grid = { 
    'n_estimators': [150, 200],
    'max_features': ['auto'],
    'max_depth' : [8],
    'criterion' :['gini']
}

In [23]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [8],
                         'max_features': ['auto'], 'n_estimators': [150, 200]})

In [24]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6016770893896208
{'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 150}
RandomForestClassifier(max_depth=8, n_estimators=150, random_state=42)


#### Iteration 2

In [25]:
param_grid = { 
    'n_estimators': [100, 150],
    'max_features': ['auto'],
    'max_depth' : [8,12],
    'criterion' :['gini']
}

In [73]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [8, 12],
                         'max_features': ['auto'], 'n_estimators': [100, 150]})

In [74]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6075896330539761
{'criterion': 'gini', 'max_depth': 12, 'max_features': 'auto', 'n_estimators': 150}
RandomForestClassifier(max_depth=12, n_estimators=150, random_state=42)


#### Iteration 3

In [79]:
param_grid = { 
    'n_estimators': [140, 150, 160],
    'max_features': ['auto'],
    'max_depth' : [12],
    'criterion' :['gini']
}

In [80]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [12],
                         'max_features': ['auto'],
                         'n_estimators': [140, 150, 160]})

In [81]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6075896330539761
{'criterion': 'gini', 'max_depth': 12, 'max_features': 'auto', 'n_estimators': 140}
RandomForestClassifier(max_depth=12, n_estimators=140, random_state=42)


#### Iteration 4

In [91]:
param_grid = { 
    'n_estimators': [138, 140, 142],
    'max_features': ['auto'],
    'max_depth' : [12],
    'criterion' :['gini']
}

In [92]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'], 'max_depth': [12],
                         'max_features': ['auto'],
                         'n_estimators': [138, 140, 142]})

In [93]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6077270900986496
{'criterion': 'gini', 'max_depth': 12, 'max_features': 'auto', 'n_estimators': 138}
RandomForestClassifier(max_depth=12, n_estimators=138, random_state=42)


#### Iteration 5

In [95]:
param_grid = { 
    'n_estimators': list(range(135,140)),
    'max_features': ['auto'],
    'max_depth' : list(range(10,20)),
    'criterion' :['gini']
}

In [96]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'],
                         'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                         'max_features': ['auto'],
                         'n_estimators': [135, 136, 137, 138, 139]})

In [97]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6107522795275033
{'criterion': 'gini', 'max_depth': 19, 'max_features': 'auto', 'n_estimators': 136}
RandomForestClassifier(max_depth=19, n_estimators=136, random_state=42)


#### Iteration 6

In [98]:
param_grid = { 
    'n_estimators': list(range(137,139)),
    'max_features': ['auto'],
    'max_depth' : list(range(17,25)),
    'criterion' :['gini', 'entropy']
}

In [99]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [17, 18, 19, 20, 21, 22, 23, 24],
                         'max_features': ['auto'], 'n_estimators': [137, 138]})

In [100]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6132272626289842
{'criterion': 'gini', 'max_depth': 24, 'max_features': 'auto', 'n_estimators': 137}
RandomForestClassifier(max_depth=24, n_estimators=137, random_state=42)


#### Iteration 7

In [103]:
param_grid = { 
    'n_estimators': list(range(130,140)),
    'max_features': ['auto'],
    'max_depth' : list(range(18,40)),
    'criterion' : ['gini']}


In [104]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_data_train_lem, y_train_lem)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini'],
                         'max_depth': [18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                                       28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                       38, 39],
                         'max_features': ['auto'],
                         'n_estimators': [130, 131, 132, 133, 134, 135, 136,
                                          137, 138, 139]})

In [105]:
print(CV_rfc.best_score_)
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)

0.6247765850338207
{'criterion': 'gini', 'max_depth': 39, 'max_features': 'auto', 'n_estimators': 131}
RandomForestClassifier(max_depth=39, n_estimators=131, random_state=42)


In [113]:
rfc=RandomForestClassifier(random_state=42, criterion='gini', max_depth=39, max_features='auto', n_estimators=131)

In [114]:
rfc.fit(tfidf_data_train_lem, y_train_lem)

RandomForestClassifier(max_depth=39, n_estimators=131, random_state=42)

In [115]:
rf_test_preds_lem = rfc.predict(tfidf_data_test_lem)

In [116]:
rf_acc_score_lem = metrics.accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = metrics.f1_score(y_test_lem, rf_test_preds_lem, average='weighted')
rf_precision_score_lem = metrics.precision_score(y_test_lem, rf_test_preds_lem, average='weighted')
rf_recall_score_lem = metrics.recall_score(y_test_lem, rf_test_preds_lem, average='weighted')
print('Accuracy:', rf_acc_score_lem)
print('Precision:',rf_precision_score_lem)
print('Recall:',rf_recall_score_lem)
print('F1:',rf_f1_score_lem)

Accuracy: 0.5920835623969214
Precision: 0.6625996562416478
Recall: 0.5920835623969214
F1: 0.46375544125602014


  _warn_prf(average, modifier, msg_start, len(result))
