### **Import modules and read files**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\12156\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\12156\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\12156\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\12156\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# from google.colab import files 
# uploaded = files.upload()

In [3]:
# read tweet data
tweet_data = pd.read_csv('sentiment_analysis.csv')


In [4]:
tweet_data.head()

Unnamed: 0,ID,text,label
0,7.68098e+17,Josh Jenkins is looking forward to TAB Breeder...,1
1,7.68098e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1
2,7.68098e+17,"RT @PEPalerts: This September, @YESmag is taki...",1
3,7.68098e+17,"RT @david_gaibis: Newly painted walls, thanks ...",1
4,7.68098e+17,RT @CedricFeschotte: Excited to announce: as o...,1


In [5]:
tweet_data['text'].isnull().sum()

0

### **Data cleaning**

In this section, the dataset provided (*`sentiment_analysis.csv`*) will be cleaned to remove html tags, attributes, mentions, URL, stop words, etc. In addition, all words will be converted to lowercase.

**Defining functions to clean dataset**

In [6]:
def parser(text):
  """
    removes html tags and attributes using beautifulSoup html.parser,
    returns output as text
  """
  soup = BeautifulSoup(text,'html.parser')
  return soup.get_text()

In [7]:
def normalize(text):
  """
    normalizes unicode character to regular text
  """
  # read ascii characters using NFKD method, then decode back to string
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
  return text

In [8]:
def remove_URL(text):
    text = re.sub(r'http\S+', '', text) 
    text = re.sub(r'www\S+', '', text)
    return(text)

In [9]:
def remove_mentions(text):
    
    #for usernames with two "_"
    text = re.sub(r'@[A-Za-z0-9]+_[A-Za-z0-9]+_[A-Za-z0-9]+','',text) 
    
    #for usernames with one "_"
    text = re.sub(r'@[A-Za-z0-9]+_[A-Za-z0-9]+','',text)
    
    #for usernames with no "_"
    text = re.sub(r'@[A-Za-z0-9]+','',text)

    return text

In [10]:
def remove_spaces(text):
    text = re.sub(r'\s+',' ',text)
    text = re.sub(r'-','',text)
    
    return text

In [11]:
def remove_nonletter(text):
    """
    removes any item that is not in the a-z or A-Z
    """
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)
    return text

**Cleaning**

In [12]:
# filter tweet text
tweet_text = tweet_data["text"]
tweet_text.head()

0    Josh Jenkins is looking forward to TAB Breeder...
1    RT @MianUsmanJaved: Congratulations Pakistan o...
2    RT @PEPalerts: This September, @YESmag is taki...
3    RT @david_gaibis: Newly painted walls, thanks ...
4    RT @CedricFeschotte: Excited to announce: as o...
Name: text, dtype: object

In [13]:
# convert tweet text to lowercase
tweet_text_lower = tweet_text.astype(str).str.lower()
tweet_text_lower.head()

0    josh jenkins is looking forward to tab breeder...
1    rt @mianusmanjaved: congratulations pakistan o...
2    rt @pepalerts: this september, @yesmag is taki...
3    rt @david_gaibis: newly painted walls, thanks ...
4    rt @cedricfeschotte: excited to announce: as o...
Name: text, dtype: object

In [14]:
# clean tweet text using created functions
tweet_text_cleaned = tweet_text_lower.apply(parser)
tweet_text_cleaned = tweet_text_cleaned.apply(normalize)
tweet_text_cleaned = tweet_text_cleaned.apply(remove_URL)
tweet_text_cleaned = tweet_text_cleaned.apply(remove_mentions)
tweet_text_cleaned = tweet_text_cleaned.apply(remove_spaces)
tweet_text_cleaned = tweet_text_cleaned.apply(remove_nonletter)

In [15]:
tweet_text_cleaned.head()

0    josh jenkins is looking forward to tab breeder...
1    rt   congratulations pakistan on becoming  no ...
2    rt   this september  is taking you to maine me...
3    rt   newly painted walls  thanks a million to ...
4    rt   excited to announce  as of july   feschot...
Name: text, dtype: object

In [16]:
# concat regular tweet text with cleaned text
tweet_data_2 = pd.concat([tweet_data, tweet_text_cleaned], axis=1)
tweet_data_2.columns.values[-1] = 'cleaned_text'
tweet_data_2.head()

Unnamed: 0,ID,text,label,cleaned_text
0,7.68098e+17,Josh Jenkins is looking forward to TAB Breeder...,1,josh jenkins is looking forward to tab breeder...
1,7.68098e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1,rt congratulations pakistan on becoming no ...
2,7.68098e+17,"RT @PEPalerts: This September, @YESmag is taki...",1,rt this september is taking you to maine me...
3,7.68098e+17,"RT @david_gaibis: Newly painted walls, thanks ...",1,rt newly painted walls thanks a million to ...
4,7.68098e+17,RT @CedricFeschotte: Excited to announce: as o...,1,rt excited to announce as of july feschot...


**Remove stop words**

In this subsection, I will be removing stop words in the tweet text, they are words that do not add any meaning to the sentence such as a, an, the, this, of, etc. I will be doing this using the list of stop words from **NLTK** library

In [17]:
# define set of nltk stopwords
stop_words = set(stopwords.words('english'))

#adding new stopwords
stop_words.add("i'm")
stop_words.add("they're")
stop_words.add("thats")
stop_words.add("tho")
stop_words.add("also")
stop_words.add("rt")

# create new column for tweet without stop words
tweet_data_2['tweets_without_stopwords'] = tweet_data_2['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
tweet_data_2.head()

Unnamed: 0,ID,text,label,cleaned_text,tweets_without_stopwords
0,7.68098e+17,Josh Jenkins is looking forward to TAB Breeder...,1,josh jenkins is looking forward to tab breeder...,josh jenkins looking forward tab breeders crow...
1,7.68098e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1,rt congratulations pakistan on becoming no ...,congratulations pakistan becoming testteam wor...
2,7.68098e+17,"RT @PEPalerts: This September, @YESmag is taki...",1,rt this september is taking you to maine me...,september taking maine mendozas surprise thank...
3,7.68098e+17,"RT @david_gaibis: Newly painted walls, thanks ...",1,rt newly painted walls thanks a million to ...,newly painted walls thanks million custodial p...
4,7.68098e+17,RT @CedricFeschotte: Excited to announce: as o...,1,rt excited to announce as of july feschot...,excited announce july feschotte lab relocating...


**Perform lemmatization**

Lemmatization is the process of grouping together words with the same root meaning into its base word for example leafs, leaves are grouped into leaf. I will perform this with the **WordNetLemmatizer()** class in NLTK lib.

In [18]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatizing(text):
    words_in_each_tweet = word_tokenize(text)
    final = [lemmatizer.lemmatize(word) for word in words_in_each_tweet]
    lemmatized_tweets = ' '.join(final)
    return lemmatized_tweets

In [19]:
tweet_data_2['tweets_with_lemmer'] = tweet_data_2['tweets_without_stopwords'].apply(lemmatizing)
tweet_data_2.head()

Unnamed: 0,ID,text,label,cleaned_text,tweets_without_stopwords,tweets_with_lemmer
0,7.68098e+17,Josh Jenkins is looking forward to TAB Breeder...,1,josh jenkins is looking forward to tab breeder...,josh jenkins looking forward tab breeders crow...,josh jenkins looking forward tab breeder crown...
1,7.68098e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1,rt congratulations pakistan on becoming no ...,congratulations pakistan becoming testteam wor...,congratulation pakistan becoming testteam worl...
2,7.68098e+17,"RT @PEPalerts: This September, @YESmag is taki...",1,rt this september is taking you to maine me...,september taking maine mendozas surprise thank...,september taking maine mendozas surprise thank...
3,7.68098e+17,"RT @david_gaibis: Newly painted walls, thanks ...",1,rt newly painted walls thanks a million to ...,newly painted walls thanks million custodial p...,newly painted wall thanks million custodial pa...
4,7.68098e+17,RT @CedricFeschotte: Excited to announce: as o...,1,rt excited to announce as of july feschot...,excited announce july feschotte lab relocating...,excited announce july feschotte lab relocating...


In [20]:
tweet_data_2.tweets_with_lemmer

0         josh jenkins looking forward tab breeder crown...
1         congratulation pakistan becoming testteam worl...
2         september taking maine mendozas surprise thank...
3         newly painted wall thanks million custodial pa...
4         excited announce july feschotte lab relocating...
                                ...                        
550386                                     stop watching mm
550387    poor old tom odell look like would know wrong ...
550388           antsmasher smashed ant awesome game hjfjfi
550389                        morning girl wonderful friday
550390      bixbeat mixtape vol great artiste join movement
Name: tweets_with_lemmer, Length: 550391, dtype: object

### **Feature Engineering**

In this section numerical features will be engineered from the lemmatized tweet text using **TF-IDF** (Term Frequency-Inverse Document Frequency) tecnique. 

TF-IDF is used to assign importance weight to each word in a document based on how frequent it appears in the document, words with high frequency have higher weigths (**TF**). On the otherhand, words are later down weighted based on how frequent they appear in a corpus (**IDF**). In our case each tweet text is considered as a **document** and the entire dataset is the **corpus**.

I will perform this using the **TfidfVectorizer()** class in **sklearn**

In [21]:
##COUNT VECTORIZER 

In [22]:
##TERM FREQUENCY VECTORIZER
# TF-IDF vectorizer to create a 1D vector for each tweet
tweet_data_cleaned = tweet_data_2.tweets_with_lemmer



tfidf_vectorizer = TfidfVectorizer(max_features = 600,
                                      norm = 'l2',
                                      use_idf = True,
                                      smooth_idf = True)


tfidf_vec = tfidf_vectorizer.fit_transform(tweet_data_cleaned.values.astype('U'))
tfidf_array = tfidf_vec.toarray()

# encodes tweet data
TFIDF_features = pd.DataFrame(data=tfidf_array, columns = tfidf_vectorizer.get_feature_names_out())
TFIDF_features.head()

Unnamed: 0,absolutely,account,actually,ad,adorable,ago,album,almost,already,always,...,wow,wrong,wtf,ya,yeah,year,yes,yesterday,yet,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
#!pip install xgboost

In [24]:
#MODEL IMPLEMENTATION https://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier #Classification 

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
import time


In [25]:
#Dataset and Target 
X = TFIDF_features
y = tweet_data.label

#Data Spltting
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [26]:
#MODEL IMPLEMENTATION
t_ = time.time()

Logistic_model = LogisticRegression()
Logistic_model.fit(x_train, y_train)
score_logistic = Logistic_model.score(x_test, y_test)

t1 = time.time()
log_time = t1-t_

In [27]:
t_ = time.time()

NB_model = GaussianNB()
NB_model.fit(x_train, y_train)
score_nb = NB_model.score(x_test, y_test)

t2 = time.time()
nb_time = t2-t_

In [28]:
t_ = time.time()

DT_model = DecisionTreeClassifier(random_state=0)
DT_model.fit(x_train, y_train)
score_dt = DT_model.score(x_test, y_test)

t3 = time.time()
dt_time = t3-t_

In [29]:
t_ = time.time()

RF_model = RandomForestClassifier()
RF_model.fit(x_train, y_train)
score_rf = RF_model.score(x_test, y_test)

t4 = time.time()
rf_time = t4-t_

In [30]:
t_ = time.time()

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(x_train, y_train)
score_xgb = xgb_model.score(x_test, y_test)

t5 = time.time()
xgb_time = t5-t_

In [31]:
t_ = time.time()

NN_model = MLPClassifier()
NN_model.fit(x_train, y_train)
score_NN = NN_model.score(x_test,y_test)

t6 = time.time()
nn_time = t6-t_

In [32]:
# SVM_model = SVC()
# SVM_model.fit(x_train, y_train)
# score_svm = SVM_model.score(x_test, y_test)

In [33]:
# KNN_model = KNeighborsClassifier()
# KNN_model.fit(x_train, y_train)
# score_knn = KNN_model.score(x_test, y_test)


In [34]:
#Display Results 
model_list = ['Logistic Regression', 'Naive Bayes', 'Decision Trees', 'Random Forest', 'XG Boost' ,'Neural Network']    # ,'SVM', KNN']
accuracy_list = [score_logistic,score_nb,score_dt,score_rf, score_xgb,score_NN]                 # ,score_svm ,score_knn]
time_list = [log_time, nb_time, dt_time, rf_time, xgb_time, nn_time]

table = {'Classification Model': model_list,
          'Accuracy': accuracy_list,
        'Implementation time': time_list}
table = pd.DataFrame(table)

#Best model is xx 

In [35]:
table

Unnamed: 0,Classification Model,Accuracy,Implementation time
0,Logistic Regression,0.925211,30.731428
1,Naive Bayes,0.86251,7.792603
2,Decision Trees,0.917871,746.291499
3,Random Forest,0.931322,1278.720651
4,XG Boost,0.916417,150.459553
5,Neural Network,0.922722,1148.709273


In [36]:
# Hyper parameter tuning best model: grid search CV -- accuracy
grid_values = {'criterion':['gini','entropy','log'],'bootstrap':[True,False]}
               #,'n_estimators':[80,100],'max_features':['sqrt','log2','None']}
    
grid = GridSearchCV(RF_model, param_grid = grid_values,cv=5,scoring = 'accuracy', refit = True, verbose = 3)
best_result = grid.fit(x_train, y_train) #cross-validation dataset

#Best hyperparameters are xx

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ....bootstrap=True, criterion=gini;, score=0.929 total time=15.8min
[CV 2/5] END ....bootstrap=True, criterion=gini;, score=0.932 total time=15.4min
[CV 3/5] END ....bootstrap=True, criterion=gini;, score=0.930 total time=15.3min
[CV 4/5] END ....bootstrap=True, criterion=gini;, score=0.929 total time=15.3min
[CV 5/5] END ....bootstrap=True, criterion=gini;, score=0.931 total time=15.4min
[CV 1/5] END .bootstrap=True, criterion=entropy;, score=0.930 total time=14.7min
[CV 2/5] END .bootstrap=True, criterion=entropy;, score=0.933 total time=14.6min
[CV 3/5] END .bootstrap=True, criterion=entropy;, score=0.931 total time=14.6min
[CV 4/5] END .bootstrap=True, criterion=entropy;, score=0.930 total time=14.7min
[CV 5/5] END .bootstrap=True, criterion=entropy;, score=0.931 total time=14.7min
[CV 1/5] END .......bootstrap=True, criterion=log;, score=nan total time=   0.7s
[CV 2/5] END .......bootstrap=True, criterion=log

In [37]:
# Hyper parameter tuning best model: grid search CV -- f1
grid_values = {'criterion':['gini','entropy','log'],'bootstrap':[True,False]}
               #,'n_estimators':[80,100],'max_features':['sqrt','log2','None']}
    
grid = GridSearchCV(RF_model, param_grid = grid_values,cv=5,scoring = 'f1', refit = True, verbose = 3)
best_result = grid.fit(x_train, y_train) #cross-validation dataset



Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ....bootstrap=True, criterion=gini;, score=0.947 total time=19.4min
[CV 2/5] END ....bootstrap=True, criterion=gini;, score=0.950 total time=15.4min
[CV 3/5] END ....bootstrap=True, criterion=gini;, score=0.948 total time=15.3min
[CV 4/5] END ....bootstrap=True, criterion=gini;, score=0.947 total time=15.4min
[CV 5/5] END ....bootstrap=True, criterion=gini;, score=0.948 total time=15.5min
[CV 1/5] END .bootstrap=True, criterion=entropy;, score=0.947 total time=15.6min
[CV 2/5] END .bootstrap=True, criterion=entropy;, score=0.950 total time=16.9min
[CV 3/5] END .bootstrap=True, criterion=entropy;, score=0.948 total time=17.2min
[CV 4/5] END .bootstrap=True, criterion=entropy;, score=0.948 total time=17.0min
[CV 5/5] END .bootstrap=True, criterion=entropy;, score=0.948 total time=17.1min
[CV 1/5] END .......bootstrap=True, criterion=log;, score=nan total time=   0.7s
[CV 2/5] END .......bootstrap=True, criterion=log

In [40]:
criterion = ['gini','entropy','log']
bootstrap = [True, False, None] 


data = [['gini',True, np.mean([0.929,0.932,0.930,0.929,0.931]),np.mean([0.947,0.950,0.948,0.947,0.948]),np.mean([15.8,15.4,15.3,15.3,15.4])],
        ['entropy',True, np.mean([0.930,0.933,0.931,0.930,0.931]),np.mean([0.947,0.950,0.948,0.948,0.948]),np.mean([14.7,14.6,14.6,14.7,14.7])],
        ['log',True,'Nan','Nan', np.mean([0.7,0.7,0.7,0.7,0.7])],
        
        ['gini',False, np.mean([0.928,0.931,0.928,0.928,0.929]),np.mean([0.946,0.948,0.946,0.946,0.946]),np.mean([25.2,25.2,25.2,25.3,25.4])],
        ['entropy',False, np.mean([0.928,0.932,0.928,0.928,0.929]),np.mean([0.946,0.948,0.947,0.946,0.947]),np.mean([23.7,23.7,23.6,23.8,24])],
        ['log',False,'Nan','Nan',np.mean([0.7,0.7,0.7,0.7,0.7])],
       
       ]
df = pd.DataFrame(data)
df = pd.DataFrame(data, columns=['Criterion', 'Bootstrap', 'Mean Accuracy', 'Mean F1', 'Mean Runtime'])
df

#Best model  is Random Forest - Critertion = Entropy, Bootstrap = True

Unnamed: 0,Criterion,Bootstrap,Mean Accuracy,Mean F1,Mean Runtime
0,gini,True,0.9302,0.948,15.44
1,entropy,True,0.931,0.9482,14.66
2,log,True,Nan,Nan,0.7
3,gini,False,0.9288,0.9464,25.26
4,entropy,False,0.929,0.9468,23.76
5,log,False,Nan,Nan,0.7


In [45]:
#!pip install mlxtend
# import joblib
# import sys
# sys.modules['sklearn.externals.joblib'] = joblib
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# from sklearn.externals import joblib

# loading dependecy
import pickle
#from sklearn.externals import joblib

# saving our model
joblib.dump(RF_model , 'RF_model_jlib')

['RF_model_jlib']