In [1]:
# Import libraries that we are going to use in the project

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import csv
from sklearn.utils import shuffle

# We will be using two datasets, the first one is used on all classifiers, while the second one will be used for Naive Bayes 
# Only, because its extremely large

# Our first dataset is a sample taken from Kaggle and Sentiment140. It is used for the highly complex classification
# algorithms that take forever to execute on large datasets. This sample was taken with an equal distribution of the positive
# and negative sentiments and will be used in all classifiers.

data = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\data1.csv', skiprows=0, low_memory=False)
# Shuffle the rows of the dataset. A fixed seed keeps the row order (and therefore every
# positional slice displayed later in the notebook) identical after Restart Kernel -> Run All.
data = shuffle(data, random_state=42)


# The dataset contains 1578612 tweets coming from two sources: Kaggle and Sentiment140. The Sentiment column corresponds
# to our label class taking a binary value, 0 if the tweet is negative, 1 if the tweet is positive.
data2 = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\data2.csv', skiprows=0, low_memory=False)

# Renumber the rows of both data frames from 0; drop=True discards the old index instead of
# adding it as a new column. Plain assignment (no inplace=True) keeps the cell re-runnable.
data = data.reset_index(drop=True)
data2 = data2.reset_index(drop=True)


# Show up to 140 characters of each text column in order to fully see the tweet
pd.set_option('display.max_colwidth', 140)


In [None]:
# Display the last 10 rows of the first dataset
data.tail(10)

In [None]:
# Drop unneeded columns in the first dataset.
# errors='ignore' makes the cell idempotent: re-running it after 'Index' is already gone
# no longer raises a KeyError.
data = data.drop(['Index'], axis=1, errors='ignore')
# Preview a few rows instead of dumping the whole frame
data.head(10)

In [None]:
#Display the first 10 rows of the second dataset 
data2.head(10)

In [None]:
# Drop unneeded columns in the second dataset.
# errors='ignore' makes the cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
data2 = data2.drop(['ItemID', 'SentimentSource'], axis=1, errors='ignore')
# Preview a few rows instead of dumping the whole frame
data2.head(10)

In [None]:
# You can notice some important points inside the datasets.
# 1) Acronyms for example: "bf" or more complicated "APL". Does it means apple ? Apple (the company) ? In this context we have "friend" after so we could think that he refers to his smartphone and so Apple, but what about if the word "friend" was not here ?
# 2) The presence of sequences of repeated characters such as "Juuuuuuuuuuuuuuuuussssst"
# 3) The presence of emoticons, ":O", "T_T", ":-|" and much more, give insights about user's moods.
# 4) Spelling mistakes like "im gunna" or "mi".
# 5) The presence of nouns such as "TV", "New Moon".
# 6) People also indicate moods, emotions and states between two asterisks, such as *cries*, *hummin*, *sigh*.
# 7) The negation, can't, cannot, don't, haven't that we need to handle.

# And so on. As you can see, it is extremely complex to deal with language and that's why Natural Language Processing where Sentiment Analysis is one of its subtopic is a hot topic and lot of problems are still not solved.

In [None]:
plt.close()
fig, ax = plt.subplots()
# Series.as_matrix() was removed in pandas 1.0; .values gives the same NumPy array
# and is what the rest of this notebook already uses.
counts, bins, patches = ax.hist(data.Sentiment.values, edgecolor='gray')

# Set plot title
ax.set_title("Histogram of Sentiments of dataset1")

# Set x-axis name
ax.set_xlabel("Sentiment")

# Set y-axis name
ax.set_ylabel("Frequency")

# Select the first patch (a rectangle, object of class matplotlib.patches.Patch)
# corresponding to negative sentiment and color it
patches[0].set_facecolor("#5d4037")
patches[0].set_label("negative")

# Same for the positive sentiment (last patch) but in another color.
patches[-1].set_facecolor("#ff9100")
patches[-1].set_label("positive")

# Add the legend to this Axes explicitly rather than through pyplot's "current axes" state
ax.legend()

In [None]:
# This was our first balanced dataset that was edited and used as a sample from the Sentiment140 dataset


In [None]:
plt.close()
fig, ax = plt.subplots()
# Series.as_matrix() was removed in pandas 1.0; .values gives the same NumPy array
# and is what the rest of this notebook already uses.
counts, bins, patches = ax.hist(data2.Sentiment.values, edgecolor='gray')

# Set plot title
ax.set_title("Histogram of Sentiments of dataset2")

# Set x-axis name
ax.set_xlabel("Sentiment")

# Set y-axis name
ax.set_ylabel("Frequency")

# Select the first patch (a rectangle, object of class matplotlib.patches.Patch)
# corresponding to negative sentiment and color it
patches[0].set_facecolor("#5d4037")
patches[0].set_label("negative")

# Same for the positive sentiment (last patch) but in another color.
patches[-1].set_facecolor("#ff9100")
patches[-1].set_label("positive")

# Add the legend to this Axes explicitly rather than through pyplot's "current axes" state
ax.legend()

In [None]:
# The second dataset also seems to be really well-balanced between negative and positive sentiment. Let's confirm that by displaying the numeric values.

In [None]:
# Dataset 1
data.Sentiment.value_counts()
# Count of tweets corresponding to the positive and negative sentiments

In [None]:
# Dataset 2
data2.Sentiment.value_counts()
# Count of tweets corresponding to the positive and negative sentiments

In [None]:
#It is important to check if we have duplicates in tweets which is something that arise very often because of the RT (Retweet),
# Show duplicated tweets if exist in Dataset1
len(data[data.duplicated('SentimentText')])

In [None]:
# Show duplicated tweets if exist in Dataset 2
len(data2[data2.duplicated('SentimentText')])

In [None]:
# Display the number of RT in the first dataset.
# \bRT\b only matches "RT" as a standalone token; a plain substring search would also
# count words that merely contain the letters, such as "START" or "ARTIST".
# na=False counts a missing tweet as "no RT" instead of propagating NaN.
CountofRT = data['SentimentText'].str.contains(r'\bRT\b', regex=True, na=False).value_counts()
CountofRT

In [None]:
# Display the number of RT in the second dataset.
# \bRT\b only matches "RT" as a standalone token; a plain substring search would also
# count words that merely contain the letters, such as "START" or "ARTIST".
# na=False counts a missing tweet as "no RT" instead of propagating NaN.
CountofRT2 = data2['SentimentText'].str.contains(r'\bRT\b', regex=True, na=False).value_counts()
CountofRT2

In [None]:
# That's very good news: our datasets contain some retweets but zero duplicated tweets, which is a
# very good thing when it comes to training our classifier.

In [None]:
# Resources

# To have a good preprocessing to our data, we will be using some beneficial resources

# emoticon dictionary regrouping 132 of the most used emoticons in western with their sentiment, negative or positive.
# An acronym dictionary of 5465 acronyms with their translation
# A stop word dictionary corresponding to words which are filtered out before or after processing of natural language data because they are not useful in our case.
# A positive and negative word dictionaries.
# A negative contractions and auxiliaries dictionary which will be used to detect negation in a given tweet

In [None]:
# Load Smileys Dataset
emoticons = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Smileys.csv', skiprows=0, low_memory=False)
positive_emoticons = emoticons[emoticons.Sentiment == 1]
negative_emoticons = emoticons[emoticons.Sentiment == 0]
emoticons.head(5)

In [None]:
# Load Acronyms Dataset
acronyms = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Acronyms.csv', skiprows=0, low_memory=False)
acronyms.tail(5)

In [None]:
# Load Stopwords Dataset
stops = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Stopwords.csv', skiprows=0, low_memory=False)
stops.columns = ['Word']
stops.head(5)

In [None]:
# The resources showed above are mainly used only for the preprocessing part. 


# Another resource that we are going to use is a lexicon which corresponds to a list of words where each word is associated with its polarity, positive or negative.
# The lexicon is divided into two distinct files, one for positive words, containing 2005 entries and the other for negative words containing 4782 entries.

In [None]:
# Load Positive Words Dataset
positive_words = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Positive-words.csv', skiprows=0, low_memory=False ,sep='\t')
positive_words.columns = ['Word', 'Sentiment']
positive_words.tail(5)

In [None]:
# Load Negative Words Dataset
negative_words = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Negative-words.csv', skiprows=0, low_memory=False ,sep='\t',encoding= 'unicode_escape')
negative_words.columns = ['Word', 'Sentiment']
negative_words.head(5)

In [None]:
# Load Negations Dataset
negation_words = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Negation.csv', skiprows=0, low_memory=False)
negation_words.head(5)

In [None]:
# Preprocessing


# One of the most important parts that is going to be crucial for the learning part is the preprocessing of the data. Indeed as they are, we can't just use a learning algorithm because the given result would be highly biased due to the inconsistency of the data.

# To do this we are going to pass our data through these different steps:

# 1) Replace all emoticons by their sentiment polarity ||pos||/||neg|| using the emoticon dictionary.
# 2) Replace all URLs with a tag ||url||.
# 3) Remove Unicode characters.
# 4) Decode HTML entities.
# 5) Reduce all letters to lowercase (We should take care of proper nouns but for simplicity we will lower them as well) (After emoticons because they can use upper case letters)
# 6) Replace all usernames/targets @ with ||target||.
# 7) Replace all acronyms with their translation.
# 8) Replace all negations (e.g: not, no, never) by tag ||not||.
# 9) Replace a sequence of repeated characters by two characters (e.g: "helloooo" = "helloo") to keep the emphasized usage of the word.

# We should not forget the importance of the tagging and lexicon phases inside the preprocessing we are doing here.
# In addition, those two are extremely important in terms of NLP and Sentiment Analysis.


# All these techniques will help us reach better results in the validation and testing phases.


In [None]:
import re # Regular Expressions

# Creating the functions to detect and replace emoticons for the first dataset

def make_emoticon_pattern(emoticons):
    """Build a regex source string matching any emoticon of ``emoticons.Smiley``.

    Every emoticon is passed through ``re.escape`` so characters such as ``(``
    or ``|`` are matched literally, then all of them are joined into one
    alternation wrapped in a single capturing group.

    The look-behind / look-ahead require a whitespace character on both sides
    without consuming it, so an emoticon glued to a word is not matched and two
    emoticons separated by a single space are both found.

    Parameters
    ----------
    emoticons : object exposing an iterable ``Smiley`` attribute of strings
        (here a DataFrame with a ``Smiley`` column).

    Returns
    -------
    str
        The pattern source, ready for ``re.compile`` / ``re.sub`` / ``re.findall``.
    """
    alternatives = "|".join(map(re.escape, emoticons.Smiley))
    # Raw strings: "\s" must reach the regex engine untouched. In a normal string
    # literal it is an invalid escape sequence that newer Python versions warn about.
    return r"(?<=\s)(" + alternatives + r")(?=\s)"

def find_with_pattern(pattern, replace=False, tag=None):
    """Search or substitute ``pattern`` in every tweet of the first dataset.

    Operates on the module-level frame ``data`` (column ``SentimentText``).
    Each tweet is padded with one space on both sides before matching so that
    patterns anchored on surrounding whitespace also match at the very start
    and end of a tweet. In replace mode the padded string is what is returned,
    i.e. the padding is kept in the result.

    Parameters
    ----------
    pattern : str or compiled regular expression
    replace : bool
        False -> return, per tweet, the list produced by ``findall``.
        True  -> return, per tweet, the padded text with every match replaced by ``tag``.
    tag : str or None
        Replacement text; mandatory when ``replace`` is True.

    Returns
    -------
    pandas.Series with one entry per tweet (lists of matches, or replaced strings).

    Raises
    ------
    ValueError
        If ``replace`` is True and no ``tag`` was given.
    """
    if replace and tag is None:
        raise ValueError("Parameter error", "If replace=True you should add the tag by which the pattern will be replaced")
    # Compile once and reuse for every tweet (re.compile accepts an already compiled pattern too)
    regex = re.compile(pattern)
    if replace:
        return data.SentimentText.apply(lambda tweet: regex.sub(tag, " " + tweet + " "))
    return data.SentimentText.apply(lambda tweet: regex.findall(" " + tweet + " "))

# Applying the functions by relying on the positive and negative emoticons retrieved from the resources
pos_emoticons_found = find_with_pattern(make_emoticon_pattern(positive_emoticons))
neg_emoticons_found = find_with_pattern(make_emoticon_pattern(negative_emoticons))

# Searching through the dataset to find the number of positive and negative emoticons presented
nb_pos_emoticons = len(pos_emoticons_found[pos_emoticons_found.map(lambda emoticons : len(emoticons) > 0)])
nb_neg_emoticons = len(neg_emoticons_found[neg_emoticons_found.map(lambda emoticons : len(emoticons) > 0)])
print ("Number of positive emoticons in the first dataset: " + str(nb_pos_emoticons))
print("Number of negative emoticons in the first dataset: " + str(nb_neg_emoticons))

In [None]:
# Performing the replacement function to the first dataset
data.SentimentText = find_with_pattern(make_emoticon_pattern(positive_emoticons), True, '||pos||')
data.SentimentText = find_with_pattern(make_emoticon_pattern(negative_emoticons), True, '||neg||')

In [None]:
# Same procedure for the second dataset
def make_emoticon_pattern2(emoticons):
    """Build the emoticon regex source used for the second dataset.

    Escapes every entry of ``emoticons.Smiley`` with ``re.escape``, joins them
    into one alternation inside a capturing group, and requires (without
    consuming it) a whitespace character before and after the emoticon.

    Parameters
    ----------
    emoticons : object exposing an iterable ``Smiley`` attribute of strings.

    Returns
    -------
    str
        The regular-expression source string.
    """
    escaped = [re.escape(smiley) for smiley in emoticons.Smiley]
    # Raw strings keep "\s" as a regex token; in a plain literal it is an invalid
    # string escape that newer Python versions warn about.
    return r"(?<=\s)(" + "|".join(escaped) + r")(?=\s)"

def find_with_pattern2(pattern, replace=False, tag=None):
    """Search or substitute ``pattern`` in every tweet of the second dataset.

    Same contract as the first-dataset helper, but reads the module-level frame
    ``data2`` (column ``SentimentText``). Tweets are padded with one space on
    each side before matching; in replace mode the padded text is returned.

    Parameters
    ----------
    pattern : str or compiled regular expression
    replace : bool
        When True every match is replaced by ``tag``; otherwise the matches
        found by ``findall`` are returned.
    tag : str or None
        Replacement text; mandatory when ``replace`` is True.

    Returns
    -------
    pandas.Series with one entry per tweet.

    Raises
    ------
    ValueError
        If ``replace`` is True and no ``tag`` was given.
    """
    if replace and tag is None:
        raise ValueError("Parameter error", "If replace=True you should add the tag by which the pattern will be replaced")
    # Compile once; the compiled object is reused for every tweet
    regex = re.compile(pattern)

    def padded(tweet):
        return " " + tweet + " "

    if replace:
        return data2.SentimentText.apply(lambda tweet: regex.sub(tag, padded(tweet)))
    return data2.SentimentText.apply(lambda tweet: regex.findall(padded(tweet)))

pos_emoticons_found = find_with_pattern2(make_emoticon_pattern2(positive_emoticons))
neg_emoticons_found = find_with_pattern2(make_emoticon_pattern2(negative_emoticons))

nb_pos_emoticons = len(pos_emoticons_found[pos_emoticons_found.map(lambda emoticons : len(emoticons) > 0)])
nb_neg_emoticons = len(neg_emoticons_found[neg_emoticons_found.map(lambda emoticons : len(emoticons) > 0)])
print ("Number of positive emoticons in the second dataset: " + str(nb_pos_emoticons))
print("Number of negative emoticons in the second dataset: " + str(nb_neg_emoticons))

In [None]:
# Performing the replacement function to the second dataset


data2.SentimentText = find_with_pattern2(make_emoticon_pattern2(positive_emoticons), True, '||pos||')
data2.SentimentText = find_with_pattern2(make_emoticon_pattern2(negative_emoticons), True, '||neg||')
data2.head(10)

In [None]:
# Replacing the URLS


# Using the same method as for emoticons, we find all urls in each tweet and replace them by the tag ||url||

In [None]:
# Printing URLS

# First Dataset 
pattern_url = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
url_found = find_with_pattern(pattern_url)
print ("Number of urls of the first dataset: " + str(len(url_found[url_found.map(lambda urls : len(urls) > 0)])))

In [None]:
# Second Dataset
pattern_url2 = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
url_found2 = find_with_pattern2(pattern_url2)
print ("Number of urls of the second dataset: " + str(len(url_found2[url_found2.map(lambda urls : len(urls) > 0)])))

In [None]:
# Some URL Examples before preprocessing
data2[50:60]

In [None]:
# Replacing all URLS with ||url|| Expression for both datasets
data.SentimentText = find_with_pattern(pattern_url, True, '||url||')
data2.SentimentText = find_with_pattern2(pattern_url2, True, '||url||')
# URLS after preprocessing
data2[50:60]

In [None]:
# We remove unicode characteres since they can cause problems during the tokenization process. We keep only ASCII characteres.


# Some examples of unicode character before preprocessing of dataset2
data2[1578592:1578602]

In [None]:
# Remove Unicode characters keeping asci


# If you are using python 3 you should use both functions
def remove_unicode(text):
    """Encode ``text`` to ASCII, silently dropping every non-ASCII character.

    Returns the encoded value (a ``bytes`` object under Python 3). If the
    encoding step raises ``UnicodeDecodeError`` the input is returned unchanged.
    """
    try:
        return text.encode('ascii', 'ignore')
    except UnicodeDecodeError:
        return text

def remove_unicode2(text):
    """Turn the bytes produced by the ASCII-encoding step back into ``str``.

    The bytes are decoded with the ``unicode_escape`` codec. That codec raises
    ``UnicodeDecodeError`` on malformed backslash sequences (for instance a
    trailing backslash or a truncated ``\\x`` escape). In that case the value
    is decoded as plain ASCII instead, so the result is always a ``str``.
    Leaving it as ``bytes`` would later be rendered as the text ``"b'...'"``
    when the tweet is passed through ``str()``.

    Parameters
    ----------
    text : bytes or str
        A ``str`` is returned unchanged (it has no ``decode`` method).

    Returns
    -------
    str
    """
    if isinstance(text, str):
        return text
    try:
        return text.decode('unicode_escape')
    except UnicodeDecodeError:
        # Malformed escape in the tweet: keep the characters literally
        return text.decode('ascii', 'ignore')

# Removing unicode character of both datasets
data.SentimentText = data.SentimentText.apply(lambda tweet: remove_unicode(tweet))
data2.SentimentText = data2.SentimentText.apply(lambda tweet: remove_unicode(tweet))
data.SentimentText = data.SentimentText.apply(lambda tweet: remove_unicode2(tweet))
data2.SentimentText = data2.SentimentText.apply(lambda tweet: remove_unicode2(tweet))
# After removing Unicode characters
data2[1578592:1578602]

In [None]:
# Before Decoding HTML entities
data2.SentimentText[599982]

In [None]:
# Simply decode HTML entities
import html
# Convert tweets in unicode utf-8 to avoid mixing unicode with ascii and causing an error during unescape
data.SentimentText  = data.SentimentText.apply(lambda tweet: html.unescape(str(tweet)))
data2.SentimentText  = data2.SentimentText.apply(lambda tweet: html.unescape(str(tweet)))
# After decoding HTML entities
data2.SentimentText[599982]

In [None]:
# Reducing all letters to lower case 

#This is part is extremely simple, we just transform all tweets to lower case in order to make easier the next operations with the acronym and stop dictionaries and more generally, to make easier comparisons. We should take care of proper noun but for simplicity we skip this.

# Example before doing the lower case function
data2.head(10)


In [None]:
# Reduce all letters to lower case
data.SentimentText = data.SentimentText.str.lower()
data2.SentimentText = data2.SentimentText.str.lower()

# After preprocessing of letters
data2.head(10)

In [None]:
# Replacing all usernames @ with the tag ||target||


# Since we don't need to take into account usernames in order to determine the sentiment of a tweet we replace them by the tag ||target||.

# Before replacement
data2[45:55]


In [None]:
# Now we want to check how many mentions we have in both datasets

# Regular expression matching a mention: "@" followed by one or more word characters.
# Raw string so that "\w" reaches the regex engine instead of being parsed as an
# (invalid) string escape sequence.
pattern_usernames = r"@\w+"
usernames_found = find_with_pattern(pattern_usernames)
usernames_found2 = find_with_pattern2(pattern_usernames)

In [None]:
# Number of targets in the first dataset
len(data.SentimentText[usernames_found.apply(lambda usernames : len(usernames) > 0)])


In [None]:
# Number of targets in the second dataset
len(data2.SentimentText[usernames_found2.apply(lambda usernames : len(usernames) > 0)])

In [None]:
# Replace all usernames/targets @ with the tag ||target||
data.SentimentText = find_with_pattern(pattern_usernames, True, '||target||')
data2.SentimentText = find_with_pattern2(pattern_usernames, True, '||target||')

# Sample rows after replacement
data2[45:55]

In [None]:
#7) Replace all acronyms with their translation

#Next, we replace all acronyms with their translation using the acronym dictionary.
#At this point, tweets are going to be tokenized by getting rid of the punctuation and using split in order to do the process really fast. We could use nltk.tokenizer but it is definitely much slower (also much more accurate, but that is not a severe problem for us).
#Even though replacements will not be perfect, a simple example is the acronym "im" meaning "instant message". It would not be surprising that in most of the cases, "im" means "I am". For that, some improvements will be done later on to enhance our results


In [None]:
from collections import Counter

# Create a dictionary of acronym which will be used to get translations
acronym_dictionary = dict(zip(acronyms.Acronym, acronyms.Translation))

# Will be used to get rid of the punctuation in tweets (does not include | since we use it for our tokens and ' 
# to take care of don't, can't)
punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{}~'

# Frequency table for acronyms
acronyms_counter = Counter()

# Tokenize a tweet and expand every token that is a known acronym
def acronym_to_translation(tweet, acronyms_counter):
    """Split ``tweet`` into tokens and replace acronyms by their translation.

    Each character listed in the module-level ``punctuation`` string is turned
    into a space, then the text is split on whitespace. A token that is a key of
    the module-level ``acronym_dictionary`` is replaced by the words of its
    translation and counted in ``acronyms_counter``; any other token is kept.

    Parameters
    ----------
    tweet : value converted with ``str()`` before processing
    acronyms_counter : mapping supporting ``counter[key] += 1`` (a Counter here);
        updated in place.

    Returns
    -------
    list of str
        The tokens of the tweet after expansion. Only the token list is
        returned; the acronyms used are reported through ``acronyms_counter``.
    """
    blanks = " " * len(punctuation)
    tokens = str(tweet).translate(str.maketrans(punctuation, blanks)).split()
    expanded = []
    for token in tokens:
        if token in acronym_dictionary:
            acronyms_counter[token] += 1
            expanded += acronym_dictionary[token].split()
        else:
            expanded.append(token)
    return expanded

data.SentimentText = data.SentimentText.apply(lambda tweet: acronym_to_translation(str(tweet), acronyms_counter))

# Get and display top20 acronyms of the first dataset
top20acronyms = acronyms_counter.most_common(20)
top20acronyms

In [None]:
# Same process for the second dataset

# Create a dictionary of acronym which will be used to get translations
acronym_dictionary = dict(zip(acronyms.Acronym, acronyms.Translation))

# Will be used to get rid of the punctuation in tweets (does not include | since we use it for our tokens and ' 
# to take care of don't, can't)
punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{}~'

# Frequency table for acronyms
acronyms_counter2 = Counter()

# Tokenize a tweet of the second dataset and expand every token that is a known acronym
def acronym_to_translation2(tweet, acronyms_counter2):
    """Tokenize ``tweet`` and expand acronyms, counting them in ``acronyms_counter2``.

    Characters of the module-level ``punctuation`` string become spaces and the
    text is split on whitespace. Tokens found in the module-level
    ``acronym_dictionary`` are replaced by the individual words of their
    translation; other tokens pass through unchanged.

    Parameters
    ----------
    tweet : value converted with ``str()`` before processing
    acronyms_counter2 : mapping supporting ``counter[key] += 1``; updated in place.

    Returns
    -------
    list of str
        The expanded token list (the acronyms used are only reported through
        the counter, they are not returned).
    """
    to_spaces = str.maketrans(punctuation, " " * len(punctuation))
    result = []
    for piece in str(tweet).translate(to_spaces).split():
        if piece not in acronym_dictionary:
            result.append(piece)
            continue
        acronyms_counter2[piece] += 1
        result.extend(acronym_dictionary[piece].split())
    return result

data2.SentimentText = data2.SentimentText.apply(lambda tweet: acronym_to_translation2(str(tweet ), acronyms_counter2))

# Get and display top20 acronyms of the second dataset
top20acronyms2 = acronyms_counter2.most_common(20)
top20acronyms2

In [None]:
# Just to better visualize the top 20 acronym
print("Dataset 1:")
for i, (acronym, value) in enumerate(top20acronyms):
    print (str(i + 1) + ") " + acronym + " => " + acronym_dictionary[acronym] + " : " + str(value) )

In [None]:
# Just to better visualize the top 20 acronym
print('Dataset 2:')
for i, (acronym, value) in enumerate(top20acronyms2):
    print (str(i + 1) + ") " + acronym + " => " + acronym_dictionary[acronym] + " : " + str(value) )

In [None]:
# With a bar plot
plt.close()
top20acronym_keys = [acronym for acronym, _ in top20acronyms]
top20acronym_values = [count for _, count in top20acronyms]
indexes = np.arange(len(top20acronym_keys))
width = 0.7
# align='center' (matplotlib's default) centres each bar on its x position ...
plt.bar(indexes, top20acronym_values, width, align='center')
plt.title('Top 20 acronyms in Dataset 1')
# ... so the tick labels belong at the same positions. Shifting them by width / 2
# would put every label under the right edge of its bar.
plt.xticks(indexes, top20acronym_keys, rotation="vertical")

In [None]:
# With a bar plot
plt.close()
top20acronym_keys2 = [acronym for acronym, _ in top20acronyms2]
top20acronym_values2 = [count for _, count in top20acronyms2]
indexes = np.arange(len(top20acronym_keys2))
width = 0.7
# align='center' (matplotlib's default) centres each bar on its x position ...
plt.bar(indexes, top20acronym_values2, width, align='center')
plt.title('Top 20 acronyms in Dataset 2')
# ... so the tick labels belong at the same positions. Shifting them by width / 2
# would put every label under the right edge of its bar.
plt.xticks(indexes, top20acronym_keys2, rotation="vertical")

In [None]:
# Replace all negations by tag ||not||

#We replace all negations such as not, no, don't and so on, using the negation dictionary in order to take more or less of sentences like "I don't like it". Here like should not be considered as positive because of the "don't" before. To do so we will replace "don't" by ||not|| and the word like will not be counted as positive.
#In general, each time a negation is encountered, the words followed by the negation word contained in the positive and negative word dictionaries will be reversed, positive becomes negative, negative becomes positive, we will do this when we will try to find positive and negative words..
# Since we replaced the negations and the positive and negative words, the tagging will reduce the harm due to the combination of both

# Before replacement
print (data2.SentimentText[29])

In [None]:
# Transform the dataframe into a dictionary
negation_dictionary = dict(zip(negation_words.Negation, negation_words.Tag))

# Find a negation in a tweet and replace it by its tag
def replace_negation(tweet):
    """Return the tokens of ``tweet`` with every negation swapped for its tag.

    A token that is a key of the module-level ``negation_dictionary`` is
    replaced by the mapped value; every other token is kept as is.
    """
    tagged = []
    for token in tweet:
        tagged.append(negation_dictionary.get(token, token))
    return tagged
    
# Apply the function on every tweet
data.SentimentText = data.SentimentText.apply(lambda tweet: replace_negation(tweet))
data2.SentimentText = data2.SentimentText.apply(lambda tweet: replace_negation(tweet))
# After replacement
print (data2.SentimentText[29])

In [None]:
#Replace a sequence of repeated characters by two caracters

#There are many words containing repeated sequences of charaters usually used to emphasize a word.
#We are going to reduce the number of repeated charaters in order to potentially reduce the feature space (the words in our case) and keep their emphasized aspect

# Sample data before removing repeated characters
data2[1578604:]

In [None]:
#Replace a sequence of repeated characters by two characters

# We use the corresponding regular expression to detected repeated characters inside a word
# One match per run of identical consecutive characters. DOTALL lets "." match a
# newline too, so every character of the word belongs to exactly one run and
# nothing is silently dropped when the runs are joined back together.
pattern = re.compile(r'(.)\1*', re.DOTALL)

def reduce_sequence_word(word):
    """Shorten every run of three or more identical characters to two.

    Runs of length one or two are kept unchanged, e.g. "helloooo" -> "helloo".

    Parameters
    ----------
    word : str

    Returns
    -------
    str
    """
    # Slicing to two characters leaves runs of length 1 or 2 untouched
    return ''.join(run.group()[:2] for run in pattern.finditer(word))

def reduce_sequence_tweet(tweet):
    """Apply ``reduce_sequence_word`` to each token of ``tweet`` and return the new list."""
    return list(map(reduce_sequence_word, tweet))

# Applying the replacements functions to both datasets
data.SentimentText = data.SentimentText.apply(lambda tweet: reduce_sequence_tweet(tweet))
data2.SentimentText = data2.SentimentText.apply(lambda tweet: reduce_sequence_tweet(tweet))

# After removing repeated characters
data2[1578604:]

In [None]:
#First Dataset
def make_training_test_sets(data, train_fraction=9./10.):
    """Split tweets into a training and a test set, class by class.

    The rows are shuffled (NumPy global random state), the token list of each
    tweet is joined back into one space-separated string, and the positive
    (``Sentiment == 1``) and negative (``Sentiment == 0``) tweets are cut
    separately so both sets keep the class proportions. The frame passed in is
    not modified: all work is done on a shuffled copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``Sentiment`` and ``SentimentText``.
    train_fraction : float, default 9/10
        Share of each sentiment class that goes to the training set; the
        remainder goes to the test set. Must lie strictly between 0 and 1.

    Returns
    -------
    (training_tweets, test_tweets) : tuple of pandas.DataFrame
        Both shuffled and re-indexed from 0.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # Before making the training and test set, we shuffle our data set in order to avoid keeping any order
    data = data.iloc[np.random.permutation(len(data))].reset_index(drop=True)

    # Join the words back into one string separated by space for each tweet.
    # A tweet that is already a string is kept as is: " ".join on a string would
    # insert a space between every character.
    data.SentimentText = data.SentimentText.apply(
        lambda tweet: tweet if isinstance(tweet, str) else " ".join(tweet))

    # Separate positive and negative tweets
    positive_tweets = data[data.Sentiment == 1]
    negative_tweets = data[data.Sentiment == 0]

    # Cutoff position inside each sentiment class
    positive_tweets_cutoff = int(len(positive_tweets) * train_fraction)
    negative_tweets_cutoff = int(len(negative_tweets) * train_fraction)

    # Make the training and test set
    training_tweets = pd.concat([positive_tweets[:positive_tweets_cutoff], negative_tweets[:negative_tweets_cutoff]])
    test_tweets = pd.concat([positive_tweets[positive_tweets_cutoff:], negative_tweets[negative_tweets_cutoff:]])

    # We shuffle the training and test set to break the order of tweets based on their sentiment
    training_tweets = training_tweets.iloc[np.random.permutation(len(training_tweets))].reset_index(drop=True)
    test_tweets = test_tweets.iloc[np.random.permutation(len(test_tweets))].reset_index(drop=True)

    return training_tweets, test_tweets

training_tweets, test_tweets = make_training_test_sets(data)

print ("size of training set of first dataset: " + str(len(training_tweets)))
print ("size of test set of first dataset: " + str(len(test_tweets)))

In [None]:
# Second Dataset
def make_training_test_sets2(data2, train_fraction=3./4.):
    """Split the second dataset into a training and a test set, class by class.

    The rows are shuffled (NumPy global random state), each tweet's token list
    is joined into one space-separated string, then the positive
    (``Sentiment == 1``) and negative (``Sentiment == 0``) tweets are cut
    separately so both sets keep the class proportions. The frame passed in is
    not modified: all work is done on a shuffled copy.

    Parameters
    ----------
    data2 : pandas.DataFrame
        Needs the columns ``Sentiment`` and ``SentimentText``.
    train_fraction : float, default 3/4
        Share of each sentiment class that goes to the training set. Must lie
        strictly between 0 and 1.

    Returns
    -------
    (training_tweets2, test_tweets2) : tuple of pandas.DataFrame
        Both shuffled and re-indexed from 0.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # Before making the training and test set, we shuffle our data set in order to avoid keeping any order
    data2 = data2.iloc[np.random.permutation(len(data2))].reset_index(drop=True)

    # Join the words back into one string separated by space for each tweet.
    # An already-joined string is left alone (" ".join on a string would space out its characters).
    data2.SentimentText = data2.SentimentText.apply(
        lambda tweet: tweet if isinstance(tweet, str) else " ".join(tweet))

    # Separate positive and negative tweets
    positive_tweets = data2[data2.Sentiment == 1]
    negative_tweets = data2[data2.Sentiment == 0]

    # Cutoff position inside each sentiment class
    positive_tweets_cutoff = int(len(positive_tweets) * train_fraction)
    negative_tweets_cutoff = int(len(negative_tweets) * train_fraction)

    # Make the training and test set
    training_tweets2 = pd.concat([positive_tweets[:positive_tweets_cutoff], negative_tweets[:negative_tweets_cutoff]])
    test_tweets2 = pd.concat([positive_tweets[positive_tweets_cutoff:], negative_tweets[negative_tweets_cutoff:]])

    # We shuffle the training and test set to break the order of tweets based on their sentiment
    training_tweets2 = training_tweets2.iloc[np.random.permutation(len(training_tweets2))].reset_index(drop=True)
    test_tweets2 = test_tweets2.iloc[np.random.permutation(len(test_tweets2))].reset_index(drop=True)

    return training_tweets2, test_tweets2

training_tweets2, test_tweets2 = make_training_test_sets2(data2)

print ("size of training set of second dataset: " + str(len(training_tweets2)))
print ("size of test set of second dataset: " + str(len(test_tweets2)))

In [None]:
# Now we will start training and validating the data of dataset1 unigram

# We will use Sklearn for kfolding, calculating the metrics: precision, accuracy, recall, f1 score and confusion matrix
# We will be using the famous classification algorithms Multinomial Naive Bayes, KNeighbor, SGD, Logistic regression, Decision tree, and SVM
# we import pickle to save the models as a pickle file, to be used later on for testing
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
import pickle

# Starting with dataset one we will perform all the algorithms before removing stop words and before stemming, and using unigrams only
def classify_unigram(training_tweets, test_tweets, ngram=(1, 1),
                     model_dir="C:/Users/USER/Desktop/BE/Models/"):
    """Cross-validate six bag-of-words classifiers and pickle each one's best fold.

    For every classifier a 10-fold cross-validation is run on
    ``training_tweets``. Inside each fold the CountVectorizer is fitted on the
    training part only, the classifier is fitted and then evaluated on the
    validation part. After the folds of a model are done, its mean F1,
    accuracy, precision and recall and its summed confusion matrix are printed.

    All statistics are collected per model: the confusion matrix and the fold
    metrics start from zero for every classifier, so the figures printed for
    one model never include the folds of the models evaluated before it.

    The fold with the highest F1 score of each model is written to
    ``<model_dir>/<name>.pkl``. The file holds two consecutive pickles, first
    the fitted classifier and then the fitted CountVectorizer, so it has to be
    read back with two ``pickle.load`` calls in that order.

    Side effects: rebinds the module-level names ``models`` (list of
    ``(name, estimator)`` pairs) and ``scores`` (F1 of every fold of every
    model, in evaluation order).

    Parameters
    ----------
    training_tweets : pandas.DataFrame
        Needs the columns ``SentimentText`` (one string per tweet) and
        ``Sentiment`` (labels 0 / 1).
    test_tweets : pandas.DataFrame
        Not used by this function; kept so existing calls keep working.
    ngram : tuple (min_n, max_n)
        Passed to ``CountVectorizer(ngram_range=...)``.
    model_dir : str
        Existing directory that receives the ``.pkl`` files.

    Returns
    -------
    None
    """
    import os  # local import: only needed to build the pickle path

    global models, scores
    models = [
        ('Multi_uni_before_ss', MultinomialNB()),
        ('DT_uni_before_ss', DecisionTreeClassifier()),
        ('KN_uni_before_ss', KNeighborsClassifier()),
        ('SGD_uni_before_ss', SGDClassifier()),
        ('SVM_uni_before_ss', SVC()),
        ('LR_uni_before_ss', LogisticRegression(solver='lbfgs')),
    ]
    # F1 scores of every fold of every model
    scores = []

    # Provides train/validation indices to split the training data
    k_fold = KFold(n_splits=10, shuffle=True)

    # Used to convert a collection of text documents to a matrix of token counts => Bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    texts = training_tweets['SentimentText'].values
    labels = training_tweets['Sentiment'].values

    for name, model in models:
        print(name)

        # Per-model accumulators. Confusion matrix rows are true labels and
        # columns are predicted labels, in the label order [0, 1].
        confusion = np.zeros((2, 2), dtype=np.int64)
        fold_f1, fold_accuracy, fold_precision, fold_recall = [], [], [], []
        best_f1 = None
        pkl_filename = os.path.join(model_dir, name + ".pkl")

        for training_indices, validation_indices in k_fold.split(training_tweets):
            # Vocabulary is learned from the training part of the fold only
            training_features = count_vectorizer.fit_transform(texts[training_indices])
            training_labels = labels[training_indices]

            validation_features = count_vectorizer.transform(texts[validation_indices])
            validation_labels = labels[validation_indices]

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            # labels=[0, 1] keeps the matrix 2x2 even if a fold happens to miss one class
            confusion += confusion_matrix(validation_labels, validation_predictions, labels=[0, 1])
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the fold with the best F1 score. The first fold is always
            # saved, so a file exists even when every F1 score is 0.
            if best_f1 is None or score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    # saving the classifier with the countvectorizer to be used later on
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(fold_f1)
        print('\n')
        print ('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        print ('F1 Score: ' + str(sum(fold_f1) / n_folds * 100) + '%')
        print ('Accuracy: ' + str(sum(fold_accuracy) / n_folds * 100) + '%')
        print ('Precision: ' + str(sum(fold_precision) / n_folds * 100) + '%')
        print ('Recall: ' + str(sum(fold_recall) / n_folds * 100) + '%')
        print('\n')
        print ('Confusion matrix:')
        print (confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')
        

        
classify_unigram(training_tweets, test_tweets)

In [None]:
# Same procedure for the second dataset, using only Multinomial NB and unigrams
def classify_unigram2(training_tweets2, test_tweets2, ngram=(1, 1)):
    """Cross-validate Multinomial Naive Bayes on unigram counts of dataset 2.

    A shuffled 10-fold split of ``training_tweets2`` is run. In every fold the
    CountVectorizer is refitted on the training part only, the classifier is
    fitted on those counts and scored on the held-out part. The classifier and
    the vectorizer of the fold with the best F1 score are pickled, in that
    order, to ``<model name>.pkl``.

    Args:
        training_tweets2: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets2: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold), writes the pickle and prints a report.
    """
    global models, scores
    models = [('Multi_uni_before_ss2', MultinomialNB())]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators live inside the model loop so a report never mixes models.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_fold = training_tweets2.iloc[training_indices]
            validation_fold = training_tweets2.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        # All four metrics are means over the folds.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigram2(training_tweets2, test_tweets2)

In [None]:
# Now it's unigrams and bigrams of dataset1
def classify_unigrambigram(training_tweets, test_tweets, ngram=(1, 2)):
    """Cross-validate six classifiers on unigram+bigram counts of dataset 1.

    Each classifier is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In every fold the CountVectorizer is refitted on the
    training part only, the classifier is fitted on those counts and scored on
    the held-out part. The classifier and the vectorizer of the fold with the
    best F1 score are pickled, in that order, to ``<model name>.pkl``.

    Args:
        training_tweets: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold of every model, in run order), writes one
        pickle per model and prints one report per model.
    """
    global models, scores
    models = [
        ('Multi_unibi_before_ss', MultinomialNB()),
        ('DT_unibi_before_ss', DecisionTreeClassifier()),
        ('KN_unibi_before_ss', KNeighborsClassifier()),
        ('SGD_unibi_before_ss', SGDClassifier()),
        ('SVM_unibi_before_ss', SVC()),
        ('LR_unibi_before_ss', LogisticRegression(solver='lbfgs')),
    ]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators are reset for every model so that the folds of one
        # model do not leak into the report of the next one.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_fold = training_tweets.iloc[training_indices]
            validation_fold = training_tweets.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        # All four metrics are means over the folds of the current model.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigrambigram(training_tweets, test_tweets)

In [None]:
# Unigram and bigram usage for dataset2
def classify_unigrambigram2(training_tweets2, test_tweets2, ngram=(1, 2)):
    """Cross-validate Multinomial Naive Bayes on unigram+bigram counts of dataset 2.

    A shuffled 10-fold split of ``training_tweets2`` is run. In every fold the
    CountVectorizer is refitted on the training part only, the classifier is
    fitted on those counts and scored on the held-out part. The classifier and
    the vectorizer of the fold with the best F1 score are pickled, in that
    order, to ``<model name>.pkl``.

    Args:
        training_tweets2: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets2: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold), writes the pickle and prints a report.
    """
    global models, scores
    models = [('Multi_unibi_before_ss2', MultinomialNB())]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators live inside the model loop so a report never mixes models.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_fold = training_tweets2.iloc[training_indices]
            validation_fold = training_tweets2.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        # All four metrics are means over the folds.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigrambigram2(training_tweets2, test_tweets2)

In [None]:
# Using bigrams only for dataset1
def classify_bigram(training_tweets, test_tweets, ngram=(2, 2)):
    """Cross-validate six classifiers on bigram counts of dataset 1.

    Each classifier is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In every fold the CountVectorizer is refitted on the
    training part only, the classifier is fitted on those counts and scored on
    the held-out part. The classifier and the vectorizer of the fold with the
    best F1 score are pickled, in that order, to ``<model name>.pkl``.

    Args:
        training_tweets: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold of every model, in run order), writes one
        pickle per model and prints one report per model.
    """
    global models, scores
    models = [
        ('Multi_bi_before_ss', MultinomialNB()),
        ('DT_bi_before_ss', DecisionTreeClassifier()),
        ('KN_bi_before_ss', KNeighborsClassifier()),
        ('SGD_bi_before_ss', SGDClassifier()),
        ('SVM_bi_before_ss', SVC()),
        ('LR_bi_before_ss', LogisticRegression(solver='lbfgs')),
    ]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators are reset for every model so that the folds of one
        # model do not leak into the report of the next one.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_fold = training_tweets.iloc[training_indices]
            validation_fold = training_tweets.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        # All four metrics are means over the folds of the current model.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_bigram(training_tweets, test_tweets)

In [None]:
# Using bigrams only for dataset2
def classify_bigram2(training_tweets2, test_tweets2, ngram=(2, 2)):
    """Cross-validate Multinomial Naive Bayes on bigram counts of dataset 2.

    A shuffled 10-fold split of ``training_tweets2`` is run. In every fold the
    CountVectorizer is refitted on the training part only, the classifier is
    fitted on those counts and scored on the held-out part. The classifier and
    the vectorizer of the fold with the best F1 score are pickled, in that
    order, to ``<model name>.pkl``.

    Args:
        training_tweets2: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets2: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold), writes the pickle and prints a report.
    """
    global models, scores
    models = [('Multi_bi_before_ss2', MultinomialNB())]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators live inside the model loop so a report never mixes models.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_fold = training_tweets2.iloc[training_indices]
            validation_fold = training_tweets2.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        # All four metrics are means over the folds.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_bigram2(training_tweets2, test_tweets2)

In [None]:
# Word frequency table for dataset 1, built to see which words are used most
word_frequency_table = Counter()

def count_word(tweet):
    """Count every token of `tweet` in the global `word_frequency_table`.

    `tweet` is iterated token by token; it is returned unchanged so the
    function can be used with `Series.map`.
    """
    word_frequency_table.update(tweet)
    return tweet

# First dataset: the mapped Series is discarded, only the counting matters
data.SentimentText.map(count_word)
print('Most frequent words in dataset 1:')
word_frequency_table.most_common(20)

In [None]:
# Word frequency table for dataset 2, built to see which words are used most
word_frequency_table2 = Counter()

def count_word2(tweet):
    """Count every token of `tweet` in the global `word_frequency_table2`.

    `tweet` is iterated token by token; it is returned unchanged so the
    function can be used with `Series.map`.
    """
    word_frequency_table2.update(tweet)
    return tweet

# Second dataset: the mapped Series is discarded, only the counting matters
data2.SentimentText.map(count_word2)
print('Most frequent words in dataset 2:')
word_frequency_table2.most_common(20)

In [None]:
 # list of tags
tags = ['||target||', '||url||', '||pos||', '||neg||', '||not||']

# (tag, count) pairs of dataset1, in the order the tags appear in the frequency table
tag_counter = [entry for entry in word_frequency_table.items() if entry[0] in tags]
print('Tag counter in dataset 1: ')
print(tag_counter)

In [None]:
 # list of tags
tags = ['||target||', '||url||', '||pos||', '||neg||', '||not||']

# (tag, count) pairs of dataset2, in the order the tags appear in the frequency table
tag_counter2 = [entry for entry in word_frequency_table2.items() if entry[0] in tags]
print('Tag counter in dataset 2: ')
print(tag_counter2)

In [None]:
# Bar chart of the tag counts of dataset 1
plt.close()
tag_counter_keys = [tag for tag, _count in tag_counter]
tag_counter_values = [count for _tag, count in tag_counter]
indexes = np.arange(len(tag_counter_keys))
width = 0.7
# Bars are centred on `indexes` explicitly and the ticks use the same
# positions, so every label sits under the middle of its bar regardless of
# the default bar alignment of the installed matplotlib.
plt.bar(indexes, tag_counter_values, width, align='center')
plt.title("Counts in Dataset1")
plt.xticks(indexes, tag_counter_keys, rotation="vertical")


In [None]:
# Bar chart of the tag counts of dataset 2
plt.close()
tag_counter_keys2 = [tag for tag, _count in tag_counter2]
tag_counter_values2 = [count for _tag, count in tag_counter2]
indexes = np.arange(len(tag_counter_keys2))
width = 0.7
# Bars are centred on `indexes` explicitly and the ticks use the same
# positions, so every label sits under the middle of its bar regardless of
# the default bar alignment of the installed matplotlib.
plt.bar(indexes, tag_counter_values2, width, align='center')
plt.title("Counts in Dataset2")
plt.xticks(indexes, tag_counter_keys2, rotation="vertical")


In [None]:
# Stop-word lookup: the keys are the entries of stops.Word, every value is None
stopword_dictionary = dict.fromkeys(stops.Word, None)

def remove_stopwords(tweet):
    """Return the tokens of `tweet` that are neither stop words nor falsy (e.g. empty strings)."""
    return [token for token in tweet if token not in stopword_dictionary and token]

# Strip the stop words from both dataset1 and dataset2
data.SentimentText = data.SentimentText.apply(remove_stopwords)
data2.SentimentText = data2.SentimentText.apply(remove_stopwords)

In [None]:
# Recount dataset1 from scratch now that the stop words are gone;
# count_word looks the table up by name, so rebinding it here is enough
word_frequency_table = Counter()

data.SentimentText.map(count_word)
print('Most common words after deleting stop words in dataset1: ')
print(word_frequency_table.most_common(20))

In [None]:
# Recount dataset2 from scratch now that the stop words are gone;
# count_word2 looks the table up by name, so rebinding it here is enough
word_frequency_table2 = Counter()

data2.SentimentText.map(count_word2)
print('Most common words after deleting stop words in dataset2: ')
print(word_frequency_table2.most_common(20))

In [None]:
# Same procedure after removing stopwords
# unigrams only after removing stop words for dataset1
def classify_unigram_nostop(training_tweets, test_tweets, ngram=(1, 1)):
    """Cross-validate six classifiers on unigram counts of dataset 1 (stop-word-free run).

    Each classifier is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In every fold the CountVectorizer is refitted on the
    training part only, the classifier is fitted on those counts and scored on
    the held-out part. The classifier and the vectorizer of the fold with the
    best F1 score are pickled, in that order, to ``<model name>.pkl``.

    Args:
        training_tweets: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold of every model, in run order), writes one
        pickle per model and prints one report per model.
    """
    global models, scores
    models = [
        ('Multi_uni_nostop', MultinomialNB()),
        ('DT_uni_nostop', DecisionTreeClassifier()),
        ('KN_uni_nostop', KNeighborsClassifier()),
        ('SGD_uni_nostop', SGDClassifier()),
        ('SVM_uni_nostop', SVC()),
        ('LR_uni_nostop', LogisticRegression(solver='lbfgs')),
    ]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators are reset for every model so that the folds of one
        # model do not leak into the report of the next one.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_fold = training_tweets.iloc[training_indices]
            validation_fold = training_tweets.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        # All four metrics are means over the folds of the current model.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigram_nostop(training_tweets, test_tweets)

In [None]:
# Dataset2 no stop words and unigrams only
def classify_unigram_nostop2(training_tweets2, test_tweets2, ngram=(1, 1)):
    """Cross-validate Multinomial Naive Bayes on unigram counts of dataset 2 (stop-word-free run).

    A shuffled 10-fold split of ``training_tweets2`` is run. In every fold the
    CountVectorizer is refitted on the training part only, the classifier is
    fitted on those counts and scored on the held-out part. The classifier and
    the vectorizer of the fold with the best F1 score are pickled, in that
    order, to ``<model name>.pkl``.

    Args:
        training_tweets2: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets2: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold), writes the pickle and prints a report.
    """
    global models, scores
    models = [('Multi_uni_nostop2', MultinomialNB())]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators live inside the model loop so a report never mixes models.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        # The folds must be drawn from the frame that is indexed below
        # (training_tweets2), not from the dataset 1 global of a similar name.
        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_fold = training_tweets2.iloc[training_indices]
            validation_fold = training_tweets2.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        # All four metrics are means over the folds.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigram_nostop2(training_tweets2, test_tweets2)

In [None]:
# Dataset1 without stop words, with unigrams and bigrams
def classify_unigrambigram_nostop(training_tweets, test_tweets, ngram=(1, 2)):
    """Cross-validate six classifiers on unigram+bigram counts of dataset 1 (stop-word-free run).

    Each classifier is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In every fold the CountVectorizer is refitted on the
    training part only, the classifier is fitted on those counts and scored on
    the held-out part. The classifier and the vectorizer of the fold with the
    best F1 score are pickled, in that order, to ``<model name>.pkl``.

    Args:
        training_tweets: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold of every model, in run order), writes one
        pickle per model and prints one report per model.
    """
    global models, scores
    models = [
        ('Multi_unibi_nostop', MultinomialNB()),
        ('DT_unibi_nostop', DecisionTreeClassifier()),
        ('KN_unibi_nostop', KNeighborsClassifier()),
        ('SGD_unibi_nostop', SGDClassifier()),
        ('SVM_unibi_nostop', SVC()),
        ('LR_unibi_nostop', LogisticRegression(solver='lbfgs')),
    ]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators are reset for every model so that the folds of one
        # model do not leak into the report of the next one.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_fold = training_tweets.iloc[training_indices]
            validation_fold = training_tweets.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        # All four metrics are means over the folds of the current model.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigrambigram_nostop(training_tweets, test_tweets)

In [None]:
# Dataset2 for unigrams and bigrams after removing stopwords
def classify_unigrambigram_nostop2(training_tweets2, test_tweets2, ngram=(1, 2)):
    """Cross-validate Multinomial Naive Bayes on unigram+bigram counts of dataset 2 (stop-word-free run).

    A shuffled 10-fold split of ``training_tweets2`` is run. In every fold the
    CountVectorizer is refitted on the training part only, the classifier is
    fitted on those counts and scored on the held-out part. The classifier and
    the vectorizer of the fold with the best F1 score are pickled, in that
    order, to ``<model name>.pkl``.

    Args:
        training_tweets2: DataFrame with ``SentimentText`` and ``Sentiment``
            columns.
        test_tweets2: Not used by this function; kept so existing calls work.
        ngram: ``ngram_range`` handed to CountVectorizer.

    Side effects:
        Rebinds the globals ``models`` and ``scores`` (``scores`` collects the
        F1 score of every fold), writes the pickle and prints a report.
    """
    global models, scores
    models = [('Multi_unibi_nostop2', MultinomialNB())]
    scores = []

    # Provides train/validation indices for the 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words.
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Pickle that receives the best fold of this model, to be used later in testing.
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/"+name+".pkl"

        # Accumulators live inside the model loop so a report never mixes models.
        # Confusion matrix summed over the folds (rows: true class, columns: predicted class).
        confusion = np.array([[0, 0], [0, 0]])
        f1_per_fold, accuracy_per_fold = [], []
        precision_per_fold, recall_per_fold = [], []
        # Start below any possible F1 so a pickle is written even if every fold scores 0.
        best_f1 = -1.0

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_fold = training_tweets2.iloc[training_indices]
            validation_fold = training_tweets2.iloc[validation_indices]

            # The vocabulary is learnt from the training part of the fold only.
            training_features = count_vectorizer.fit_transform(training_fold['SentimentText'].values)
            validation_features = count_vectorizer.transform(validation_fold['SentimentText'].values)
            validation_labels = validation_fold['Sentiment'].values

            model.fit(training_features, training_fold['Sentiment'].values)
            validation_predictions = model.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            f1_per_fold.append(score)
            scores.append(score)
            accuracy_per_fold.append(accuracy_score(validation_labels, validation_predictions))
            precision_per_fold.append(precision_score(validation_labels, validation_predictions))
            recall_per_fold.append(recall_score(validation_labels, validation_predictions))

            # Dump right away: model and vectorizer are refitted by the next fold.
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(model, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        n_folds = len(f1_per_fold)
        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        # All four metrics are means over the folds.
        print('F1 Score: ' + str(sum(f1_per_fold)/n_folds*100) + '%')
        print('Accuracy: ' + str(sum(accuracy_per_fold)/n_folds*100) + '%')
        print('Precision: ' + str(sum(precision_per_fold)/n_folds*100) + '%')
        print('Recall: ' + str(sum(recall_per_fold)/n_folds*100) + '%')
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigrambigram_nostop2(training_tweets2, test_tweets2)

In [None]:
# Dataset1 bigrams only after removing stopwords
def classify_bigram_nostop(training_tweets, test_tweets, ngram=(2, 2)):
    """Cross-validate six classifiers on bigram counts (dataset 1, stopwords removed).

    Every model is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the model's best F1 so far, the fitted
    classifier and the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The report printed per model shows the mean F1 / accuracy / precision /
    recall over that model's own folds and the confusion matrix summed over
    those folds.

    Args:
        training_tweets: frame indexed with ``.iloc`` that has ``SentimentText``
            and ``Sentiment`` columns.
        test_tweets: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold of every model.
    """
    global models, scores
    models = [
        ('Multi_bi_nostop', MultinomialNB()),
        ('DT_bi_nostop', DecisionTreeClassifier()),
        ('KN_bi_nostop', KNeighborsClassifier()),
        ('SGD_bi_nostop', SGDClassifier()),
        ('SVM_bi_nostop', SVC()),
        ('LR_bi_nostop', LogisticRegression(solver='lbfgs')),
    ]
    # F1 scores of every fold of every model
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators: reset here so one model's report is not
        # mixed with the folds of the models evaluated before it.
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_rows = training_tweets.iloc[training_indices]
            validation_rows = training_tweets.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_bigram_nostop(training_tweets, test_tweets)

In [None]:
# Dataset2 after removing stopwords and using only bigrams
def classify_bigram_nostop2(training_tweets2, test_tweets2, ngram=(2, 2)):
    """Cross-validate MultinomialNB on bigram counts (dataset 2, stopwords removed).

    The model is evaluated with a shuffled 10-fold split of
    ``training_tweets2``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the best F1 so far, the fitted classifier and
    the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The printed report shows the mean F1 / accuracy / precision / recall over
    the folds and the confusion matrix summed over them.

    Args:
        training_tweets2: frame indexed with ``.iloc`` that has
            ``SentimentText`` and ``Sentiment`` columns.
        test_tweets2: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold.
    """
    global models, scores
    models = [('Multi_bi_nostop2', MultinomialNB())]
    # F1 scores of every fold
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators for the report printed after the folds
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_rows = training_tweets2.iloc[training_indices]
            validation_rows = training_tweets2.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_bigram_nostop2(training_tweets2, test_tweets2)

In [None]:
# Before stemming: display dataset 2 as a reference point for the stemming step below
# NOTE(review): this renders the whole frame; data2.head() would keep the output small
data2

In [None]:
# Now we use porterstemmer for stemming to check whether our scores will improve or not
import nltk

# One Porter stemmer instance shared by every call to stemming_words
pstemmer = nltk.PorterStemmer()


def stemming_words(tweet):
    """Return the items of ``tweet`` stemmed with the Porter stemmer.

    Items found in the module-level ``tags`` collection are passed through
    unchanged; every other item is replaced by ``pstemmer.stem(item)``.
    """
    stemmed = []
    for word in tweet:
        if word in tags:
            stemmed.append(word)
        else:
            stemmed.append(pstemmer.stem(word))
    return stemmed


# Apply the stemming function to the text column of both datasets
# NOTE(review): assumes SentimentText holds token lists at this point; iterating
# a plain string would stem single characters - TODO confirm
data.SentimentText = data.SentimentText.apply(stemming_words)
data2.SentimentText = data2.SentimentText.apply(stemming_words)

In [None]:
# After stemming: display dataset 2 again to compare it with the pre-stemming view
# NOTE(review): this renders the whole frame; data2.head() would keep the output small
data2

In [None]:
# Dataset1 unigrams only after stemming
def classify_unigram_stem(training_tweets, test_tweets, ngram=(1, 1)):
    """Cross-validate six classifiers on unigram counts (dataset 1, after stemming).

    Every model is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the model's best F1 so far, the fitted
    classifier and the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The report printed per model shows the mean F1 / accuracy / precision /
    recall over that model's own folds and the confusion matrix summed over
    those folds.

    Args:
        training_tweets: frame indexed with ``.iloc`` that has ``SentimentText``
            and ``Sentiment`` columns.
        test_tweets: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold of every model.
    """
    global models, scores
    models = [
        ('Multi_uni_stem', MultinomialNB()),
        ('DT_uni_stem', DecisionTreeClassifier()),
        ('KN_uni_stem', KNeighborsClassifier()),
        ('SGD_uni_stem', SGDClassifier()),
        ('SVM_uni_stem', SVC()),
        ('LR_uni_stem', LogisticRegression(solver='lbfgs')),
    ]
    # F1 scores of every fold of every model
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators: reset here so one model's report is not
        # mixed with the folds of the models evaluated before it.
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_rows = training_tweets.iloc[training_indices]
            validation_rows = training_tweets.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigram_stem(training_tweets, test_tweets)

In [None]:
# Dataset2 unigrams after stemming
def classify_unigram_stem2(training_tweets2, test_tweets2, ngram=(1, 1)):
    """Cross-validate MultinomialNB on unigram counts (dataset 2, after stemming).

    The model is evaluated with a shuffled 10-fold split of
    ``training_tweets2``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the best F1 so far, the fitted classifier and
    the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The printed report shows the mean F1 / accuracy / precision / recall over
    the folds and the confusion matrix summed over them.

    Args:
        training_tweets2: frame indexed with ``.iloc`` that has
            ``SentimentText`` and ``Sentiment`` columns.
        test_tweets2: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold.
    """
    global models, scores
    models = [('Multi_uni_stem2', MultinomialNB())]
    # F1 scores of every fold
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators for the report printed after the folds
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_rows = training_tweets2.iloc[training_indices]
            validation_rows = training_tweets2.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigram_stem2(training_tweets2, test_tweets2)

In [None]:
# Unigrams and bigrams after stemming for dataset1
def classify_unigrambigram_stem(training_tweets, test_tweets, ngram=(1, 2)):
    """Cross-validate six classifiers on unigram+bigram counts (dataset 1, after stemming).

    Every model is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the model's best F1 so far, the fitted
    classifier and the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The report printed per model shows the mean F1 / accuracy / precision /
    recall over that model's own folds and the confusion matrix summed over
    those folds.

    Args:
        training_tweets: frame indexed with ``.iloc`` that has ``SentimentText``
            and ``Sentiment`` columns.
        test_tweets: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold of every model.
    """
    global models, scores
    models = [
        ('Multi_unibi_stem', MultinomialNB()),
        ('DT_unibi_stem', DecisionTreeClassifier()),
        ('KN_unibi_stem', KNeighborsClassifier()),
        ('SGD_unibi_stem', SGDClassifier()),
        ('SVM_unibi_stem', SVC()),
        ('LR_unibi_stem', LogisticRegression(solver='lbfgs')),
    ]
    # F1 scores of every fold of every model
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators: reset here so one model's report is not
        # mixed with the folds of the models evaluated before it.
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_rows = training_tweets.iloc[training_indices]
            validation_rows = training_tweets.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigrambigram_stem(training_tweets, test_tweets)

In [None]:
# Unigrams and bigrams after stemming for Dataset2
def classify_unigrambigram_stem2(training_tweets2, test_tweets2, ngram=(1, 2)):
    """Cross-validate MultinomialNB on unigram+bigram counts (dataset 2, after stemming).

    The model is evaluated with a shuffled 10-fold split of
    ``training_tweets2``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the best F1 so far, the fitted classifier and
    the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The printed report shows the mean F1 / accuracy / precision / recall over
    the folds and the confusion matrix summed over them.

    Args:
        training_tweets2: frame indexed with ``.iloc`` that has
            ``SentimentText`` and ``Sentiment`` columns.
        test_tweets2: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold.
    """
    global models, scores
    models = [('Multi_unibi_stem2', MultinomialNB())]
    # F1 scores of every fold
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators for the report printed after the folds
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_rows = training_tweets2.iloc[training_indices]
            validation_rows = training_tweets2.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_unigrambigram_stem2(training_tweets2, test_tweets2)

In [None]:
# Bigrams only for dataset1 after stemming
def classify_bigram_stem(training_tweets, test_tweets, ngram=(2, 2)):
    """Cross-validate six classifiers on bigram counts (dataset 1, after stemming).

    Every model is evaluated with a shuffled 10-fold split of
    ``training_tweets``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the model's best F1 so far, the fitted
    classifier and the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The report printed per model shows the mean F1 / accuracy / precision /
    recall over that model's own folds and the confusion matrix summed over
    those folds.

    Args:
        training_tweets: frame indexed with ``.iloc`` that has ``SentimentText``
            and ``Sentiment`` columns.
        test_tweets: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold of every model.
    """
    global models, scores
    models = [
        ('Multi_bi_stem', MultinomialNB()),
        ('DT_bi_stem', DecisionTreeClassifier()),
        ('KN_bi_stem', KNeighborsClassifier()),
        ('SGD_bi_stem', SGDClassifier()),
        ('SVM_bi_stem', SVC()),
        ('LR_bi_stem', LogisticRegression(solver='lbfgs')),
    ]
    # F1 scores of every fold of every model
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators: reset here so one model's report is not
        # mixed with the folds of the models evaluated before it.
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets):
            training_rows = training_tweets.iloc[training_indices]
            validation_rows = training_tweets.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        print('Total tweets classified: ' + str(len(training_tweets)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_bigram_stem(training_tweets, test_tweets)

In [None]:
# Bigrams only for dataset2 after stemming
def classify_bigram_stem2(training_tweets2, test_tweets2, ngram=(2, 2)):
    """Cross-validate MultinomialNB on bigram counts (dataset 2, after stemming).

    The model is evaluated with a shuffled 10-fold split of
    ``training_tweets2``. In each fold a ``CountVectorizer`` is fitted on the
    training part only, the model is fitted on it and scored on the validation
    part. Whenever a fold beats the best F1 so far, the fitted classifier and
    the vectorizer are pickled (in that order) to ``<name>.pkl``.

    The printed report shows the mean F1 / accuracy / precision / recall over
    the folds and the confusion matrix summed over them.

    Args:
        training_tweets2: frame indexed with ``.iloc`` that has
            ``SentimentText`` and ``Sentiment`` columns.
        test_tweets2: unused; kept so existing calls keep working.
        ngram: ``ngram_range`` handed to ``CountVectorizer``.

    Side effects:
        Rebinds the globals ``models`` and ``scores``; ``scores`` receives the
        F1 score of every fold.
    """
    global models, scores
    models = [('Multi_bi_stem2', MultinomialNB())]
    # F1 scores of every fold
    scores = []

    def mean_percent(values):
        # Mean of the per-fold values, formatted like the rest of the report
        return str(sum(values) / len(values) * 100) + '%'

    # Provides train/validation indices to split the data into 10 folds.
    k_fold = KFold(n_splits=10, shuffle=True)

    # Converts a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)

    for name, model in models:
        print(name)
        # Per-model accumulators for the report printed after the folds
        confusion = np.array([[0, 0], [0, 0]])
        fold_f1, fold_precision, fold_recall, fold_accuracy = [], [], [], []
        best_f1 = 0
        # Absolute path of the pickle that stores this model's best fold
        pkl_filename = "C:/Users/USER/Desktop/BE/Models/" + name + ".pkl"

        for training_indices, validation_indices in k_fold.split(training_tweets2):
            training_rows = training_tweets2.iloc[training_indices]
            validation_rows = training_tweets2.iloc[validation_indices]

            # The vocabulary is learned on the training part of the fold only
            training_features = count_vectorizer.fit_transform(training_rows['SentimentText'].values)
            training_labels = training_rows['Sentiment'].values
            validation_features = count_vectorizer.transform(validation_rows['SentimentText'].values)
            validation_labels = validation_rows['Sentiment'].values

            classifier = model
            classifier.fit(training_features, training_labels)
            validation_predictions = classifier.predict(validation_features)

            confusion += confusion_matrix(validation_labels, validation_predictions)
            score = f1_score(validation_labels, validation_predictions)
            fold_f1.append(score)
            fold_precision.append(precision_score(validation_labels, validation_predictions))
            fold_recall.append(recall_score(validation_labels, validation_predictions))
            fold_accuracy.append(accuracy_score(validation_labels, validation_predictions))
            scores.append(score)

            # Keep the classifier/vectorizer pair of the best fold seen so far
            if score > best_f1:
                with open(pkl_filename, 'wb') as file:
                    pickle.dump(classifier, file)
                    pickle.dump(count_vectorizer, file)
                best_f1 = score

        print('\n')
        # Count the frame that was actually cross-validated (the parameter),
        # not the unrelated global ``training_tweets`` of dataset 1.
        print('Total tweets classified: ' + str(len(training_tweets2)))
        print('\n')
        print('F1 Score: ' + mean_percent(fold_f1))
        print('Accuracy: ' + mean_percent(fold_accuracy))
        print('Precision: ' + mean_percent(fold_precision))
        print('Recall: ' + mean_percent(fold_recall))
        print('\n')
        print('Confusion matrix:')
        print(confusion)
        print('\n')
        print('-------------------------------------')
        print('\n')


classify_bigram_stem2(training_tweets2, test_tweets2)

In [None]:
# Import libraries
import tweepy
from textblob import TextBlob
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
# Use the "fivethirtyeight" matplotlib style for the figures drawn below
plt.style.use('fivethirtyeight')



# Load Smileys Dataset and split it on its Sentiment column (1 -> positive, 0 -> negative)
emoticons = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Smileys.csv', skiprows=0, low_memory=False)
positive_emoticons = emoticons[emoticons.Sentiment == 1]
negative_emoticons = emoticons[emoticons.Sentiment == 0]
# Load Acronyms Dataset
acronyms = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Acronyms.csv', skiprows=0, low_memory=False)
# Load Stopwords Dataset and name its single column "Word"
stops = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Stopwords.csv', skiprows=0, low_memory=False)
stops.columns = ['Word']
# Load Positive Words Dataset (tab-separated)
positive_words = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Positive-words.csv', skiprows=0, low_memory=False ,sep='\t')
positive_words.columns = ['Word', 'Sentiment']
# Load Negative Words Dataset (tab-separated, read with the unicode_escape codec)
negative_words = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Negative-words.csv', skiprows=0, low_memory=False ,sep='\t',encoding= 'unicode_escape')
negative_words.columns = ['Word', 'Sentiment']
# Load Negations Dataset
negation_words = pd.read_csv(r'C:\Users\USER\Desktop\BE_datasets\Negation.csv', skiprows=0, low_memory=False)





# Twitter API credentials: left empty here and must be filled in before running
# NOTE(review): prefer reading these from environment variables over hardcoding them in the notebook
consumerkey= ''
consumersecret=''
accesstoken=''
accesstokensecret=''


# Create the authentication object
authenticate= tweepy.OAuthHandler(consumerkey, consumersecret)
# Set the access token and access token secret
authenticate.set_access_token(accesstoken,accesstokensecret)
# Create the API object while passing in the auth information
api=tweepy.API(authenticate, wait_on_rate_limit=True)





# Ask which account to analyse and how many tweets to fetch
# (input() returns strings; tweetnumber is passed to `count` below without conversion)
twitterusername=input("Type the twitter username that you want to analyze his or her tweets: ")
tweetnumber=input("How many tweets do you want to extract? ")



# Extract the requested number of tweets from the user's timeline, asking for the extended tweet mode
posts= api.user_timeline(screen_name=twitterusername, count=tweetnumber, lang="en", tweet_mode="extended")


# Create a dataframe with a column called Tweets holding each tweet's full_text
Newtweets = pd.DataFrame( [tweet.full_text for tweet in posts] , columns=['Tweets'])

# Intended to show tweets without truncating the column
# NOTE(review): newer pandas versions expect None instead of a negative width - confirm against the installed version
pd.set_option('display.max_colwidth',-1)

Newtweets 

In [None]:
# Preprocessing
# We should do all preprocessing that was already done to the dataset we already applied our classifier to

# Remove RTs
def cleanTxt(text):
    """Strip retweet markers and digits from a tweet.

    Args:
        text: the raw tweet text.

    Returns:
        ``text`` without standalone ``RT`` markers (together with the
        whitespace that follows them) and without any digit.
    """
    # \b restricts the match to a standalone "RT" token, so words that merely
    # end in "RT" (e.g. "START now") are no longer glued to the next word.
    text = re.sub(r'\bRT[\s]+', '', text)
    text = re.sub(r'[0-9]+', '', text)
    return text
# Cleaning the text: run cleanTxt over every tweet and store the result back in the Tweets column
Newtweets['Tweets']= Newtweets['Tweets'].apply(cleanTxt)

import re # Regular Expressions

# Creating the functions to detect and replace emoticons  

def make_emoticon_pattern3(emoticons):
    """Build the regex source that matches any smiley surrounded by whitespace.

    Args:
        emoticons: object whose ``Smiley`` attribute iterates over the smiley
            strings (each one is escaped, so it is matched literally).

    Returns:
        A pattern string with one capture group holding the alternation of all
        smileys, guarded by a whitespace look-behind and look-ahead.
    """
    alternatives = "|".join(map(re.escape, emoticons.Smiley))
    # Raw strings: "\s" inside a normal string literal is an invalid escape sequence
    return r"(?<=\s)(" + alternatives + r")(?=\s)"

def find_with_pattern3(pattern, replace=False, tag=None):
    """Search or substitute ``pattern`` in every tweet of ``Newtweets.Tweets``.

    Each tweet is wrapped in one leading and one trailing space before the
    pattern is applied.

    Args:
        pattern: regex source string or compiled pattern.
        replace: when true, substitute every match with ``tag``.
        tag: replacement text; required when ``replace`` is true.

    Returns:
        A Series holding, per tweet, the ``findall`` result (``replace`` false)
        or the padded tweet with all matches replaced (``replace`` true).

    Raises:
        Exception: if ``replace`` is true and no ``tag`` is given.
    """
    if replace and tag is None:
        raise Exception("Parameter error", "If replace=True you should add the tag by which the pattern will be replaced")
    regex = re.compile(pattern)

    def padded(tweet):
        # Surrounding spaces let whitespace look-arounds match at both ends of the tweet
        return " " + tweet + " "

    if not replace:
        return Newtweets.Tweets.apply(lambda tweet: regex.findall(padded(tweet)))
    return Newtweets.Tweets.apply(lambda tweet: regex.sub(tag, padded(tweet)))

# Per tweet, list the positive / negative emoticons it contains
pos_emoticons_found = find_with_pattern3(make_emoticon_pattern3(positive_emoticons))
neg_emoticons_found = find_with_pattern3(make_emoticon_pattern3(negative_emoticons))

# Number of tweets that contain at least one positive / negative emoticon
nb_pos_emoticons = len(pos_emoticons_found[pos_emoticons_found.map(lambda emoticons : len(emoticons) > 0)])
nb_neg_emoticons = len(neg_emoticons_found[neg_emoticons_found.map(lambda emoticons : len(emoticons) > 0)])


# Replace the emoticons by the tags ||pos|| and ||neg||
# (every replacing call of find_with_pattern3 also pads the tweet with one space on each side)
Newtweets.Tweets = find_with_pattern3(make_emoticon_pattern3(positive_emoticons), True, '||pos||')
Newtweets.Tweets = find_with_pattern3(make_emoticon_pattern3(negative_emoticons), True, '||neg||')

# Replacing URLS

# Case-insensitive URL matcher (http(s)://, www. or domain/ prefixes, with balanced parentheses allowed)
pattern_url = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
# URLs found per tweet (not used again in this cell)
url_found = find_with_pattern3(pattern_url)

# Replace every URL by the tag ||url||
Newtweets.Tweets = find_with_pattern3(pattern_url, True, '||url||')

# Remove Unicode characters 

def remove_unicode3(text):
    """Encode ``text`` to ASCII, silently dropping every non-ASCII character.

    Returns:
        The ASCII-encoded ``bytes`` object, or ``text`` itself if encoding
        raises ``UnicodeDecodeError``.
    """
    try:
        return text.encode('ascii', 'ignore')
    except UnicodeDecodeError:
        # Best effort: hand the input back untouched
        return text

# Encode every tweet to ASCII (remove_unicode3 drops the characters that have no ASCII form)
Newtweets.Tweets = Newtweets.Tweets.apply(lambda tweet: remove_unicode3(tweet))

def remove_unicode4(text):
    """Decode ``text`` with the ``unicode_escape`` codec and always return ``str``.

    Args:
        text: ``bytes`` to decode. A ``str`` is returned unchanged, which keeps
            the cell safe to re-run on already decoded tweets.

    Returns:
        The decoded string. If the bytes contain a malformed escape (for
        example a trailing backslash) they are decoded as plain ASCII instead.
    """
    if isinstance(text, str):
        return text
    try:
        return text.decode('unicode_escape')
    except UnicodeDecodeError:
        # Returning the bytes object here would later be rendered as "b'...'"
        # by str(); fall back to a plain ASCII decode so callers get text.
        return text.decode('ascii', 'ignore')

# Decode the ASCII bytes produced by remove_unicode3 back to text (unicode_escape codec)
Newtweets.Tweets = Newtweets.Tweets.apply(lambda tweet: remove_unicode4(tweet))

# Simply decode HTML entities
import html
# str() guards html.unescape against non-string values
# NOTE(review): if a value is still a bytes object here, str() yields its "b'...'" repr rather than the text
Newtweets.Tweets  = Newtweets.Tweets.apply(lambda tweet: html.unescape(str(tweet)))

# Reduce all letters to lower case
Newtweets.Tweets = Newtweets.Tweets.str.lower()


# Twitter handles: "@" followed by one or more word characters.
# Raw string: "\w" inside a normal string literal is an invalid escape sequence.
pattern_usernames = r"@\w{1,}"
# Handles found per tweet
usernames_found3 = find_with_pattern3(pattern_usernames)

# Replace all usernames/targets @ with the tag ||target||
Newtweets.Tweets = find_with_pattern3(pattern_usernames, True, '||target||')

from collections import Counter

# Create a dictionary mapping each acronym to its translation (columns Acronym -> Translation)
acronym_dictionary = dict(zip(acronyms.Acronym, acronyms.Translation))

# Will be used to get rid of the punctuation in tweets (does not include | since we use it for our tokens and '
# to take care of don't, can't)
punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{}~'

# Frequency table for acronyms: counts how often each acronym is expanded
acronyms_counter = Counter()

# Loop on acronyms to replace those matched in the tweet by the corresponding translations
# Return the tweet and the acronyms used
def acronym_to_translation(tweet, acronyms_counter):
    """Tokenise ``tweet`` and expand every token that is a known acronym.

    Punctuation characters listed in the module-level ``punctuation`` string
    are turned into spaces before the tweet is split on whitespace.

    Args:
        tweet: the tweet text (converted with ``str()``).
        acronyms_counter: counter incremented once per expanded acronym.

    Returns:
        The list of tokens, where each acronym found in
        ``acronym_dictionary`` is replaced by the words of its translation.
    """
    # Map each punctuation character to a space so it acts as a token separator
    to_spaces = str.maketrans(punctuation, " " * len(punctuation))
    expanded = []
    for token in str(tweet).translate(to_spaces).split():
        if token in acronym_dictionary:
            acronyms_counter[token] += 1
            expanded += acronym_dictionary[token].split()
        else:
            expanded.append(token)
    return expanded

# Tokenise every tweet and expand its acronyms; from here on each tweet is a list of words
Newtweets.Tweets = Newtweets.Tweets.apply(lambda tweet: acronym_to_translation(str(tweet), acronyms_counter))

# Transform the negation dataframe into a dictionary (columns Negation -> Tag)
negation_dictionary = dict(zip(negation_words.Negation, negation_words.Tag))

# Find a negation in a tweet and replace it by its tag
def replace_negation(tweet, dictionary=None):
    """Replace every negation word of a tokenised tweet by its tag.

    Parameters
    ----------
    tweet : iterable of str
        The words of the tweet.
    dictionary : dict, optional
        Mapping negation word -> tag. Defaults to the module-level
        ``negation_dictionary`` when omitted, which is how existing callers use it.

    Returns
    -------
    list of str
        The words in their original order, with each word present in the
        dictionary swapped for its tag and every other word left unchanged.
    """
    if dictionary is None:
        dictionary = negation_dictionary
    # dict.get does the membership test and the lookup in a single step
    return [dictionary.get(word, word) for word in tweet]
    
# Tag the negations of every tokenised tweet
Newtweets['Tweets'] = Newtweets['Tweets'].apply(replace_negation)

# Replace a sequence of repeated characters by two characters.
# The pattern matches one character followed by any number of repetitions of itself,
# i.e. each match is a maximal run of the same character.
pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    """Return ``word`` with every run of 3+ identical characters cut down to two."""
    shortened_runs = []
    for run in pattern.finditer(word):
        # Keeping at most the first two characters leaves runs of length 1 or 2 untouched
        shortened_runs.append(run.group()[:2])
    return ''.join(shortened_runs)

def reduce_sequence_tweet(tweet):
    """Apply ``reduce_sequence_word`` to each word of a tokenised tweet; returns a list."""
    return list(map(reduce_sequence_word, tweet))

# Shorten the over-long character runs in every word of every tweet
Newtweets.Tweets = Newtweets.Tweets.apply(reduce_sequence_tweet)

# Transform the dataframe into a dictionary (keys are the stop words, values are None)
# NOTE(review): stopword_dictionary is not used in the visible part of this notebook section -
# confirm whether stop words are intentionally kept for this model.
stopword_dictionary = dict.fromkeys(stops.Word, None)

# Rejoin the tweets (lists of words -> single space-separated string) to start testing
Newtweets.Tweets = Newtweets.Tweets.apply(" ".join)
# None means "never truncate"; the former -1 is deprecated since pandas 1.0 and rejected by pandas 2
pd.set_option('display.max_colwidth', None)
Newtweets

In [None]:
# Load the pickle file holding the best model together with the count vectorizer it was trained with,
# so that both can be applied to our new data.
# This model was recorded as the best one: approximately 80% F1 score over all folds, saved from its best fold.
# NOTE(review): pickle.load can execute arbitrary code - only open model files you produced or trust.
pkl_filename = r'C:\Users\USER\Desktop\BE\Models\Multi_unibi_before_ss2.pkl'
# The file holds two consecutive pickles: first the model, then the count vectorizer
with open(pkl_filename, mode='rb') as file:
    pickle_model = pickle.load(file)
    pickle_cv = pickle.load(file)

In [None]:
# Plot the word cloud built from the text of all cleaned tweets
allWords = ' '.join(Newtweets['Tweets'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=119,
).generate(allWords)
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Ask for a keyword and keep only the tweets that mention it
Keyword = input("Type a word that you want your tweets to be related to: ")
# regex=False: the keyword is matched literally, so input such as "c++" or "(" no longer raises or mis-matches.
# case=False: the tweets were lower-cased during cleaning, so a capitalised keyword must still match.
# na=False: a missing tweet simply does not match instead of producing an invalid boolean mask.
keyword_mask = Newtweets['Tweets'].str.contains(Keyword, case=False, regex=False, na=False)
# drop=True renumbers the rows from 0 without keeping the old index as a column
# (replaces reset_index() + drop('index', 1), whose positional axis argument was removed in pandas 2)
FilteredTweets = Newtweets[keyword_mask].reset_index(drop=True)
# None means "never truncate"; the former -1 is deprecated since pandas 1.0 and rejected by pandas 2
pd.set_option('display.max_colwidth', None)
FilteredTweets

In [None]:
# Detection of the Filtered tweets

# Transform our new data into bag of words using the count vectorizer loaded together with the model
transformedtweets_filtered = pickle_cv.transform(FilteredTweets['Tweets'])
testpredictions_filtered = pickle_model.predict(transformedtweets_filtered)
# Transform our sentiments detected into a dataframe
testpred_dt_filtered = pd.DataFrame(testpredictions_filtered, columns=['Sentiment'])
# Join the tweets and their corresponding detected sentiment.
# The values are attached by position (to_numpy) rather than by index label, so the result
# does not depend on FilteredTweets carrying a fresh 0..n-1 index.
FilteredTweets['Sentiment'] = testpred_dt_filtered['Sentiment'].to_numpy()
FilteredTweets['Sentiment'] = FilteredTweets['Sentiment'].astype(float)
# None means "never truncate"; the former -1 is deprecated since pandas 1.0 and rejected by pandas 2
pd.set_option('display.max_colwidth', None)
FilteredTweets



In [None]:
# Detection of all tweets retrieved

# Transform our new data into bag of words using the count vectorizer loaded together with the model
transformedtweets = pickle_cv.transform(Newtweets['Tweets'])
testpredictions = pickle_model.predict(transformedtweets)
# Transform our sentiments detected into a dataframe
testpred_dt = pd.DataFrame(testpredictions, columns=['Sentiment'])
# Join the tweets and their corresponding detected sentiment.
# The values are attached by position (to_numpy) rather than by index label: testpred_dt has a
# 0..n-1 index, so a label-aligned assignment would yield NaN sentiments whenever Newtweets
# does not carry that exact index.
Newtweets['Sentiment'] = testpred_dt['Sentiment'].to_numpy()
Newtweets['Sentiment'] = Newtweets['Sentiment'].astype(float)
# None means "never truncate"; the former -1 is deprecated since pandas 1.0 and rejected by pandas 2
pd.set_option('display.max_colwidth', None)
Newtweets