In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import string
import seaborn as sns
from html.parser import HTMLParser
from wordcloud import WordCloud
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# loading data
data_path = "../data/"
sa_df = pd.read_csv(data_path + "sentiment_analysis.csv", names=["ID", "text", "label"], low_memory=False)
sa_df = sa_df[1:]

us_df = pd.read_csv(data_path + "US_Elections_2020.csv", names=["text", "sentiment", "negative_reason"], low_memory=False)
us_df = us_df[1:]

stop_words_file = open(data_path + "stop_words.txt")
stop_words_set = set()

index = 0
stop_words_error_set = set([249, 495])
for line in stop_words_file:
    if index in stop_words_error_set:
        index += 1
        continue
    stop_words_set.add(line.replace("\n", ""))
    index += 1

# removing some stop words
stop_words_set.remove("w")
stop_words_set.remove("l")

# adding stop words that were incorrectly parsed in .txt file
stop_words_set.add("keep")
stop_words_set.add("keeps")
stop_words_set.add("sure")
stop_words_set.add("t")

# adding additional slang stop words
stop_words_set.add("ur")

# 2477 lines in corpus.txt
corpus_file_len = 2477
corpus_file = open(data_path + "corpus.txt")
corpus_words = np.array([" " for _ in range(corpus_file_len)], dtype=object)
corpus_counts = np.zeros((corpus_file_len,))

index = 0

def sum_line_array(line_array_splice):
    string = ""
    for word in line_array_splice:
        string += word + " "
    return string[:-1]

for line in corpus_file:
    line_array = line.split()
    corpus_words[index], corpus_counts[index] = sum_line_array(line_array[:-1]), line_array[-1]
    index += 1

corpus_df_dict = {"word": pd.Series(corpus_words), "count": pd.Series(corpus_counts)}
corpus_df = pd.DataFrame(corpus_df_dict)

In [3]:
print("Shape of sentiment analysis DataFrame:", sa_df.shape)
print("Shape of US elections DataFrame:", us_df.shape)
print("Shape of corpus DataFrame:", corpus_df.shape)
print("Length of stop words set:", len(stop_words_set))

Shape of sentiment analysis DataFrame: (550391, 3)
Shape of US elections DataFrame: (2552, 3)
Shape of corpus DataFrame: (2477, 2)
Length of stop words set: 666


In [4]:
print("df.head of sentiment analysis DataFrame:", sa_df.head())
print("\ndf.head of US elections DataFrame:", us_df.head())
print("\ndf.head of corpus DataFrame:", corpus_df.head())

df.head of sentiment analysis DataFrame:             ID                                               text label
1  7.68098E+17  Josh Jenkins is looking forward to TAB Breeder...     1
2  7.68098E+17  RT @MianUsmanJaved: Congratulations Pakistan o...     1
3  7.68098E+17  RT @PEPalerts: This September, @YESmag is taki...     1
4  7.68098E+17  RT @david_gaibis: Newly painted walls, thanks ...     1
5  7.68098E+17  RT @CedricFeschotte: Excited to announce: as o...     1

df.head of US elections DataFrame:                                                 text sentiment negative_reason
1  b'@robreiner so afraid of Nov, Dec, and Jan! E...         0         covid19
2  b"RT @SueC00K: Lord Sumption launches Recovery...         0          others
3  b'RT @WalidPhares: Uber Timing: after #Biden a...         0         covid19
4  b'Every 107 seconds an American is dying from ...         1             NaN
5  b'RT @thewebbix: The #Democrats embrace #Diver...         1             NaN

df.head of corpu

In [5]:
# cleaning tweet data
tweet_tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
token_lemmatizer = nltk.stem.WordNetLemmatizer()

def map_text(tweet):
    input_tweet = tweet
    # removing URLs
    tweet = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', tweet, flags=re.MULTILINE)

    # removing "b" at beginning of all US elections tweets
    tweet = re.sub(r'^b', '', tweet)
 
    # removing "'" and '"' at beginning of all US elections tweets
    tweet = re.sub(r'^[\'"]', '', tweet)

    # removing 'RT'
    tweet = re.sub(r'^RT ', '', tweet)

    # removing mentions and handles
    tweet = re.sub(r'@\S* ?', '', tweet)

    # replacing new lines with spaces
    tweet = re.sub(r'\\n', ' ', tweet)

    # replacing html tags & attributes (/<[^>]+>\) + hashtags (#)
    unwanted_html_elements = "\/<[^>]+>#"    
    for unwanted_element in unwanted_html_elements:
        tweet = tweet.replace(unwanted_element, "")

    
    # replacing html character codes (&...)
    h = HTMLParser()
    tweet = h.unescape(tweet)

    # removing emojis
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')

    # changing text to lowercase
    tweet = tweet.lower()

    # replacing "," and "." with spaces
    tweet = re.sub(r'[,.]', ' ', tweet)

    # removing punctuation
    tweet = re.sub(r'['+string.punctuation+']+s?', '', tweet)

    # tokenizing tweet
    tokenized_tweet = tweet_tokenizer.tokenize(tweet)

    # lemmatizing tokens
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = token_lemmatizer.lemmatize(tokenized_tweet[i])

    # removing stop words & numbers
    clean_tweet = []
    kept_dates = set(["2008", "2016", "2017", "2018", "2019", "2020", "2021"])
    for word in tokenized_tweet:
        if word not in stop_words_set:
            if word in kept_dates or not re.match(r'[0-9]', word): 
                clean_tweet.append(word)

    return clean_tweet


example_tweet_1 = 'b"RT @GayHopper_com: I\'m happy! \n\n#booyah https://t.co/fKXbN3Zhtd"'
print(map_text(example_tweet_1))

example_tweet_2 = "b'@JoeBiden JOE BIDEN IS TOTALLY AND COMPLETELY COMPROMISED BY CHINA.\n\nJoe Biden is a corrupt politician. He wants https://t.co/wuUCpVzOo6'"
print(map_text(example_tweet_2))

example_tweet_3 = "b'#COVID19 #coronavirus #coronavirusuk #COVID #UKlockdown \nIf there is a National Lockdown Next Week \nShould We https://t.co/A7h6gAD2cz'"
print(map_text(example_tweet_3))

example_tweet_4 = "b\"RT @MelissaTweets: I didn't think all the freak out was about control in the beginning of \#Covid.\n\nNow, it's clear that's ALL it's'"
print(map_text(example_tweet_4))

example_tweet_5 = "b'RT @DennisClend23: @amyisfedtfup @GOP @IvankaTrump I wish 90,000 Americans been infected &amp; 900+ died yesterday, along with the 2019 virus"
print(map_text(example_tweet_5))

example_tweet_6 = "b'Share, we need to change this!\n#CHANGES \n#WomensMarch2020 \n#BidenHarris2020 https://t.co/8S90buAjxY'"
print(map_text(example_tweet_6))

example_tweet_7 = "b'RT @TimFaulkner81: @realDonaldTrump Voting ends November 3rd. Then we count ALL VOTES.\n\nYou sound really desperate, Donny.\n\n#BidenHarris2020'"
print(map_text(example_tweet_7))

# applying mapping
sa_df["clean_text"] = sa_df["text"].map(map_text)
us_df["clean_text"] = us_df["text"].map(map_text)

['happy', 'booyah']
['joe', 'biden', 'totally', 'completely', 'compromised', 'china', 'joe', 'biden', 'corrupt', 'politician']
['covid', 'coronavirus', 'coronavirusuk', 'covid', 'uklockdown', 'national', 'lockdown', 'week']
['didnt', 'freak', 'wa', 'control', 'covid', 'clear']
['american', 'infected', 'died', 'yesterday', '2019', 'virus']
['share', 'change', 'change', 'womensmarch', '2020', 'bidenharris', '2020']
['voting', 'november', 'count', 'vote', 'sound', 'desperate', 'donny', 'bidenharris', '2020']


In [6]:
for i in range(1, 300):
    print("\nOriginal Text:\n", sa_df["text"][i])
    print("\nClean Text:\n", sa_df["clean_text"][i])


Original Text:
 Josh Jenkins is looking forward to TAB Breeders Crown Super Sunday https://t.co/antImqAo4Y https://t.co/ejnA78Sks0

Clean Text:
 ['josh', 'jenkins', 'forward', 'tab', 'breeder', 'crown', 'super', 'sunday']

Original Text:
 RT @MianUsmanJaved: Congratulations Pakistan on becoming #No1TestTeam in the world against all odds! #JI_PakZindabadRallies https://t.co/1o‚Ä¶

Clean Text:
 ['congratulation', 'pakistan', 'no1testteam', 'odds', 'jipakzindabadrallies']

Original Text:
 RT @PEPalerts: This September, @YESmag is taking you to Maine Mendoza‚Äôs surprise thanksgiving party she threw for her fans! https://t.co/oX‚Ä¶

Clean Text:
 ['september', 'maine', 'mendozas', 'surprise', 'thanksgiving', 'party', 'threw', 'fan']

Original Text:
 RT @david_gaibis: Newly painted walls, thanks a million to our custodial painters this summer.  Great job ladies!!!#EC_proud https://t.co/‚Ä¶

Clean Text:
 ['newly', 'painted', 'wall', 'custodial', 'painter', 'summer', 'great', 'job', 'ladiesec

In [7]:
print("Sentiment analysis df.head():\n", sa_df.head())

Sentiment analysis df.head():
             ID                                               text label  \
1  7.68098E+17  Josh Jenkins is looking forward to TAB Breeder...     1   
2  7.68098E+17  RT @MianUsmanJaved: Congratulations Pakistan o...     1   
3  7.68098E+17  RT @PEPalerts: This September, @YESmag is taki...     1   
4  7.68098E+17  RT @david_gaibis: Newly painted walls, thanks ...     1   
5  7.68098E+17  RT @CedricFeschotte: Excited to announce: as o...     1   

                                          clean_text  
1  [josh, jenkins, forward, tab, breeder, crown, ...  
2  [congratulation, pakistan, no1testteam, odds, ...  
3  [september, maine, mendozas, surprise, thanksg...  
4  [newly, painted, wall, custodial, painter, sum...  
5  [excited, july, 2017, feschotte, lab, will, re...  


In [8]:
sa_df_empty_tweet_mask = sa_df["clean_text"].apply(lambda x: len(x) == 0)
empty_tweet_count = sum(sa_df_empty_tweet_mask)
print("Empty Tweet count =", empty_tweet_count)

positive_empty_tweets = sum(sa_df["label"][sa_df_empty_tweet_mask].astype(int))
print("Number of positive empty Tweets = {} ({}%)".format(positive_empty_tweets, round(positive_empty_tweets/empty_tweet_count *100, 3)))
del sa_df_empty_tweet_mask

Empty Tweet count = 1864
Number of positive empty Tweets = 1167 (62.607%)


In [9]:
print("US elections df original and clean Tweet visualization:")
for i in range(1, 300):
    print("\nIndex =", i)
    print("Original Text:\n", us_df["text"][i])
    print("Clean Text:\n", us_df["clean_text"][i])

US elections df original and clean Tweet visualization:

Index = 1
Original Text:
 b'@robreiner so afraid of Nov, Dec, and Jan! Even if #BidenHarris2020 win...frump has 3 months to do even more damage than he has.'
Clean Text:
 ['afraid', 'nov', 'dec', 'jan', 'bidenharris', '2020', 'win', 'frump', 'ha', 'month', 'damage', 'ha']

Index = 2
Original Text:
 b"RT @SueC00K: Lord Sumption launches Recovery - a new initiative to promote #AnotherWay to deal with #Covid. Hysteria and rushed laws
Clean Text:
 ['lord', 'sumption', 'launch', 'recovery', 'initiative', 'promote', 'anotherway', 'deal', 'covid', 'hysteria', 'rushed', 'law']

Index = 3
Original Text:
 b'RT @WalidPhares: Uber Timing: after #Biden advisors said they would put pressure on #Brazil &amp; sanction @jairbolsonaro Gov "when" elected,
Clean Text:
 ['uber', 'timing', 'biden', 'advisor', 'pressure', 'brazil', 'sanction', 'gov', 'elected']

Index = 4
Original Text:
 b'Every 107 seconds an American is dying from the #TrumpVirus \n\

In [10]:
republican_buzzwords = set(["maga", "keepamericagreat", "trumppence", "buildthewall", "sleepyjoe", "chinajoe", "blackvoicesfortrump", "hunterbiden", "veteransfortrump", "redwave", "corruptjoebiden", "fakenews"])
republican_compound_buzzwords = set(["trump2020", "trumppence2020", "votedtrump"])

democrat_buzzwords = set(["bidenharris", "blacklivesmatter", "trumpvirus", "trumpcrimefamily", "bidenharristosaveamerica", "trumpiscompromised", "trumppandemic", "bluewave", "votetrumpout", "votehimout"])
democrat_compound_buzzwords = set(["biden2020", "bidenharris2020", "bidenharris"])

def classify_party(tweet_words):
    compound_words = set()
    for i in range(len(tweet_words)-1):
        compound_words.add(tweet_words[i] + tweet_words[i+1])
    

    # 0 = democrat
    if len(set(tweet_words).intersection(democrat_buzzwords)) > 0 or len(compound_words.intersection(democrat_compound_buzzwords)) > 0:
        return 0  
    # 1 = republican  
    elif len(set(tweet_words).intersection(republican_buzzwords)) > 0 or len(compound_words.intersection(republican_compound_buzzwords)) > 0:
        return 1
    # 2 = unclassified
    else:
        return 2

us_df["party"] = us_df["clean_text"].map(classify_party)

In [11]:
print("Number of left-leaning tweets:", sum(us_df["party"] == 0))
print("Number of right-leaning tweets:", sum(us_df["party"] == 1))
print("Number of unclassified tweets:", sum(us_df["party"] == 2))

Number of left-leaning tweets: 452
Number of right-leaning tweets: 159
Number of unclassified tweets: 1941


In [12]:
sa_df["word_counts"] = sa_df["clean_text"].apply(lambda x: len(x))

In [13]:
# f, ax = plt.subplots(figsize=(20, 10))
# sns.countplot(x="word_counts", hue="label", data=sa_df, palette="dark")
# ax.legend(["Negative", "Positive"])
# plt.grid(True)
# plt.title("Word Count Distribution of Positive and Negative Tweets")
# plt.xlabel("Word Counts")
# plt.ylabel("Frequency")
# plt.show()

In [14]:
# us_clean_text_list = list(us_df['clean_text'])
# flat_us_clean_text = [item for sublist in us_clean_text_list for item in sublist]

# wordcloud = WordCloud(max_font_size=300, max_words=1000, width=4000, height=3000,collocations=False).generate(" ".join(flat_us_clean_text))
# plt.figure(figsize=(20,10))
# plt.imshow(wordcloud)
# plt.title('Most Commonly Used Words: US Election Tweets',fontsize=35)
# plt.axis("off")
# plt.show()

In [15]:
# bag of words
# vectorizer = CountVectorizer()
# sa_vocab = []
# for i in range(sa_df.shape[0]):
#     sa_vocab = sa_vocab + sa_df["clean_text"].iloc[i]

# sa_dfx = vectorizer.fit_transform(sa_vocab)

In [16]:
def create_sentences(tweet):
    # removing URLs
    tweet = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', tweet, flags=re.MULTILINE)

    # removing "b" at beginning of all US elections tweets
    tweet = re.sub(r'^b', '', tweet)
 
    # removing "'" and '"' at beginning of all US elections tweets
    tweet = re.sub(r'^[\'"]', '', tweet)

    # removing 'RT'
    tweet = re.sub(r'^RT ', '', tweet)

    # removing mentions and handles
    tweet = re.sub(r'@\S* ?', '', tweet)

    # replacing new lines with spaces
    tweet = re.sub(r'\\n', ' ', tweet)

    # replacing html tags & attributes (/<[^>]+>\) + hashtags (#)
    unwanted_html_elements = "\/<[^>]+>#"    
    for unwanted_element in unwanted_html_elements:
        tweet = tweet.replace(unwanted_element, "")

    # replacing html character codes (&...)
    h = HTMLParser()
    tweet = h.unescape(tweet)

    # removing emojis
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')

    # changing text to lowercase
    tweet = tweet.lower()

    # replacing "," and "." with spaces
    tweet = re.sub(r'[,.]', ' ', tweet)

    # removing punctuation
    tweet = re.sub(r'['+string.punctuation+']+s?', '', tweet)

    # tokenizing tweet
    tokenized_tweet = tweet_tokenizer.tokenize(tweet)

    # lemmatizing tokens
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = token_lemmatizer.lemmatize(tokenized_tweet[i])

    # removing stop words & numbers & creating sentence
    sentence = ""
    kept_dates = set(["2008", "2016", "2017", "2018", "2019", "2020", "2021"])
    for word in tokenized_tweet:
        if word not in stop_words_set:
            if word in kept_dates or not re.match(r'[0-9]', word): 
                sentence = sentence + word + " "

    return sentence

sa_df["clean_sentences"] = sa_df["text"].map(create_sentences)
us_df["clean_sentences"] = us_df["text"].map(create_sentences)

In [17]:
# bag of words
vectorizer = CountVectorizer(max_features=100)
sa_dfx = vectorizer.fit_transform(sa_df["clean_sentences"])

# splitting data
sa_dfy = sa_df["label"]
sa_X_train, sa_X_test, sa_y_train, sa_y_test = train_test_split(sa_dfx.toarray(), sa_dfy, random_state=88, test_size=0.3)

In [18]:
model = LogisticRegression(multi_class="auto", solver="liblinear")
model.fit(sa_X_train, sa_y_train)
predictions = model.predict(sa_X_test)

TN = confusion_matrix(sa_y_test, predictions)[0][0]
FP = confusion_matrix(sa_y_test, predictions)[0][1]
FN = confusion_matrix(sa_y_test, predictions)[1][0]
TP = confusion_matrix(sa_y_test, predictions)[1][1]

total = TN + FP + FN + TP
ACC = (TP + TN) / float(total)

print ("This model got an accuracy of {}% on the test dataset".format(round(ACC*100,2))) 

This model got an accuracy of 82.19% on the test dataset


In [18]:
def k_fold_cv(model, k, X_train, y_train):
    kfold = KFold(n_splits=k)
    kfold.get_n_splits(X_train)

    accuracy = np.zeros(10)
    np_idx = 0


    for train_idx, test_idx in kfold.split(X_train):
        k_X_train, k_X_test = X_train[train_idx], X_train[test_idx]
        k_y_train, k_y_test = y_train.values[train_idx], y_train.values[test_idx]
        
        model.fit(k_X_train, k_y_train)
        
        predictions = model.predict(k_X_test)
        
        TN = confusion_matrix(k_y_test, predictions)[0][0]
        FP = confusion_matrix(k_y_test, predictions)[0][1]
        FN = confusion_matrix(k_y_test, predictions)[1][0]
        TP = confusion_matrix(k_y_test, predictions)[1][1]
        total = TN + FP + FN + TP
        ACC = (TP + TN) / float(total)
        
        accuracy[np_idx] = ACC*100
        np_idx += 1
        
        print("Fold {}: Accuracy: {}%".format(np_idx, round(ACC*100,3)))   

    print("Average Score: {}%".format(round(np.mean(accuracy), 3)))
    print("Accuracy Standard Deviation: {}".format(round(np.std(accuracy), 3)))
    print("Accuracy Variance: {}".format(round(np.square(np.std(accuracy)), 3)))

    # returns accuracy and variance
    return np.mean(accuracy), np.square(np.std(accuracy))


def evaluate_model(model, X_train, y_train):
    _X_train, _X_valid, _y_train, _y_valid = train_test_split(X_train, y_train, random_state=88, test_size=0.3)

    model.fit(_X_train, _y_train)

    predictions = model.predict(_X_valid)
    
    TN = confusion_matrix(_y_valid, predictions)[0][0]
    FP = confusion_matrix(_y_valid, predictions)[0][1]
    FN = confusion_matrix(_y_valid, predictions)[1][0]
    TP = confusion_matrix(_y_valid, predictions)[1][1]
    total = TN + FP + FN + TP
    ACC = (TP + TN) / float(total)
    
    
    print("Model Accuracy: {}%".format(round(ACC*100,3)))


In [25]:
# logistic regression
model = LogisticRegression(solver="liblinear", multi_class="auto")
evaluate_model(model, sa_X_train, sa_y_train)

Model Accuracy: 82.355%


In [26]:
# kNN - works, but takes a very long time
model = KNeighborsClassifier(n_neighbors=1)
evaluate_model(model, sa_X_train, sa_y_train)

In [23]:
# Naive Bayes
model = GaussianNB()
evaluate_model(model, sa_X_train, sa_y_train)

Fold 1: Accuracy: 59.8%
Fold 2: Accuracy: 62.829%
Fold 3: Accuracy: 62.4%
Fold 4: Accuracy: 62.2%
Fold 5: Accuracy: 62.314%
Fold 6: Accuracy: 62.8%
Fold 7: Accuracy: 61.514%
Fold 8: Accuracy: 62.029%
Fold 9: Accuracy: 63.571%
Fold 10: Accuracy: 62.646%
Average Score: 62.21%
Accuracy Standard Deviation: 0.956
Accuracy Variance: 0.915


In [24]:
# SVM SVC - works, but takes a very long time
model = SVC()
evaluate_model(model, sa_X_train, sa_y_train)

Fold 1: Accuracy: 89.6%
Fold 2: Accuracy: 90.171%
Fold 3: Accuracy: 89.8%
Fold 4: Accuracy: 89.657%
Fold 5: Accuracy: 89.114%
Fold 6: Accuracy: 90.086%
Fold 7: Accuracy: 90.171%
Fold 8: Accuracy: 89.171%
Fold 9: Accuracy: 90.2%
Fold 10: Accuracy: 89.426%
Average Score: 89.74%
Accuracy Standard Deviation: 0.393
Accuracy Variance: 0.155


In [28]:
# Decision Tree Classifier
model = DecisionTreeClassifier()
evaluate_model(model, sa_X_train, sa_y_train)

Fold 1: Accuracy: 88.743%
Fold 2: Accuracy: 89.8%
Fold 3: Accuracy: 89.057%
Fold 4: Accuracy: 88.6%
Fold 5: Accuracy: 88.8%
Fold 6: Accuracy: 89.6%
Fold 7: Accuracy: 89.4%
Fold 8: Accuracy: 88.543%
Fold 9: Accuracy: 89.943%
Fold 10: Accuracy: 88.797%
Average Score: 89.128%
Accuracy Standard Deviation: 0.49
Accuracy Variance: 0.24


In [31]:
# Random Forest Classifier
model = RandomForestClassifier()
evaluate_model(model, sa_X_train, sa_y_train)

Fold 1: Accuracy: 89.114%
Fold 2: Accuracy: 90.143%
Fold 3: Accuracy: 89.343%
Fold 4: Accuracy: 89.143%
Fold 5: Accuracy: 89.2%
Fold 6: Accuracy: 89.971%
Fold 7: Accuracy: 89.857%
Fold 8: Accuracy: 88.971%
Fold 9: Accuracy: 90.257%
Fold 10: Accuracy: 89.111%
Average Score: 89.511%
Accuracy Standard Deviation: 0.464
Accuracy Variance: 0.216


In [33]:
# Gradient Boosting Classifier
model = GradientBoostingClassifier()
evaluate_model(model, sa_X_train, sa_y_train)

Fold 1: Accuracy: 88.514%
Fold 2: Accuracy: 88.771%
Fold 3: Accuracy: 88.6%
Fold 4: Accuracy: 88.714%
Fold 5: Accuracy: 87.286%
Fold 6: Accuracy: 88.543%
Fold 7: Accuracy: 88.657%
Fold 8: Accuracy: 87.486%
Fold 9: Accuracy: 88.457%
Fold 10: Accuracy: 88.568%
Average Score: 88.36%
Accuracy Standard Deviation: 0.497
Accuracy Variance: 0.247


In [20]:
# tf-idf
vectorizer = TfidfVectorizer(max_features=100)
sa_dfx = vectorizer.fit_transform(sa_df["clean_sentences"])

# splitting data
sa_dfy = sa_df["label"]
sa_X_train, sa_X_test, sa_y_train, sa_y_test = train_test_split(sa_dfx.toarray(), sa_dfy, random_state=88, test_size=0.3)

In [21]:
# logistic regression
model = LogisticRegression(solver="liblinear", multi_class="auto")
evaluate_model(model, sa_X_train, sa_y_train)

Model Accuracy: 82.344%


In [40]:
# kNN (works, but takes a long time)
k = 10
model = KNeighborsClassifier(n_neighbors=1)
k_fold_cv(model, k, sa_X_train, sa_y_train)

Fold 1: Accuracy: 87.057%
Fold 2: Accuracy: 80.829%
Fold 3: Accuracy: 86.6%
Fold 4: Accuracy: 87.086%
Fold 5: Accuracy: 81.857%
Fold 6: Accuracy: 81.429%
Fold 7: Accuracy: 81.257%
Fold 8: Accuracy: 80.686%
Fold 9: Accuracy: 81.714%
Fold 10: Accuracy: 87.968%
Average Score: 83.648%
Accuracy Standard Deviation: 2.918
Accuracy Variance: 8.513


In [41]:
# Naive Bayes
k = 10
model = GaussianNB()
k_fold_cv(model, k, sa_X_train, sa_y_train)

Fold 1: Accuracy: 59.8%
Fold 2: Accuracy: 62.829%
Fold 3: Accuracy: 62.4%
Fold 4: Accuracy: 62.2%
Fold 5: Accuracy: 62.314%
Fold 6: Accuracy: 62.8%
Fold 7: Accuracy: 61.514%
Fold 8: Accuracy: 62.029%
Fold 9: Accuracy: 63.571%
Fold 10: Accuracy: 62.646%
Average Score: 62.21%
Accuracy Standard Deviation: 0.956
Accuracy Variance: 0.915


In [43]:
# SVM - svc (works, but takes a long time)
k = 10
model = SVC()
k_fold_cv(model, k, sa_X_train, sa_y_train)

Fold 1: Accuracy: 89.6%
Fold 2: Accuracy: 90.171%
Fold 3: Accuracy: 89.8%
Fold 4: Accuracy: 89.657%
Fold 5: Accuracy: 89.114%
Fold 6: Accuracy: 90.086%
Fold 7: Accuracy: 90.171%
Fold 8: Accuracy: 89.171%
Fold 9: Accuracy: 90.2%
Fold 10: Accuracy: 89.426%
Average Score: 89.74%
Accuracy Standard Deviation: 0.393
Accuracy Variance: 0.155


In [44]:
# Decision Tree Classifier
k = 10
model = DecisionTreeClassifier()
k_fold_cv(model, k, sa_X_train, sa_y_train)

Fold 1: Accuracy: 88.657%
Fold 2: Accuracy: 89.857%
Fold 3: Accuracy: 89.086%
Fold 4: Accuracy: 88.629%
Fold 5: Accuracy: 88.886%
Fold 6: Accuracy: 89.571%
Fold 7: Accuracy: 89.429%
Fold 8: Accuracy: 88.629%
Fold 9: Accuracy: 89.857%
Fold 10: Accuracy: 88.797%
Average Score: 89.14%
Accuracy Standard Deviation: 0.473
Accuracy Variance: 0.224


In [45]:
# Random Forest Classifier
k = 10
model = RandomForestClassifier()
k_fold_cv(model, k, sa_X_train, sa_y_train)

Fold 1: Accuracy: 89.171%
Fold 2: Accuracy: 90.0%
Fold 3: Accuracy: 89.314%
Fold 4: Accuracy: 89.229%
Fold 5: Accuracy: 89.171%
Fold 6: Accuracy: 90.057%
Fold 7: Accuracy: 89.857%
Fold 8: Accuracy: 89.0%
Fold 9: Accuracy: 90.286%
Fold 10: Accuracy: 89.225%
Average Score: 89.531%
Accuracy Standard Deviation: 0.441
Accuracy Variance: 0.195


In [46]:
# Gradient Boosting Classifier
k = 10
model = GradientBoostingClassifier()
k_fold_cv(model, k, sa_X_train, sa_y_train)

Fold 1: Accuracy: 88.514%
Fold 2: Accuracy: 88.771%
Fold 3: Accuracy: 88.6%
Fold 4: Accuracy: 88.714%
Fold 5: Accuracy: 87.8%
Fold 6: Accuracy: 88.543%
Fold 7: Accuracy: 88.657%
Fold 8: Accuracy: 87.486%
Fold 9: Accuracy: 88.457%
Fold 10: Accuracy: 88.568%
Average Score: 88.411%
Accuracy Standard Deviation: 0.4
Accuracy Variance: 0.16


In [20]:
negative_us = us_df[us_df["sentiment"].astype(float) == False]

# tf-idf
vectorizer = TfidfVectorizer(max_features=100)
us_dfx = vectorizer.fit_transform(negative_us["clean_sentences"])

# splitting data
us_dfy = negative_us["negative_reason"]
us_X_train, us_X_test, us_y_train, us_y_test = train_test_split(us_dfx.toarray(), us_dfy, random_state=88, test_size=0.3)

In [45]:
print(us_dfy.unique())

['covid19' 'others' 'discrimination' 'corruption' 'crime']


In [25]:
# logistic regression
k = 10
best_model = None
best_accuracy = 0
for C in [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5]:
    for solver in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
        for regularization_type in ["l1", "l2"]:
            if  regularization_type == "l1" and solver in set(['newton-cg', 'sag', 'lbfgs']):
                continue
            model = LogisticRegression(C=C, solver=solver, penalty=regularization_type)

            print("\nC = {}, Solver = {}, Penalty = {}".format(C, solver, regularization_type))
            cv_acc, cv_var = k_fold_cv(model, k, us_X_train, us_y_train)

            if cv_acc > best_accuracy:
                best_accuracy = cv_acc
                best_model = "\nC = {}, Solver = {}, Penalty = {}, Accuracy = {}%, Var = {}".format(C, solver, regularization_type, round(cv_acc, 3), round(cv_var, 3))
print(best_model)


C = 0.001, Solver = newton-cg, Penalty = l2
Fold 1: Accuracy: 62.5%
Fold 2: Accuracy: 68.293%
Fold 3: Accuracy: 57.143%
Fold 4: Accuracy: 62.791%
Fold 5: Accuracy: 60.417%
Fold 6: Accuracy: 70.213%
Fold 7: Accuracy: 70.455%
Fold 8: Accuracy: 63.043%
Fold 9: Accuracy: 75.0%
Fold 10: Accuracy: 54.762%
Average Score: 64.462%
Accuracy Standard Deviation: 6.061
Accuracy Variance: 36.739

C = 0.001, Solver = lbfgs, Penalty = l2
Fold 1: Accuracy: 62.5%
Fold 2: Accuracy: 68.293%
Fold 3: Accuracy: 57.143%
Fold 4: Accuracy: 62.791%
Fold 5: Accuracy: 60.417%
Fold 6: Accuracy: 70.213%
Fold 7: Accuracy: 70.455%
Fold 8: Accuracy: 63.043%
Fold 9: Accuracy: 75.0%
Fold 10: Accuracy: 54.762%
Average Score: 64.462%
Accuracy Standard Deviation: 6.061
Accuracy Variance: 36.739

C = 0.001, Solver = liblinear, Penalty = l1
Fold 1: Accuracy: 37.5%
Fold 2: Accuracy: 31.707%
Fold 3: Accuracy: 42.857%
Fold 4: Accuracy: 37.209%
Fold 5: Accuracy: 39.583%
Fold 6: Accuracy: 29.787%
Fold 7: Accuracy: 29.545%
Fold 8:

In [24]:
# Decision Tree Classifier
k = 10
best_model = None
best_accuracy = 0
for criterion in ["gini", "entropy"]:
    for splitter in ["best", "random"]:
        model = DecisionTreeClassifier(criterion=criterion, splitter=splitter)

        print("\ncriterion = {}, splitter = {}".format(criterion, splitter))
        cv_acc, cv_var = k_fold_cv(model, k, us_X_train, us_y_train)

        if cv_acc > best_accuracy:
            best_accuracy = cv_acc
            best_model = "\ncriterion = {}, splitter = {}, Accuracy = {}%, Var = {}".format(criterion, splitter, round(cv_acc, 3), round(cv_var, 3))
print(best_model)



criterion = gini, splitter = best
Fold 1: Accuracy: 60.0%
Fold 2: Accuracy: 54.839%
Fold 3: Accuracy: 53.333%
Fold 4: Accuracy: 88.889%
Fold 5: Accuracy: 55.172%
Fold 6: Accuracy: 75.758%
Fold 7: Accuracy: 71.429%
Fold 8: Accuracy: 65.517%
Fold 9: Accuracy: 66.667%
Fold 10: Accuracy: 44.828%
Average Score: 63.643%
Accuracy Standard Deviation: 12.166
Accuracy Variance: 148.024

criterion = gini, splitter = random
Fold 1: Accuracy: 58.621%
Fold 2: Accuracy: 66.667%
Fold 3: Accuracy: 50.0%
Fold 4: Accuracy: 74.194%
Fold 5: Accuracy: 46.429%
Fold 6: Accuracy: 55.882%
Fold 7: Accuracy: 60.606%
Fold 8: Accuracy: 61.29%
Fold 9: Accuracy: 60.0%
Fold 10: Accuracy: 54.545%
Average Score: 58.823%
Accuracy Standard Deviation: 7.529
Accuracy Variance: 56.688

criterion = entropy, splitter = best
Fold 1: Accuracy: 64.516%
Fold 2: Accuracy: 55.172%
Fold 3: Accuracy: 50.0%
Fold 4: Accuracy: 71.429%
Fold 5: Accuracy: 56.667%
Fold 6: Accuracy: 65.517%
Fold 7: Accuracy: 71.429%
Fold 8: Accuracy: 67.857%

In [23]:
# RF
k = 10
best_model = None
best_accuracy = 0
for n_trees in [10, 50, 100, 500, 1000, 1500, 2000]:
    for criterion in ["gini", "entropy"]:
        model = RandomForestClassifier(n_estimators=n_trees, criterion=criterion)

        print("\nn_trees = {}, criterion = {}".format(n_trees, criterion))
        cv_acc, cv_var = k_fold_cv(model, k, us_X_train, us_y_train)

        if cv_acc > best_accuracy:
            best_accuracy = cv_acc
            best_model = "\nn_trees = {}, criterion = {}, Accuracy = {}%, Var = {}".format(n_trees, criterion, round(cv_acc, 3), round(cv_var, 3))
print(best_model)



n_trees = 10, criterion = gini
Fold 1: Accuracy: 55.882%
Fold 2: Accuracy: 58.621%
Fold 3: Accuracy: 47.059%
Fold 4: Accuracy: 75.0%
Fold 5: Accuracy: 58.824%
Fold 6: Accuracy: 64.516%
Fold 7: Accuracy: 61.765%
Fold 8: Accuracy: 70.833%
Fold 9: Accuracy: 62.857%
Fold 10: Accuracy: 56.0%
Average Score: 61.136%
Accuracy Standard Deviation: 7.516
Accuracy Variance: 56.485

n_trees = 10, criterion = entropy
Fold 1: Accuracy: 59.375%
Fold 2: Accuracy: 60.714%
Fold 3: Accuracy: 61.765%
Fold 4: Accuracy: 78.125%
Fold 5: Accuracy: 56.25%
Fold 6: Accuracy: 52.0%
Fold 7: Accuracy: 56.667%
Fold 8: Accuracy: 62.5%
Fold 9: Accuracy: 60.606%
Fold 10: Accuracy: 63.333%
Average Score: 61.134%
Accuracy Standard Deviation: 6.531
Accuracy Variance: 42.658

n_trees = 50, criterion = gini
Fold 1: Accuracy: 62.5%
Fold 2: Accuracy: 61.29%
Fold 3: Accuracy: 50.0%
Fold 4: Accuracy: 75.0%
Fold 5: Accuracy: 50.0%
Fold 6: Accuracy: 70.27%
Fold 7: Accuracy: 67.857%
Fold 8: Accuracy: 73.333%
Fold 9: Accuracy: 71.0

In [26]:
optimal_model = RandomForestClassifier(n_estimators=50, criterion="entropy")

model.fit(us_X_train, us_y_train)

predictions = model.predict(us_X_test)

TN = confusion_matrix(us_y_test, predictions)[0][0]
FP = confusion_matrix(us_y_test, predictions)[0][1]
FN = confusion_matrix(us_y_test, predictions)[1][0]
TP = confusion_matrix(us_y_test, predictions)[1][1]
total = TN + FP + FN + TP
ACC = (TP + TN) / float(total)


print("Model Accuracy on Test Set: {}%".format(round(ACC*100,3)))

Model Accuracy on Test Set: 57.6%
