In [14]:
import json 
import nltk
import os
import spacy
import sklearn
import numpy
import pathlib
from collections import Counter
from nltk.corpus import stopwords
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Load the tab-separated test set into `sentences`, keyed by the first column.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to the project directory so the notebook runs elsewhere.
path = "/Users/acemcakmak/Desktop/text-mining-project/sentiment-topic-test.tsv"
sentences = {}
# Explicit encoding so decoding does not depend on the platform's default locale.
with open(path, "r", encoding="utf-8") as testFile:
    testFile.readline()  # discard the first line (presumably the column header)
    for row in testFile:  # stream line by line instead of materialising readlines()
        row = row.rstrip("\n")
        if not row.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise raise IndexError below.
            continue
        elements = row.split("\t")
        sentences[elements[0]] = {
            "text": elements[1],
            "sentiment": elements[2],
            "topic": elements[3],
        }

# Show every parsed record so the result can be checked by eye.
for key, value in sentences.items():
    print("{" + key + ": " + str(value) + "}")

{0: {'text': "I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.", 'sentiment': 'negative', 'topic': 'sports'}}
{1: {'text': "Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.", 'sentiment': 'neutral', 'topic': 'movie'}}
{2: {'text': 'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!', 'sentiment': 'positive', 'topic': 'sports'}}
{3: {'text': 'Zendaya slayed in Dune 2, as she does in all her movies.', 'sentiment': 'positive', 'topic': 'movie'}}
{4: {'text': "While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame.", 'sentiment': 'negative', 'topic': 'sports'}}
{5: {'text': "My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time.", 'sentiment': 'neutral', 'topic': 'book'}}
{6: {'text': 'He said that The Great Gatsby is the best novell ever, and I w

In [12]:
# Pull the raw text of every record out of `sentences`, preserving the
# dict's iteration order, so it can be fed to the classifiers.
test_sentences = [record["text"] for record in sentences.values()]
print(test_sentences)

["I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.", "Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.", 'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!', 'Zendaya slayed in Dune 2, as she does in all her movies.', "While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame.", "My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time.", 'He said that The Great Gatsby is the best novell ever, and I was about to throw hands.', 'I could not look away from this train wrck of a movie, on February 14th of all days.', "The film Everything Everywhere All At Once follows Evelyn Wang, a woman drowning under the stress of her family's failing laundromat.", 'I just finished reading pride and prejudice which had me HOOOKED from the beginning.']


In [13]:
# Gold sentiment label of every record, in the same order as the
# dict's iteration order (and therefore aligned with the sentence texts).
true_sentiments = [record["sentiment"] for record in sentences.values()]
print(true_sentiments)

['negative', 'neutral', 'positive', 'positive', 'negative', 'neutral', 'negative', 'negative', 'neutral', 'positive']


In [20]:
# Train a multinomial Naive Bayes classifier on the 'bipolar-sentiment-train-v1'
# corpus (one sub-folder per category, as read by load_files) and evaluate it
# on the test sentences.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd / 'bipolar-sentiment-train-v1'
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder))
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)

# Bag-of-words counts followed by TF-IDF re-weighting of the training texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentiment_train.data)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB()
clf.fit(X_train_tfidf, sentiment_train.target)

# The test sentences go through the *fitted* vectorizer/transformer
# (transform only) so they share the training vocabulary.
X_new_counts = count_vect.transform(test_sentences)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
print("Shape of test data:", X_new_tfidf.shape)

# Map predicted class indices back to category names.
predicted = clf.predict(X_new_tfidf)
predicted_sentiments = [sentiment_train.target_names[category] for category in predicted]
for doc, label in zip(test_sentences, predicted_sentiments):
    print('%r => %s' % (doc, label))
print(predicted_sentiments)
print("\n")
print(
    classification_report(
        true_sentiments,
        predicted_sentiments,
        labels=sentiment_train.target_names,
        target_names=sentiment_train.target_names,
        digits = 3,
    )
)



Number of training samples: 2000
Categories: ['negative', 'positive']
Shape of test data: (10, 39659)
"I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift." => negative
"Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial." => negative
'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!' => positive
'Zendaya slayed in Dune 2, as she does in all her movies.' => negative
"While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame." => negative
"My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time." => positive
'He said that The Great Gatsby is the best novell ever, and I was about to throw hands.' => negative
'I could not look away from this train wrck of a movie, on February 14th of all days.' => negative
"The film Everything Everywhere All At Once follow

In [5]:
# Train a multinomial Naive Bayes classifier on the 'tripolar-sentiment-train-v1'
# corpus (one sub-folder per category, as read by load_files) and evaluate it
# on the test sentences.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('tripolar-sentiment-train-v1')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder))
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)

# Bag-of-words counts followed by TF-IDF re-weighting of the training texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentiment_train.data)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

# The test sentences go through the fitted vectorizer/transformer (transform
# only) so they share the training vocabulary.
X_new_counts = count_vect.transform(test_sentences)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
print("Shape of test data:", X_new_tfidf.shape)

predicted = clf.predict(X_new_tfidf)
# Build the prediction list fresh for this model. Appending to a list left over
# from another cell raises NameError on a fresh kernel, or makes the list longer
# than true_sentiments, which classification_report rejects.
predicted_sentiments = [sentiment_train.target_names[category] for category in predicted]
for doc, label in zip(test_sentences, predicted_sentiments):
    print('%r => %s' % (doc, label))
print(predicted_sentiments)
print("\n")
print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))



Number of training samples: 15662
Categories: ['negative', 'neutral', 'positive']
Shape of test data: (10, 24682)
"I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift." => negative
"Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial." => negative
'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!' => neutral
'Zendaya slayed in Dune 2, as she does in all her movies.' => neutral
"While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame." => negative
"My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time." => neutral
'He said that The Great Gatsby is the best novell ever, and I was about to throw hands.' => neutral
'I could not look away from this train wrck of a movie, on February 14th of all days.' => negative
"The film Everything Everywhere All At Onc

In [6]:
# Train a multinomial Naive Bayes classifier on the 'tripolar-sentiment-train-v2'
# corpus (one sub-folder per category, as read by load_files) and evaluate it
# on the test sentences.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('tripolar-sentiment-train-v2')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder))
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)

# Bag-of-words counts followed by TF-IDF re-weighting of the training texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentiment_train.data)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

# The test sentences go through the fitted vectorizer/transformer (transform
# only) so they share the training vocabulary.
X_new_counts = count_vect.transform(test_sentences)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
print("Shape of test data:", X_new_tfidf.shape)

predicted = clf.predict(X_new_tfidf)
# Build the prediction list fresh for this model. Appending to a list left over
# from another cell raises NameError on a fresh kernel, or makes the list longer
# than true_sentiments, which classification_report rejects.
predicted_sentiments = [sentiment_train.target_names[category] for category in predicted]
for doc, label in zip(test_sentences, predicted_sentiments):
    print('%r => %s' % (doc, label))
print(predicted_sentiments)
print("\n")
print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))



Number of training samples: 30000
Categories: ['negative', 'neutral', 'positive']
Shape of test data: (10, 48929)
"I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift." => negative
"Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial." => positive
'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!' => negative
'Zendaya slayed in Dune 2, as she does in all her movies.' => negative
"While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame." => negative
"My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time." => positive
'He said that The Great Gatsby is the best novell ever, and I was about to throw hands.' => negative
'I could not look away from this train wrck of a movie, on February 14th of all days.' => negative
"The film Everything Everywhere All At

In [7]:
# Shared NLP resources, created once and reused by run_vader:
# the spaCy pipeline that tokenizes/tags the text ...
nlp = spacy.load('en_core_web_sm')
# ... and the NLTK VADER analyzer that scores the resulting tokens.
vader_model = SentimentIntensityAnalyzer()

def run_vader(textual_unit, lemmatize=False, parts_of_speech_to_consider=None, verbose=0):
    """Score a text with VADER after tokenizing it with spaCy.

    The text is run through the module-level ``nlp`` pipeline, the selected
    tokens are joined with single spaces, and that string is passed to the
    module-level ``vader_model``.

    Args:
        textual_unit: The text to score; passed as-is to ``nlp``.
        lemmatize: If True, use each token's lemma instead of its surface form.
        parts_of_speech_to_consider: Optional collection of tags compared
            against ``token.pos_``. When given (and non-empty) only tokens whose
            tag is in the collection are kept; otherwise every token is kept.
        verbose: When >= 1, print the input text, the token list handed to
            VADER and the resulting scores.

    Returns:
        Whatever ``vader_model.polarity_scores`` returns for the joined tokens
        (a dict with 'neg', 'neu', 'pos' and 'compound' entries).
    """
    doc = nlp(textual_unit)
    input_to_vader = []
    # Iterate over the tokens of the whole document directly; this yields the
    # same tokens in the same order as walking every sentence, without
    # depending on a leaked per-sentence loop variable.
    for token in doc:
        to_add = token.text
        if lemmatize:
            to_add = token.lemma_
            # '-PRON-' is a placeholder lemma (used by older spaCy releases for
            # pronouns); fall back to the surface form so VADER sees a real word.
            if to_add == '-PRON-':
                to_add = token.text
        if parts_of_speech_to_consider and token.pos_ not in parts_of_speech_to_consider:
            continue
        input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the full input rather than the last sentence seen in a loop:
        # that showed only the final sentence of multi-sentence input and
        # raised NameError when the text contained no sentences at all.
        print('Input Sentence:', textual_unit)
        print('Input to VADER:', input_to_vader)
        print('VADER Output:', scores)

    return scores

# Score each test sentence with VADER (verbose, so the details are printed)
# and turn the compound score into a three-way label: above 0.05 is positive,
# below -0.05 is negative, anything in between is neutral.
for sentence in test_sentences:
    scores = run_vader(sentence, verbose=1)
    compound_score = scores["compound"]
    if compound_score > 0.05:
        verdict = "VADER Result: positive"
    elif compound_score < -0.05:
        verdict = "VADER Result: negative"
    else:
        verdict = "VADER Result: neutral"
    print(verdict)


Input Sentence: I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
Input to VADER: ['I', 'would', "n't", 'be', 'caught', 'dead', 'watching', 'the', 'NFL', 'if', 'it', 'were', "n't", 'for', 'Taylor', 'Swift', '.']
VADER Output: {'neg': 0.088, 'neu': 0.721, 'pos': 0.191, 'compound': 0.431}
VADER Result: positive

Input Sentence: Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.
Input to VADER: ['Chris', "O'Donnell", 'stated', 'that', 'while', 'filming', 'for', 'this', 'movie', ',', 'he', 'felt', 'like', 'he', 'was', 'in', 'a', 'Toys', "''", 'R', "''", 'Us', 'commercial', '.']
VADER Output: {'neg': 0.0, 'neu': 0.884, 'pos': 0.116, 'compound': 0.3612}
VADER Result: positive

Input Sentence: The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!
Input to VADER: ['The', 'whole', 'game', 'was', 'a', 'rollercoaster', 'ride', ',', 'but', 'Los', 'Angeles', 'Lakers', 'ultima

In [8]:
# Train a multinomial Naive Bayes classifier on the 'tripolar-sentiment-train-v3'
# corpus (one sub-folder per category, as read by load_files) and evaluate it
# on the test sentences.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('tripolar-sentiment-train-v3')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder))
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)

# Bag-of-words counts followed by TF-IDF re-weighting of the training texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentiment_train.data)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

# The test sentences go through the fitted vectorizer/transformer (transform
# only) so they share the training vocabulary.
X_new_counts = count_vect.transform(test_sentences)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
print("Shape of test data:", X_new_tfidf.shape)

predicted = clf.predict(X_new_tfidf)
# Build the prediction list fresh for this model. Appending to a list left over
# from another cell raises NameError on a fresh kernel, or makes the list longer
# than true_sentiments, which classification_report rejects.
predicted_sentiments = [sentiment_train.target_names[category] for category in predicted]
print(predicted_sentiments)
print("\n")
print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))



Number of training samples: 27480
Categories: ['negative', 'neutral', 'positive']
Shape of test data: (10, 26438)
"I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift." => neutral
"Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial." => neutral
'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!' => neutral
'Zendaya slayed in Dune 2, as she does in all her movies.' => neutral
"While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame." => neutral
"My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time." => neutral
'He said that The Great Gatsby is the best novell ever, and I was about to throw hands.' => positive
'I could not look away from this train wrck of a movie, on February 14th of all days.' => neutral
"The film Everything Everywhere All At Once f

In [9]:
# Train a multinomial Naive Bayes classifier on the 'bipolar-sentiment-train-v2'
# corpus (one sub-folder per category, as read by load_files) and apply it to
# the test sentences.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('bipolar-sentiment-train-v2')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder))
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)

# Bag-of-words counts followed by TF-IDF re-weighting of the training texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentiment_train.data)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

# The test sentences go through the fitted vectorizer/transformer (transform
# only) so they share the training vocabulary.
X_new_counts = count_vect.transform(test_sentences)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
print("Shape of test data:", X_new_tfidf.shape)

predicted = clf.predict(X_new_tfidf)
# Build the prediction list fresh for this model. Appending to a list left over
# from another cell raises NameError on a fresh kernel, or makes the list longer
# than true_sentiments, which classification_report rejects.
predicted_sentiments = [sentiment_train.target_names[category] for category in predicted]
print(predicted_sentiments)
print("\n")
# NOTE(review): the recorded run lists this corpus' categories as
# 'objective'/'subjective', while true_sentiments holds
# negative/neutral/positive labels, so this report compares disjoint label
# sets. Confirm the intended gold labels or mapping before trusting it.
print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))



Number of training samples: 10000
Categories: ['objective', 'subjective']
Shape of test data: (10, 20893)
"I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift." => subjective
"Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial." => subjective
'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!' => objective
'Zendaya slayed in Dune 2, as she does in all her movies.' => objective
"While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame." => subjective
"My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time." => objective
'He said that The Great Gatsby is the best novell ever, and I was about to throw hands.' => subjective
'I could not look away from this train wrck of a movie, on February 14th of all days.' => objective
"The film Everything Everywhere Al