In [22]:
import json 
import nltk
import os
import spacy
import sklearn
import numpy
import pathlib
from collections import Counter
from nltk.corpus import stopwords
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# Load the tab-separated test set into `sentences`, keyed by the first column.
# Each value holds the columns at index 1, 2 and 3 as text / sentiment / topic.
# NOTE(review): this is an absolute, machine-specific path -- consider making it
# relative to the project directory so the notebook runs on other machines.
path = "/Users/acemcakmak/Desktop/text-mining-project/sentiment-topic-test.tsv"
sentences = {}
with open(path, "r") as testFile:
    testFile.readline()  # skip the first line (presumably the column header -- confirm)
    for row in testFile:
        row = row.rstrip("\n")
        if not row.strip():
            # A blank (e.g. trailing) line has no columns; indexing it would raise IndexError.
            continue
        elements = row.split("\t")
        sentences[elements[0]] = {"text": elements[1], "sentiment": elements[2], "topic": elements[3]}

for key, value in sentences.items():
    print(f"{{{key}: {value}}}")

{0: {'text': "I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.", 'sentiment': 'negative', 'topic': 'sports'}}
{1: {'text': "Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.", 'sentiment': 'neutral', 'topic': 'movie'}}
{2: {'text': 'The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!', 'sentiment': 'positive', 'topic': 'sports'}}
{3: {'text': 'Zendaya slayed in Dune 2, as she does in all her movies.', 'sentiment': 'positive', 'topic': 'movie'}}
{4: {'text': "While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame.", 'sentiment': 'negative', 'topic': 'sports'}}
{5: {'text': "My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time.", 'sentiment': 'neutral', 'topic': 'book'}}
{6: {'text': 'He said that The Great Gatsby is the best novell ever, and I w

In [None]:
# Raw text of every test item, in the insertion order of the `sentences` dict.
test_sentences = [entry["text"] for entry in sentences.values()]
print(test_sentences)

In [None]:
# Gold sentiment label of every test item, aligned index-by-index with the
# insertion order of the `sentences` dict.
true_sentiments = [entry["sentiment"] for entry in sentences.values()]
print(true_sentiments)

In [None]:
# Train a Multinomial Naive Bayes sentiment classifier on the
# 'tripolar-sentiment-train-v2' folder (token counts re-weighted with TF-IDF)
# and evaluate it on the test sentences.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd / 'tripolar-sentiment-train-v2'
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder))
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)

# Token counts: NLTK tokenisation with NLTK's English stop words removed.
count_vect = CountVectorizer(
    min_df=1,
    tokenizer=nltk.word_tokenize,
    stop_words=stopwords.words('english'),
)
X_train_counts = count_vect.fit_transform(sentiment_train.data)

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB()
clf.fit(X_train_tfidf, sentiment_train.target)

# The test sentences go through the already-fitted vectorizer and transformer.
X_new_counts = count_vect.transform(test_sentences)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
print("Shape of test data:", X_new_tfidf.shape)

predicted = clf.predict(X_new_tfidf)
# Map the predicted class indices back to their category names.
predicted_sentiments = [sentiment_train.target_names[category] for category in predicted]
for doc, sentiment in zip(test_sentences, predicted_sentiments):
    print('%r => %s' % (doc, sentiment))
print(predicted_sentiments)
print("\n")
print(
    classification_report(
        true_sentiments,
        predicted_sentiments,
        labels=sentiment_train.target_names,
        target_names=sentiment_train.target_names,
        digits=3,
    )
)

def important_features_per_class(vectorizer, classifier, n=100, class_names=None):
    """Print the ``n`` top-ranked features of every class of a fitted classifier.

    For each class, features are ranked by ``classifier.feature_count_`` in
    descending order (ties are broken by feature name, also descending) and
    printed one per line as ``<class label> <count> <feature>``. Classes are
    separated by a dashed line.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        classifier: Fitted classifier exposing ``classes_`` and
            ``feature_count_`` (one row of counts per class).
        n: Maximum number of features printed per class.
        class_names: Optional display names, one per class, in the order of
            ``classifier.classes_``. When omitted, a three-class classifier is
            labelled negative / neutral / positive (the original behaviour)
            and any other classifier is labelled with ``str(class label)``.

    Raises:
        ValueError: If ``class_names`` does not have one entry per class.
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names_out()

    if class_names is None:
        if len(class_labels) == 3:
            class_names = ("negative", "neutral", "positive")
        else:
            class_names = [str(label) for label in class_labels]
    elif len(class_names) != len(class_labels):
        raise ValueError(
            "class_names must have one entry per class: expected %d, got %d"
            % (len(class_labels), len(class_names))
        )

    for position, label in enumerate(class_labels):
        if position > 0:
            print("-----------------------------------------")
        print(f"Important words in {class_names[position]} documents")
        ranked = sorted(
            zip(classifier.feature_count_[position], feature_names),
            reverse=True,
        )[:n]
        for count, feature in ranked:
            print(label, count, feature)

# Show the top-ranked features per class for the classifier trained above.
important_features_per_class(vectorizer=count_vect, classifier=clf)


In [None]:
def search_specific_lines(zipped_data, search_term):
    """Yield one formatted line per item that contains ``search_term``.

    An item matches when ``search_term`` is a substring of ``str(element)`` for
    at least one of its elements. Each matching item is reported exactly once,
    even when several of its elements match.

    Args:
        zipped_data: Iterable of items, each itself an iterable of elements
            (for example ``(count, feature_name)`` tuples).
        search_term: Substring to look for.

    Yields:
        ``"Item <position>, <item>"`` where ``<position>`` is the 1-based
        position of the item in ``zipped_data``.
    """
    for item_index, item in enumerate(zipped_data, start=1):
        # any() stops at the first matching element, so an item is never
        # yielded more than once.
        if any(search_term in str(element) for element in item):
            yield f"Item {item_index}, {item}"


In [30]:
from nltk.tokenize import word_tokenize

# Sentences whose tokens are looked up among the classifier's top features.
# They are stored under their own name so that the `sentences` dict loaded
# from the TSV file is not overwritten with a list (which would break
# re-running the cells that call `sentences.items()`).
probe_sentences = ["The film Everything Everywhere All At Once follows Evelyn Wang, a woman drowning under the stress of her family's failing laundromat.", "My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time.", "The film Everything Everywhere All At Once follows Evelyn Wang, a woman drowning under the stress of her family's failing laundromat."]

# Flat list of all tokens, in sentence order.
tokens = [token for sentence in probe_sentences for token in word_tokenize(sentence)]
print(tokens)

['The', 'film', 'Everything', 'Everywhere', 'All', 'At', 'Once', 'follows', 'Evelyn', 'Wang', ',', 'a', 'woman', 'drowning', 'under', 'the', 'stress', 'of', 'her', 'family', "'s", 'failing', 'laundromat', '.', 'My', 'uncle', "'s", 'brother', "'s", 'neighbor', "'s", 'cat', "'s", 'veterinarian', 'David', 'reads', 'the', 'communist', 'manifesto', 'in', 'his', 'spare', 'time', '.', 'The', 'film', 'Everything', 'Everywhere', 'All', 'At', 'Once', 'follows', 'Evelyn', 'Wang', ',', 'a', 'woman', 'drowning', 'under', 'the', 'stress', 'of', 'her', 'family', "'s", 'failing', 'laundromat', '.']


In [32]:
# Look every token up among the 100 highest-count features of class index 0
# (presumably the negative class -- confirm against the training categories).
topn_class1 = sorted(
    zip(clf.feature_count_[0], count_vect.get_feature_names_out()),
    reverse=True,
)[:100]
for token in tokens:
    matches = list(search_specific_lines(topn_class1, token))
    if matches:
        print("\n".join(matches))
    # Separator printed after every token, whether or not it matched.
    print("\n\n")































Item 4, (307.1358555571413, ',')



Item 12, (115.42331721495621, 'today')
Item 16, (103.78591575378886, 'day')
Item 21, (93.74631256426235, 'ca')
Item 22, (92.00547667686779, 'back')
Item 24, (87.09927813197174, 'really')
Item 25, (83.74663662783482, 'want')
Item 26, (82.68845698255832, 'sad')
Item 42, (63.38182815344688, 'last')
Item 47, (60.0637054213502, 'na')
Item 49, (59.14852315522275, 'bad')
Item 50, (59.01916825436475, 'hate')
Item 71, (43.51070341238062, 'cant')
Item 80, (38.208678707803756, 'amp')
Item 81, (37.9911727673925, 'way')
Item 83, (36.66281940345311, 'days')
Item 85, (36.33893751358124, 'make')
Item 89, (35.331007949400835, 'early')
Item 90, (35.02803672610299, 'class')
Item 100, (32.67067439878012, 'wan')



























Item 10, (159.04243698065815, "'s")









Item 1, (524.1101744950885, '.')
Item 1, (524.1101744950885, '.')
Item 2, (472.349054316264, '!')
Item 3, (331.66566343939223, '@')
Item 4, (307.1358555571413, '

In [33]:
# Look every token up among the 100 highest-count features of class index 1
# (presumably the neutral class -- confirm against the training categories).
topn_class2 = sorted(
    zip(clf.feature_count_[1], count_vect.get_feature_names_out()),
    reverse=True,
)[:100]
for token in tokens:
    matches = list(search_specific_lines(topn_class2, token))
    if matches:
        print("\n".join(matches))
    # Separator printed after every token, whether or not it matched.
    print("\n\n")































Item 5, (250.2104459191718, ',')



Item 17, (109.41032507411963, 'apple')
Item 20, (86.87514408159461, 'aapl')
Item 22, (80.0159872894374, 'earnings')
Item 23, (72.64387291252146, 'market')
Item 34, (55.39786825722083, 'trading')
Item 36, (53.93285225422702, 'says')
Item 37, (52.756313098306485, 'amzn')
Item 38, (51.57638312040774, 'declares')
Item 42, (45.658171888249385, 'trade')
Item 43, (44.949673055890834, 'call')
Item 45, (43.727999023125676, 'markets')
Item 47, (43.53758707552413, 'amp')
Item 48, (43.35231751338098, 'nasdaq')
Item 51, (42.247175911145696, 'marketscreener')
Item 61, (37.069673920128686, 'stockmarket')
Item 63, (36.86368679313153, 'financial')
Item 64, (36.5924002742901, 'deal')
Item 68, (35.64314490704441, 'today')
Item 70, (35.467807510545974, 'china')
Item 71, (35.445523199512095, 'presentation')
Item 73, (34.93688846899758, 'bank')
Item 76, (33.998568898398524, 'year')
Item 77, (33.946958978959756, 'update')
Item 82, (31.53704842

In [35]:
# Look every token up among the 100 highest-count features of class index 2
# (presumably the positive class -- confirm against the training categories).
topn_class3 = sorted(
    zip(clf.feature_count_[2], count_vect.get_feature_names_out()),
    reverse=True,
)[:100]
for token in tokens:
    matches = list(search_specific_lines(topn_class3, token))
    if matches:
        print("\n".join(matches))
    # Separator printed after every token, whether or not it matched.
    print("\n\n")































Item 4, (345.0065747773238, ',')



Item 12, (113.71577214860932, 'day')
Item 17, (103.4743666277276, 'thanks')
Item 24, (85.09594030892757, 'today')
Item 32, (71.47741040308709, 'great')
Item 42, (59.67117817457036, 'back')
Item 45, (55.920622306392204, 'happy')
Item 46, (54.03107085813554, 'haha')
Item 48, (51.561125656327704, 'really')
Item 49, (51.01239931842817, 'amp')
Item 53, (45.79388486489903, 'thank')
Item 64, (41.60303219915534, 'ca')
Item 65, (41.037681374208724, 'yay')
Item 69, (39.94357900953002, 'wait')
Item 70, (39.08404625200916, 'glad')
Item 71, (38.644104355039374, 'awesome')
Item 73, (37.80681751117792, 'want')
Item 77, (36.331638807149105, 'make')
Item 81, (35.79939955672274, 'yeah')
Item 82, (35.7519277797636, 'way')
Item 84, (35.49966206110515, 'days')
Item 86, (35.029723408832794, 'say')
Item 87, (34.25358014082221, 'watching')
Item 95, (31.169634172607896, 'na')
Item 98, (30.607730247963016, 'last')



























Item 

In [None]:
# Sweep CountVectorizer's min_df over 1..19 on the 'bipolar-sentiment-train-v1'
# training set. For every value a TF-IDF + Multinomial Naive Bayes model is
# trained, the test sentences are classified, and the accuracy is recorded.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('bipolar-sentiment-train-v1')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder), encoding="latin-1")
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)
accuracies = []

# The stop-word list does not depend on min_df, so it is loaded once.
english_stop_words = stopwords.words('english')

for min_df in range(1, 20):
    count_vect = CountVectorizer(min_df=min_df,
                                 tokenizer=nltk.word_tokenize,
                                 stop_words=english_stop_words)
    X_train_counts = count_vect.fit_transform(sentiment_train.data)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

    X_new_counts = count_vect.transform(test_sentences)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    print("Shape of test data:", X_new_tfidf.shape)
    predicted = clf.predict(X_new_tfidf)

    predicted_sentiments = []
    for doc, category in zip(test_sentences, predicted):
        print('%r => %s' % (doc, sentiment_train.target_names[category]))
        predicted_sentiments.append(sentiment_train.target_names[category])
    print(predicted_sentiments)
    print("\n")
    print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))
    # Accuracy is computed directly instead of reading it from
    # classification_report(output_dict=True): the report only contains an
    # "accuracy" entry when `labels` covers every label present in the data and
    # otherwise reports "micro avg", which made ["accuracy"] raise KeyError.
    # NOTE(review): this applies here if the training set has fewer categories
    # than the test set (e.g. no "neutral") -- confirm against the folder.
    accuracies.append(metrics.accuracy_score(true_sentiments, predicted_sentiments))
print(accuracies)



In [None]:
# Sweep CountVectorizer's min_df over 1..19 on the 'bipolar-sentiment-train-v2'
# training set. For every value a TF-IDF + Multinomial Naive Bayes model is
# trained, the test sentences are classified, and the accuracy is recorded.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('bipolar-sentiment-train-v2')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder), encoding="latin-1")
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)
accuracies = []

# The stop-word list does not depend on min_df, so it is loaded once.
english_stop_words = stopwords.words('english')

for min_df in range(1, 20):
    count_vect = CountVectorizer(min_df=min_df,
                                 tokenizer=nltk.word_tokenize,
                                 stop_words=english_stop_words)
    X_train_counts = count_vect.fit_transform(sentiment_train.data)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

    X_new_counts = count_vect.transform(test_sentences)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    print("Shape of test data:", X_new_tfidf.shape)

    predicted = clf.predict(X_new_tfidf)
    predicted_sentiments = []
    for doc, category in zip(test_sentences, predicted):
        print('%r => %s' % (doc, sentiment_train.target_names[category]))
        predicted_sentiments.append(sentiment_train.target_names[category])
    print(predicted_sentiments)
    print("\n")
    print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))
    # Accuracy is computed directly instead of reading it from
    # classification_report(output_dict=True): the report only contains an
    # "accuracy" entry when `labels` covers every label present in the data and
    # otherwise reports "micro avg", which made ["accuracy"] raise KeyError.
    # NOTE(review): this applies here if the training set has fewer categories
    # than the test set (e.g. no "neutral") -- confirm against the folder.
    accuracies.append(metrics.accuracy_score(true_sentiments, predicted_sentiments))

print(accuracies)



In [None]:
# Sweep CountVectorizer's min_df over 1..19 on the 'tripolar-sentiment-train-v1'
# training set. For every value a TF-IDF + Multinomial Naive Bayes model is
# trained, the test sentences are classified, and the accuracy is recorded.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('tripolar-sentiment-train-v1')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder), encoding="latin-1")
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)
accuracies = []

# The stop-word list does not depend on min_df, so it is loaded once.
english_stop_words = stopwords.words('english')

for min_df in range(1, 20):
    count_vect = CountVectorizer(min_df=min_df,
                                 tokenizer=nltk.word_tokenize,
                                 stop_words=english_stop_words)
    X_train_counts = count_vect.fit_transform(sentiment_train.data)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

    X_new_counts = count_vect.transform(test_sentences)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    print("Shape of test data:", X_new_tfidf.shape)

    predicted = clf.predict(X_new_tfidf)
    predicted_sentiments = []
    for doc, category in zip(test_sentences, predicted):
        print('%r => %s' % (doc, sentiment_train.target_names[category]))
        predicted_sentiments.append(sentiment_train.target_names[category])
    print(predicted_sentiments)
    print("\n")
    print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))
    # Accuracy is computed directly rather than by building the whole report a
    # second time with output_dict=True. The report's "accuracy" entry also
    # only exists when `labels` covers every label present in the data
    # (otherwise "micro avg" is reported), so indexing it could raise KeyError.
    accuracies.append(metrics.accuracy_score(true_sentiments, predicted_sentiments))
print(accuracies)



In [None]:
# Sweep CountVectorizer's min_df over 1..19 on the 'tripolar-sentiment-train-v3'
# training set. For every value a TF-IDF + Multinomial Naive Bayes model is
# trained, the test sentences are classified, and the accuracy is recorded.
cwd = pathlib.Path.cwd()
sentiment_folder = cwd.joinpath('tripolar-sentiment-train-v3')
if not sentiment_folder.exists():
    print('error: path does not exist:', sentiment_folder)

sentiment_train = load_files(str(sentiment_folder), encoding="latin-1")
print("Number of training samples:", len(sentiment_train.data))
print("Categories:", sentiment_train.target_names)
accuracies = []

# The stop-word list does not depend on min_df, so it is loaded once.
english_stop_words = stopwords.words('english')

for min_df in range(1, 20):
    count_vect = CountVectorizer(min_df=min_df,
                                 tokenizer=nltk.word_tokenize,
                                 stop_words=english_stop_words)
    X_train_counts = count_vect.fit_transform(sentiment_train.data)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB().fit(X_train_tfidf, sentiment_train.target)

    X_new_counts = count_vect.transform(test_sentences)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    print("Shape of test data:", X_new_tfidf.shape)

    predicted = clf.predict(X_new_tfidf)
    predicted_sentiments = []
    for doc, category in zip(test_sentences, predicted):
        print('%r => %s' % (doc, sentiment_train.target_names[category]))
        predicted_sentiments.append(sentiment_train.target_names[category])
    print(predicted_sentiments)
    print("\n")
    print(classification_report(true_sentiments, predicted_sentiments, labels=sentiment_train.target_names, target_names=sentiment_train.target_names, digits = 3))
    # Accuracy is computed directly rather than by building the whole report a
    # second time with output_dict=True. The report's "accuracy" entry also
    # only exists when `labels` covers every label present in the data
    # (otherwise "micro avg" is reported), so indexing it could raise KeyError.
    accuracies.append(metrics.accuracy_score(true_sentiments, predicted_sentiments))
print(accuracies)



In [None]:
# VADER sentiment analyzer; run_vader below calls its polarity_scores().
vader_model = SentimentIntensityAnalyzer()
# spaCy pipeline that run_vader uses for sentence splitting, tokens, lemmas and POS tags.
nlp = spacy.load('en_core_web_sm')

def run_vader(textual_unit, lemmatize=False, parts_of_speech_to_consider=None, verbose=0):
    """Score a text with VADER after optional lemmatisation and POS filtering.

    The text is processed with the module-level ``nlp`` pipeline; the selected
    tokens are joined with single spaces and passed to the module-level
    ``vader_model``.

    Args:
        textual_unit: Text to score.
        lemmatize: If true, use each token's lemma instead of its surface
            text. The placeholder lemma ``'-PRON-'`` falls back to the
            surface text.
        parts_of_speech_to_consider: Optional collection of POS tags. When
            given (and non-empty), only tokens whose ``pos_`` is in it are
            passed to VADER.
        verbose: When >= 1, print the input text, the tokens passed to VADER
            and the resulting scores.

    Returns:
        The value returned by ``vader_model.polarity_scores``.
    """
    doc = nlp(textual_unit)
    input_to_vader = []
    for sent in doc.sents:
        for token in sent:
            to_add = token.text
            if lemmatize:
                to_add = token.lemma_
                if to_add == '-PRON-':
                    to_add = token.text
            if parts_of_speech_to_consider and token.pos_ not in parts_of_speech_to_consider:
                continue
            input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the full input rather than the leaked loop variable `sent`,
        # which only held the last sentence and was unbound when the text
        # produced no sentences at all.
        print('Input Sentence:', textual_unit)
        print('Input to VADER:', input_to_vader)
        print('VADER Output:', scores)

    return scores

# Classify every test sentence with VADER and keep the labels for evaluation.
# The compound score is mapped with VADER's documented thresholds:
# >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral.
vader_sentiments = []
for sentence in test_sentences:
    scores = run_vader(sentence, verbose=1)
    compound_score = scores["compound"]
    if compound_score >= 0.05:
        vader_label = "positive"
    elif compound_score <= -0.05:
        vader_label = "negative"
    else:
        vader_label = "neutral"
    vader_sentiments.append(vader_label)
    print("VADER Result:", vader_label)