In [1]:
import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import pandas
import pathlib
import numpy
from collections import Counter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import spacy

nlp = spacy.load('en_core_web_sm') # 'en_core_web_sm'
nltk.download('stopwords')
vader_model = SentimentIntensityAnalyzer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\feter\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
path  = pandas.read_csv('C:/Users/feter/OneDrive/Bureaublad/AI/AI Year 3/p4/text mining/sentiment-topic-test.tsv', sep='\t')

In [3]:
#kaggle_dataset = pandas.read_csv(path, encoding= 'unicode_escape')

In [4]:
# for sent in path:
#     scores = vader_model.polarity_scores(sent)
#     print()
#     print('INPUT SENTENCE', sent)
#     print('VADER OUTPUT', scores)


In [5]:
def run_vader(textual_unit, 
              lemmatize=False, 
              parts_of_speech_to_consider=None,
              verbose=0):
    """
    Run VADER on a sentence from spacy
    
    :param str textual unit: a textual unit, e.g., sentence, sentences (one string)
    (by looping over doc.sents)
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    :param set parts_of_speech_to_consider:
    -None or empty set: all parts of speech are provided
    -non-empty set: only these parts of speech are considered.
    :param int verbose: if set to 1, information is printed
    about input and output
    
    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)
        
    input_to_vader = []

    for sent in doc.sents:
        for token in sent:

            to_add = token.text

            if lemmatize:
                to_add = token.lemma_

                if to_add == '-PRON-': 
                    to_add = token.text

            if parts_of_speech_to_consider:
                if token.pos_ in parts_of_speech_to_consider:
                    input_to_vader.append(to_add) 
            else:
                input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))
    
    if verbose >= 1:
        print()
        print('INPUT SENTENCE', sent)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores

In [6]:
def vader_output_to_label(vader_output):
    """
    map vader output e.g.,
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
    to one of the following values:
    a) positive float -> 'positive'
    b) 0.0 -> 'neutral'
    c) negative float -> 'negative'
    
    :param dict vader_output: output dict from vader
    
    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'
    """
    compound = vader_output['compound']
    
    if compound < 0:
        return 'negative'
    elif compound == 0.0:
        return 'neutral'
    elif compound > 0.0:
        return 'positive'
    
assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.0}) == 'neutral'
assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.01}) == 'positive'
assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': -0.01}) == 'negative'

In [19]:
gold =[]
all_vader_output=[]
target_names = ['negative', 'neutral', 'positive']
for id_, sent in enumerate(path['text']):
    #print(sent)
    run_vader(sent,lemmatize=True, verbose=1)
    print(vader_output_to_label(run_vader(sent, lemmatize=True)))
    gold.append(path['sentiment'][id_])
    #all_vader_output.append(vader_output_to_label(vader_model.polarity_scores(sent)))
    all_vader_output.append(vader_output_to_label(run_vader(sent, lemmatize=True)))
print(classification_report(gold, all_vader_output, target_names=target_names))



INPUT SENTENCE I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
INPUT TO VADER ['I', 'would', 'not', 'be', 'catch', 'dead', 'watch', 'the', 'NFL', 'if', 'it', 'be', 'not', 'for', 'Taylor', 'Swift', '.']
VADER OUTPUT {'neg': 0.088, 'neu': 0.721, 'pos': 0.191, 'compound': 0.431}
positive

INPUT SENTENCE Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.
INPUT TO VADER ['Chris', "O'Donnell", 'state', 'that', 'while', 'film', 'for', 'this', 'movie', ',', 'he', 'feel', 'like', 'he', 'be', 'in', 'a', 'Toys', "''", 'R', "''", 'Us', 'commercial', '.']
VADER OUTPUT {'neg': 0.0, 'neu': 0.884, 'pos': 0.116, 'compound': 0.3612}
positive

INPUT SENTENCE The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!
INPUT TO VADER ['the', 'whole', 'game', 'be', 'a', 'rollercoaster', 'ride', ',', 'but', 'Los', 'Angeles', 'Lakers', 'ultimately', 'persevere', 'and', 'win', '!']
VADER OUT

In [9]:
pathnerc  = open('C:/Users/feter/OneDrive/Bureaublad/AI/AI Year 3/p4/text mining/train.txt')

this was used as a test set
https://www.kaggle.com/datasets/angevalli/entity-and-type-recognition-from-sentence?select=train.txt

In [10]:
training_features_kaggle = []
training_gold_labels_kaggle = []
for instance in pathnerc:
    new_instance = instance.split()
    
    if len(new_instance) > 1:
        a_dict = {'words': new_instance[0], 
       # add features
        }
        training_features_kaggle.append(a_dict)
        training_gold_labels_kaggle.append(new_instance[1])
print(training_features_kaggle[0:100]) 
print(training_gold_labels_kaggle[0:100])
print(len(training_features_kaggle), len(training_gold_labels_kaggle))

[{'words': 'EU'}, {'words': 'rejects'}, {'words': 'German'}, {'words': 'call'}, {'words': 'to'}, {'words': 'boycott'}, {'words': 'British'}, {'words': 'lamb'}, {'words': '.'}, {'words': 'The'}, {'words': 'European'}, {'words': 'Commission'}, {'words': 'said'}, {'words': 'on'}, {'words': 'Thursday'}, {'words': 'it'}, {'words': 'disagreed'}, {'words': 'with'}, {'words': 'German'}, {'words': 'advice'}, {'words': 'to'}, {'words': 'consumers'}, {'words': 'to'}, {'words': 'shun'}, {'words': 'British'}, {'words': 'lamb'}, {'words': 'until'}, {'words': 'scientists'}, {'words': 'determine'}, {'words': 'whether'}, {'words': 'mad'}, {'words': 'cow'}, {'words': 'disease'}, {'words': 'can'}, {'words': 'be'}, {'words': 'transmitted'}, {'words': 'to'}, {'words': 'sheep'}, {'words': '.'}, {'words': 'Germany'}, {'words': "'s"}, {'words': 'representative'}, {'words': 'to'}, {'words': 'the'}, {'words': 'European'}, {'words': 'Union'}, {'words': "'s"}, {'words': 'veterinary'}, {'words': 'committee'}, {'wo

In [11]:
test = pandas.read_csv('C:/Users/feter/OneDrive/Bureaublad/AI/AI Year 3/p4/text mining/NER-test.tsv', sep='\t')

In [12]:
test_features_kaggle = []
test_gold_labels_kaggle = []

for instance in test['BIO NER tag']:
    test_gold_labels_kaggle.append(instance)
    
for instance in test['token']:
    

    

        a_dict = {'words': instance, 
       # add features
        }
        test_features_kaggle.append(a_dict)
        #training_gold_labels_kaggle.append(new_instance[1])
print(test_features_kaggle[0:100]) 
print(test_gold_labels_kaggle[0:100])

[{'words': 'I'}, {'words': 'would'}, {'words': "n't"}, {'words': 'be'}, {'words': 'caught'}, {'words': 'dead'}, {'words': 'watching'}, {'words': 'the'}, {'words': 'NFL'}, {'words': 'if'}, {'words': 'it'}, {'words': 'were'}, {'words': "n't"}, {'words': 'for'}, {'words': 'Taylor'}, {'words': 'Swift'}, {'words': '.'}, {'words': 'Chris'}, {'words': "O'Donnell"}, {'words': 'stated'}, {'words': 'that'}, {'words': 'while'}, {'words': 'filming'}, {'words': 'for'}, {'words': 'this'}, {'words': 'movie'}, {'words': ','}, {'words': 'he'}, {'words': 'felt'}, {'words': 'like'}, {'words': 'he'}, {'words': 'was'}, {'words': 'in'}, {'words': 'a'}, {'words': 'Toys'}, {'words': "''"}, {'words': 'R'}, {'words': "''"}, {'words': 'Us'}, {'words': 'commercial'}, {'words': '.'}, {'words': 'The'}, {'words': 'whole'}, {'words': 'game'}, {'words': 'was'}, {'words': 'a'}, {'words': 'rollercoaster'}, {'words': 'ride'}, {'words': ','}, {'words': 'but'}, {'words': 'Los'}, {'words': 'Angeles'}, {'words': 'Lakers'}, {

In [13]:
vec = DictVectorizer()
concat_kaggle = training_features_kaggle + test_features_kaggle
the_array = vec.fit_transform(concat_kaggle)
#print(the_array)
training_features_kaggle_split = the_array[0:len(training_features_kaggle)]
test_features_kaggle_split = the_array[len(training_features_kaggle):]
the_array.shape

(203814, 23652)

In [14]:
lin_clf = svm.LinearSVC()

In [15]:
lin_clf.fit(training_features_kaggle_split, training_gold_labels_kaggle)
y_pred = lin_clf.predict(test_features_kaggle_split)
print(classification_report(test_gold_labels_kaggle, y_pred))

               precision    recall  f1-score   support

       B-DATE       0.00      0.00      0.00         1
        B-LOC       0.00      0.00      0.00         0
        B-ORG       0.00      0.00      0.00         3
        B-PER       0.40      0.67      0.50         3
     B-PERSON       0.00      0.00      0.00         3
B-WORK_OF_ART       0.00      0.00      0.00         4
       I-DATE       0.00      0.00      0.00         1
        I-LOC       0.00      0.00      0.00         0
        I-ORG       0.00      0.00      0.00         6
        I-PER       0.00      0.00      0.00         1
     I-PERSON       0.00      0.00      0.00         2
I-WORK_OF_ART       0.00      0.00      0.00         9
            O       0.87      1.00      0.93       160

     accuracy                           0.84       193
    macro avg       0.10      0.13      0.11       193
 weighted avg       0.73      0.84      0.78       193



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
