In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from csv import reader
import re
import spacy
import pickle
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english')) 
lemmatizer = WordNetLemmatizer()

nlp = spacy.load('en_core_web_sm')



with open('vocabulary.txt', 'rb') as handle:
    vocabulary = pickle.loads(handle.read())
    
with open('logprior.txt', 'rb') as handle:
    logprior = pickle.loads(handle.read())
    
with open('logLikelihoodPositive.txt', 'rb') as handle:
    logLikelihoodPositive = pickle.loads(handle.read())
    
with open('logLikelihoodNegative.txt', 'rb') as handle:
    logLikelihoodNegative = pickle.loads(handle.read())

    

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags = re.MULTILINE)
    text = re.sub(r'\@\w+|\#', "", text)
    text = re.sub(r"[^a-zA-Z. ]","",text)
    text = re.sub(r'\.+', ".",text)
    text_tokens = word_tokenize(text)
    filtered_words = [word for word in text_tokens if word not in stop_words]
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]  
    return " ".join(lemma_words)

    
my_dict ={}
actualArray =[]
predictedArray = []
    
with open('TestDataset - Sheet1.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    next(csv_reader)
    for row in csv_reader:
        row[1] = row[1].split(',')
        actualArray.append(row[3])
        my_dict[row[0]] = []
        with open(row[4],'r') as file:
            sentence = file.read()
        sentences = sentence.split('.')
        for sentence in sentences:
            doc = nlp(sentence)
            for ent in doc.ents:
                if ent.text in row[1]:
                    sentence = preprocess_text(sentence)
                    my_dict[row[0]].append(sentence)

# print(my_dict['Tony Stark'])


[nltk_data] Downloading package stopwords to /home/rutuja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rutuja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rutuja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
final_dict = {}


for character in my_dict:
    print(character)
    positiveSentences = 0
    negativeSentences = 0
    for sentence in my_dict[character]:
        sumPositive = logprior["positive"]
        sumNegative = logprior["negative"] 
#         words = sentence.split(" ")
        words = sentence.split(" ") 
      
        for word in words:           
            if word in vocabulary:
                sumPositive = sumPositive + logLikelihoodPositive[word]
                sumNegative = sumNegative + logLikelihoodNegative[word]
      
        if abs(sumPositive - sumNegative) >= 0:
            if sumPositive> sumNegative:
                positiveSentences = positiveSentences + 1
            else:
                negativeSentences = negativeSentences + 1
   
    total = positiveSentences + negativeSentences
    if negativeSentences >= 0.30*total:
        final_dict[character] = "evil"
    else:
        final_dict[character] = "good"
    predictedArray.append(final_dict[character]) 
    


correct = 0
length = len(actualArray)
for i in range(length):
    if actualArray[i]==predictedArray[i]:
        correct = correct + 1
      

        
accuracy = correct/length 
print("correct predictions are")
print(correct)
print("accuracy is ")
print(accuracy)

print(final_dict)  

final_dict ={}

Tony Stark
Gabbar
Sauron
Khilji
Bhallaladeva
Amit Shellar
Gandalf
Raju
Mogambo
Kancha
Kaal
Loki
Thanos
Peter Parker
Valentine
Venom
Otto Octavius
Scar
Simba
Lady Tremaine
Shere Khan
Mowgli
Sid Phillips
Woody
Evelyn
Bob
Dolores Umbridge
Robert Callaghan
Jafar
Gaston
Elsa
Maleficent
al Ghul
Kaecilius
Strange
Batman
Harry
Amarendra Bahubali
Bilbo Baggins
Thor
Saruman
Frodo
Farhan
Louisa
Biff Tannen
Hans Gruber
Chucky
Jack Dawson
William
Mark Watney
Rhett
Jim
Forrest
Mia
Simran
Hazel
Holmes
John Watson
Tim
Mary
correct predictions are
33
accuracy is 
0.55
{'Tony Stark': 'good', 'Gabbar': 'evil', 'Sauron': 'good', 'Khilji': 'good', 'Bhallaladeva': 'evil', 'Amit Shellar': 'good', 'Gandalf': 'good', 'Raju': 'evil', 'Mogambo': 'evil', 'Kancha': 'good', 'Kaal': 'evil', 'Loki': 'evil', 'Thanos': 'evil', 'Peter Parker': 'good', 'Valentine': 'evil', 'Venom': 'good', 'Otto Octavius': 'good', 'Scar': 'good', 'Simba': 'good', 'Lady Tremaine': 'good', 'Shere Khan': 'good', 'Mowgli': 'good', 'Sid Phill