In [13]:
import pandas as pd
import numpy as np
import string
import nltk
import re
import math
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from string import punctuation

from google.colab import drive
drive.mount('/content/drive/')

%cd /content/drive/My Drive/masinsko/

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/masinsko


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
class MultinomialNaiveBayes:
  def __init__(self, nb_classes, nb_words, pseudocount):
    self.nb_classes = nb_classes
    self.nb_words = nb_words
    self.pseudocount = pseudocount

  def fit(self, X, Y):
    nb_examples = X.shape[0]

    self.priors = np.bincount(Y) / nb_examples
    print('Priors:')
    print(self.priors)

    # Racunamo broj pojavljivanja svake reci u svakoj klasi
    occs = np.zeros((self.nb_classes, self.nb_words))
    for i in range(nb_examples):
      c = Y[i]
      for w in range(self.nb_words):
        cnt = X[i][w]
        occs[c][w] += cnt
    print('Occurences:')
    print(occs)

    # Racunamo P(Rec_i|Klasa) - likelihoods
    self.like = np.zeros((self.nb_classes, self.nb_words))
    for c in range(self.nb_classes):
      for w in range(self.nb_words):
        up = occs[c][w] + self.pseudocount
        down = np.sum(occs[c]) + self.nb_words*self.pseudocount
        self.like[c][w] = up / down
    print('Likelihoods:')
    print(self.like)

  def predict(self, bow):
    # Racunamo P(Klasa|bow) za svaku klasu
    probs = np.zeros(self.nb_classes)
    for c in range(self.nb_classes):
      prob = np.log(self.priors[c])
      for w in range(self.nb_words):
        cnt = bow[w]
        prob += cnt * np.log(self.like[c][w])
      probs[c] = prob
    prediction = np.argmax(probs)
    return prediction

  def predict_multiply(self, bow):
    # Racunamo P(Klasa|bow) za svaku klasu
    # Mnozimo i stepenujemo kako bismo uporedili rezultate sa slajdovima
    probs = np.zeros(self.nb_classes)
    for c in range(self.nb_classes):
      prob = self.priors[c]
      for w in range(self.nb_words):
        cnt = bow[w]
        prob *= self.like[c][w] ** cnt
      probs[c] = prob
    prediction = np.argmax(probs)
    return prediction

In [19]:
data = pd.read_csv("disaster-tweets.csv")

def clean(corpus):
    clean_corpus = []
    for doc in corpus:
        if doc.strip():
            words = wordpunct_tokenize(doc)
            words_lower = [w.lower() for w in words]
            words_filtered = [w for w in words_lower if w not in stop_punc]
            words_cleaned = [re.sub(r'http\S+', '', w) for w in words_filtered]
            words_cleaned = [re.sub(r'[^\w\s]', '', w) for w in words_cleaned if w != '' and 'http' not in w and 'co' not in w]
            words_cleaned = [re.sub(r'[^a-z]', '', w) for w in words_cleaned]
            words_stemmed = [porter.stem(w) for w in words_cleaned]
            if words_stemmed:
                clean_corpus.append(words_stemmed)
    return clean_corpus

corpus = data['text']
porter = PorterStemmer()

stop_punc = set(stopwords.words('english')).union(set(punctuation))
clean_corpus = clean(corpus)

vocab_set = set()
for doc in clean_corpus:
  for word in doc:
    vocab_set.add(word)
vocab = list(vocab_set)

np.set_printoptions(precision=2, linewidth=200)

def occ_score(word, doc):
   return 1 if word in doc else 0

def numocc_score(word, doc):
  return doc.count(word)

def freq_score(word, doc):
  return doc.count(word) / len(doc)

word_freq = {}
for doc in clean_corpus:
    for word in doc:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

# uzimanje 10000 najcescih reci
sorted_words = sorted(word_freq, key=word_freq.get, reverse=True)
top_words = sorted_words[:10000]
vocab_filtered = {word: idx for idx, word in enumerate(top_words)}

for score_fn in [occ_score, numocc_score, freq_score]:
    X = np.zeros((len(clean_corpus), len(vocab_filtered)), dtype=np.float32)
    for doc_idx in range(len(clean_corpus)):
        doc = clean_corpus[doc_idx]
        for word in doc:
            if word in vocab_filtered:
                word_idx = vocab_filtered[word]
                cnt = score_fn(word, doc)
                X[doc_idx][word_idx] = cnt

x_train, x_test, y_train_s, y_test_s = train_test_split(X, data['target'], test_size=0.2)
y_train = [int(label) for label in y_train_s]
y_test = [int(l) for l in y_test_s]

model = MultinomialNaiveBayes(nb_classes=2, nb_words=1000, pseudocount=1)
model.fit(x_train, y_train)

predictions = []
for sample in x_test:
    prediction = model.predict(sample)
    predictions.append(prediction)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# 4b
positive_corpus = [clean_corpus[i] for i in range(len(clean_corpus)) if data['target'][i] == 0]
negative_corpus = [clean_corpus[i] for i in range(len(clean_corpus)) if data['target'][i] == 1]

positive_word_freq = {}
negative_word_freq = {}
for doc in positive_corpus:
    for word in doc:
        if word in positive_word_freq:
            positive_word_freq[word] += 1
        else:
            positive_word_freq[word] = 1

for doc in negative_corpus:
    for word in doc:
        if word in negative_word_freq:
            negative_word_freq[word] += 1
        else:
            negative_word_freq[word] = 1

sorted_positive_words = sorted(positive_word_freq, key=positive_word_freq.get, reverse=True)
sorted_negative_words = sorted(negative_word_freq, key=negative_word_freq.get, reverse=True)

top_positive_words = [word for word in sorted_positive_words[:6] if word.strip()]
top_negative_words = [word for word in sorted_negative_words[:6] if word.strip()]

print("Top 5 words in positive tweets:", top_positive_words)
print("Top 5 words in negative tweets:", top_negative_words)

LR_scores = {}
for word in set(sorted_positive_words).intersection(set(sorted_negative_words)):
    if positive_word_freq[word] >= 10 and negative_word_freq[word] >= 10:
        LR_score = positive_word_freq[word] / negative_word_freq[word]
        LR_scores[word] = LR_score

sorted_LR_scores = sorted(LR_scores.items(), key=lambda x: x[1], reverse=True)

top_LR_words = sorted_LR_scores[:5]
bottom_LR_words = sorted_LR_scores[-5:]

print("Top 5 words with highest LR scores:", top_LR_words)
print("Top 5 words with lowest LR scores:", bottom_LR_words)


Priors:
[0.57 0.43]
Occurences:
[[3.92e+02 2.45e+01 8.45e+00 ... 5.85e-01 6.25e-02 7.34e-01]
 [3.65e+02 8.15e+00 2.07e+01 ... 4.33e-01 9.07e-01 2.61e-01]]
Likelihoods:
[[0.12 0.01 0.   ... 0.   0.   0.  ]
 [0.13 0.   0.01 ... 0.   0.   0.  ]]
Accuracy: 0.6946815495732108
Top 5 words in positive tweets: ['like', 'get', 'amp', 'new', 'go']
Top 5 words in negative tweets: ['fire', 'bomb', 'kill', 'news', 'amp']
Top 5 words with highest LR scores: [('full', 8.4), ('love', 6.944444444444445), ('obliter', 6.583333333333333), ('scream', 6.25), ('let', 5.625)]
Top 5 words with lowest LR scores: [('pm', 0.19767441860465115), ('report', 0.19117647058823528), ('train', 0.1743119266055046), ('warn', 0.16393442622950818), ('kill', 0.11801242236024845)]
