###Apply Naive Bayes algorithm on [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) dataset.

In [54]:
from google.colab import drive
drive.mount('/content/drive/')
%cd drive/MyDrive

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive'
/content/drive/MyDrive


In [55]:
from pathlib import Path
import os
DATA_PATH=Path('/content/drive/MyDrive/NLP')
DATA_PATH.mkdir(exist_ok=True)
if not os.path.exists('/content/drive/MyDrive/NLP/aclImdb'):
     !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz  
     !tar -xf aclImdb_v1.tar.gz -C {DATA_PATH}

In [56]:
import numpy as np
CLASSES = ['neg', 'pos']
PATH=Path('/content/drive/MyDrive/NLP/aclImdb')

def get_texts(path):
    texts,labels = [],[]
    for idx,label in enumerate(CLASSES):
        for fname in (path/label).glob('*.txt'):
            texts.append(fname.open('r', encoding='utf-8').read())
            labels.append(idx)
    #return np.array(texts),np.array(labels)
    return texts, labels

In [57]:
train_texts,train_labels = get_texts(PATH/'train')
test_texts,test_labels = get_texts(PATH/'test')

In [58]:
import pandas as pd
train_df = pd.DataFrame(train_texts, columns=["text"])
train_df["target"] = train_labels

In [59]:
train_df.sample(5)

Unnamed: 0,text,target
11016,"First of all, if you'r a fan of the comic, wel...",0
8469,"Ah, the spirit of '68. The streets of Paris we...",0
9495,"My favorite quote from Crow was, when the car ...",0
3789,I saw most of the episodes of RMFTM as a teena...,0
3430,"Before I begin, you need to know that I am a h...",0


In [60]:
import string
import re
import nltk 
import unicodedata 
from nltk.corpus import stopwords 
nltk.download('stopwords')
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')


from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [61]:
def clean_texts(text):
    #to_lowercase
    text=text.lower() 

    # remove special chars
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # remove_non_ascii
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
     
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)

    #replace all interger occurrences in list of tokenized words with textual representation"""
    text=re.sub(r'\d+', '', text)

    # remove hashtags
    text = re.sub(r'#', '', text)

    # remove extra whitespace
    text = re.sub(' +', ' ', text)
    

    # tokenize tweets
    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    text_tokens = tokenizer.tokenize(text)
   
   
     
    clean_text = []   
    for word in text_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in string.punctuation): # remove punctuation
           
            #stem_word = stemmer.stem(word) # stemming word

            lemmatizer = WordNetLemmatizer()
            lem_word=lemmatizer.lemmatize(word)
            lem_word=lemmatizer.lemmatize(lem_word, pos='v') 
            clean_text.append(lem_word)
            

    return clean_text

In [62]:
train_df["text"] = train_df.text.apply(clean_texts)
train_df.sample(5)

Unnamed: 0,text,target
4858,"[movie, poor, attempt, make, money, use, class...",0
2611,"[beloved, actor, peter, falk, rip, tear, georg...",0
40,"[igor, lunatic, totally, inept, amateurish, at...",0
8397,"[feel, movie, portray, smith, historically, go...",0
4119,"[good, actor, good, performance, cant, mask, p...",0


In [63]:
def build_freq(texts, labels):
    freqs = {}
    for i in range(len(texts)):
      for word in texts[i]:
        if word not in freqs.keys():
          if labels[i] == 0:
            freqs[word] = [0, 1]
          else:
            freqs[word] = [1, 0]

        else:
          if labels[i] == 0:
            freqs[word][1] += 1
          else:
            freqs[word][0] += 1

    return freqs

freq_table = build_freq(train_df['text'], train_df['target'])
v = len(freq_table)
print(v)

65842


In [64]:
freq_table

{'although': [29, 1009],
 'use': [40, 2297],
 'site': [3, 139],
 'quite': [32, 1583],
 'frequently': [3, 68],
 'see': [245, 9725],
 'people': [123, 4759],
 'rat': [8, 949],
 'think': [102, 6397],
 'challenge': [2, 140],
 'plain': [4, 410],
 'enjoyable': [15, 253],
 'film': [537, 21904],
 'watch': [109, 7239],
 'movie': [397, 27800],
 'four': [8, 403],
 'last': [23, 1481],
 'night': [21, 1084],
 'felt': [14, 828],
 'compel': [8, 174],
 'write': [32, 2310],
 'something': [30, 2834],
 'even': [130, 7702],
 'help': [25, 1164],
 'cleanse': [0, 12],
 'againbr': [2, 90],
 'br': [529, 29831],
 'possibly': [10, 427],
 'shallowest': [0, 1],
 'experience': [15, 563],
 'ive': [30, 1812],
 'ever': [51, 3171],
 'main': [19, 1217],
 'character': [127, 6935],
 'play': [101, 3524],
 'danny': [1, 84],
 'dyer': [0, 25],
 'sure': [21, 1409],
 'gillian': [0, 19],
 'anderson': [0, 107],
 'always': [32, 1138],
 'scully': [0, 8],
 'leonard': [0, 71],
 'nimoy': [0, 8],
 'spock': [0, 14],
 'real': [57, 2081],
 

In [65]:
n_pos = 0
n_neg = 0
for key in freq_table.keys():
  n_pos += freq_table[key][0]
  n_neg += freq_table[key][1]
print(n_pos, n_neg)

30729 1512505


In [66]:
p_pos = n_pos/(n_pos+n_neg)
p_neg = n_neg/(n_pos+n_neg)
print(p_pos, p_neg)

0.01991208073435396 0.9800879192656461


In [67]:
def build_conditional_table(freqs): 
    conds = {}
    for word in freqs.keys():
      conds[word] = [(freqs[word][0] + 1)/(n_pos+v), (freqs[word][1] + 1)/(n_neg+v)]
    return conds
cond_props = build_conditional_table(freq_table)
cond_props

{'although': [0.000310652266208282, 0.0006399099817720692],
 'use': [0.00042455809715131874, 0.0014559536020913018],
 'site': [4.142030216110426e-05, 8.870039351296008e-05],
 'quite': [0.0003417174928291102, 0.0010035815951752055],
 'frequently': [4.142030216110426e-05, 4.371662251710175e-05],
 'see': [0.002547348582907912, 0.0061621430521932126],
 'people': [0.0012840293669942322, 0.0030158133794406425],
 'rat': [9.319567986248459e-05, 0.000601895527409372],
 'think': [0.0010665727806484348, 0.004053607983542275],
 'challenge': [3.10652266208282e-05, 8.933396775233837e-05],
 'plain': [5.177537770138033e-05, 0.00026039901238447563],
 'enjoyable': [0.00016568120864441705, 0.0001609278568020847],
 'film': [0.005571030640668524, 0.013878443713581361],
 'watch': [0.0011390583094303673, 0.004587077493098793],
 'movie': [0.004121320065029874, 0.01761399742895574],
 'four': [9.319567986248459e-05, 0.00025596399270882763],
 'last': [0.0002485218129666256, 0.0009389570227586202],
 'night': [0.0

In [75]:
def naive_bayes(text, cond_prop_table):
    sum_ = 0
    for word in text:
      sum_ += np.log10(cond_prop_table.get(word, 0)[0]/cond_prop_table.get(word, 0)[1] )
    return "positive" if sum_ * np.log10(p_pos/p_neg) > 0 else "negative"  

## Make test for sample

In [76]:
sample= train_df["text"].iloc[10]
naive_bayes(sample, cond_props)

'positive'