In [37]:
import numpy as np
import pandas as pd
import re
import math

In [38]:
# Load the labelled sentences. Every function below reads the class label from
# column 'Y' and the raw sentence text from column 'X'.
# NOTE(review): the 'Unnamed: 0' column in the output is presumably a saved
# index; none of the code below reads it.
data = pd.read_csv(r"TEXT_DATA.csv")
# Echo the frame so the reader can see the training sentences.
data

Unnamed: 0,Y,X
0,Negative,just plain boring
1,Negative,entirely predictable and lacks energy
2,Negative,no suprises and very few laughs
3,Positive,very powerful
4,Positive,the most fun film of the summer
5,Positive,Chinese Bejing Chinese
6,Positive,Chinese Chinese Shingai
7,Positive,Chinese Macao
8,Negative,Tokyo Japan Chinese


In [39]:
'''
prob_class(data) -> Computes the prior probability of each class, P(-) and P(+). Returns a dictionary mapping each class to its probability.
{'Negative': 0.4444444444444444, 'Positive': 0.5555555555555556}

tokenize(string) -> Takes a string as input, separates it into words (runs of alphanumeric characters) and converts them to lower case. Outputs a list of words.
['just', 'plain', 'boring']

vocabulary(data) -> Outputs the count of distinct words in the data.

class_data(data, c) -> Outputs the data for a particular class.

class_data_tokens(data, c) -> Outputs the total count of words (repeats included) in a particular class. With the data set loaded above: + -> 17, - -> 17

word_count_class(data, c, word) -> Outputs the count of occurrences of the word in the class c.

tokenize_unique(string) -> Outputs a dictionary mapping each unique word in the input string to its frequency.

predict(data, text) -> Applies the naive Bayes algorithm to compute P(c) * P(sentence | c) for every class, prints these scores, and returns the class with the highest score as the predicted class for the given text.
'''


"\nprob_class(data) -> Defines the probabilties(prior probabilities of the class) of the class. P(-), p(+). Returns a dictionary of classes with probability.\n{'Negative': 0.6, 'Positive': 0.4}\n\ntokenize(string) -> Takes a string as input and seprates the words(containing alphanumeric characters) and covert them to lower case. Output a list of words.\n['just', 'plain', 'boring']\n\nvocabulary(data) -> Output the count of distinct words in the data.\n\nclass_data(data, c) -> Outputs the data for a particular class.\n\nclass_data_tokens(data, c) -> Outputs the count of words in a particular class. + -> 9, - -> 14\n\nword_count_class(data, c, word) -> Outputs the count of occurences of the word in the class c.\n\ntokenize_unique(string) -> Outputs a string having the frequncies of unique words in the input string.\n"

In [40]:
def prob_class(data):
    """Return the prior probability of every class label found in ``data['Y']``.

    Parameters
    ----------
    data : mapping-like with a 'Y' entry
        ``data['Y']`` must be iterable and support ``len``; each element is
        one row's class label.

    Returns
    -------
    dict
        Maps each label to (rows carrying that label) / (total rows). Keys
        appear in order of first occurrence. Empty when there are no rows.
    """
    labels = data['Y']
    n_rows = len(labels)

    # Tally how many rows carry each label.
    tally = {}
    for label in labels:
        tally[label] = tally.get(label, 0) + 1

    # Turn the raw tallies into relative frequencies.
    return {label: occurrences / n_rows for label, occurrences in tally.items()}

prob_class(data)

{'Negative': 0.4444444444444444, 'Positive': 0.5555555555555556}

In [41]:
def tokenize(string):
    """Split ``string`` into lower-cased word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits and the
    underscore); every other character acts as a separator.

    Parameters
    ----------
    string : str
        Text to split.

    Returns
    -------
    list of str
        Tokens in order of appearance, lower-cased. Empty when ``string``
        contains no word characters.
    """
    # Raw string: in a normal string literal "\w" is an invalid escape
    # sequence (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12).
    # Lower-casing happens after splitting, as before, so token boundaries
    # are decided on the original text.
    return [word.lower() for word in re.findall(r"\w+", string)]

tokenize("Hello name is")

['hello', 'name', 'is']

In [42]:
def vocabulary(data):
    """Return the number of distinct tokens across all sentences in ``data['X']``.

    Each sentence is split with ``tokenize``, so tokens are compared in
    lower-cased form.

    Parameters
    ----------
    data : mapping-like with an 'X' entry
        ``data['X']`` is iterated and each element is passed to ``tokenize``.

    Returns
    -------
    int
        Size of the vocabulary (0 when there are no sentences).
    """
    # A set gives constant-time membership checks; the previous list-based
    # "token not in words" test rescanned the whole vocabulary for each token.
    distinct_words = set()
    for sentence in data['X']:
        distinct_words.update(tokenize(sentence))

    return len(distinct_words)

vocabulary(data)

26

In [43]:
def class_data(data, c):
    """Return the rows of ``data`` whose 'Y' value equals ``c``.

    ``data`` must support ``.loc`` with a boolean mask (e.g. a pandas
    DataFrame). The original row index is kept.
    """
    belongs_to_class = data['Y'] == c
    return data.loc[belongs_to_class]

class_data(data, "Positive")

Unnamed: 0,Y,X
3,Positive,very powerful
4,Positive,the most fun film of the summer
5,Positive,Chinese Bejing Chinese
6,Positive,Chinese Chinese Shingai
7,Positive,Chinese Macao


In [44]:
def class_data_tokens(data, c):
    """Return the total number of tokens (repeats included) in class ``c``.

    The rows of class ``c`` are selected with ``class_data`` and each sentence
    in their 'X' column is split with ``tokenize``.

    Returns
    -------
    int
        Sum of the per-sentence token counts; 0 when the class has no rows.
    """
    sentences = class_data(data, c)['X']
    return sum(len(tokenize(sentence)) for sentence in sentences)

class_data_tokens(data, 'Negative')

17

In [45]:
def word_count_class(data, c, word):
    """Return how many times ``word`` occurs among the tokens of class ``c``.

    Sentences of class ``c`` are split with ``tokenize`` (which lower-cases
    tokens), and ``word`` is compared to each token by equality, so callers
    should pass ``word`` already lower-cased.

    Returns
    -------
    int
        Number of matching tokens; 0 when there are none.
    """
    sentences = class_data(data, c)['X']
    return sum(tokenize(sentence).count(word) for sentence in sentences)

word_count_class(data, "Positive", "fun")

1

In [46]:
def tokenize_unique(text):
    """Return a dictionary mapping each distinct token of ``text`` to its count.

    ``text`` is split with ``tokenize``; keys appear in order of first
    occurrence.
    """
    frequencies = {}

    for word in tokenize(text):
        # setdefault seeds unseen words with 0 so the increment always applies.
        frequencies[word] = frequencies.setdefault(word, 0) + 1

    return frequencies

tokenize_unique("predictable no with no fun")

{'predictable': 1, 'no': 2, 'with': 1, 'fun': 1}

In [47]:
def predict(data, text):
    """Classify ``text`` with naive Bayes using add-one smoothing.

    For every class ``c`` in ``data['Y']`` the score is

        P(c) * product over distinct words w in text of
               ((count(w, c) + 1) / (V + N_c)) ** freq(w)

    where ``count(w, c)`` is ``word_count_class(data, c, w)``, ``V`` is
    ``vocabulary(data)``, ``N_c`` is ``class_data_tokens(data, c)`` and
    ``freq(w)`` is the number of times ``w`` occurs in ``text``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data with class labels in column 'Y' and sentences in 'X'.
    text : str
        Sentence to classify.

    Returns
    -------
    The class label with the highest score. On a tie, the class that appears
    first in ``data['Y']`` wins.

    Raises
    ------
    ValueError
        If ``data`` contains no rows, so there is no class to choose from.

    Side effects
    ------------
    Prints the dictionary of per-class scores before returning.

    Notes
    -----
    Scores are plain products of probabilities, so very long texts can
    underflow to 0.0 for every class.
    """
    word_freqs = tokenize_unique(text)

    # The priors and the vocabulary size do not depend on the class or the
    # word being scored, so compute them once rather than inside the loops.
    priors = prob_class(data)
    if not priors:
        raise ValueError("predict() needs at least one labelled row in data")
    vocab_size = vocabulary(data)

    final = {}
    for c, prior in priors.items():
        # Smoothing denominator: identical for every word of this class.
        denominator = vocab_size + class_data_tokens(data, c)

        # Smoothed P(word | c), raised to the word's frequency in the text.
        likelihoods = [
            math.pow((word_count_class(data, c, word) + 1) / denominator, freq)
            for word, freq in word_freqs.items()
        ]

        # np.prod of an empty list is 1.0, so an empty text scores the prior.
        final[c] = np.prod(likelihoods) * prior

    print(final)

    return max(final, key=final.get)

In [48]:
predict(data, "predictable with no fun")

{'Negative': 5.200003679002601e-07, 'Positive': 3.2500022993766266e-07}


'Negative'

In [49]:
predict(data, "Chinese Chinese Chinese Tokyo Japan")

{'Negative': 9.674425449307166e-08, 'Positive': 8.162796472852923e-07}


'Positive'