In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
import pandas as pd
from sklearn.metrics import accuracy_score

In [4]:
class MultiNB():
    def __init__(self):
        self.theta_y = None
        self.theta_y_one = None
        self.theta_y_zero = None
    
    def fit(self,X,y):
        m,n = X.shape
        positive = np.where(y==1)
        negative = np.where(y==0)
        self.theta_y = np.array(positive).shape[1]/m
        X_positive = X[positive]
        X_negative = X[negative]
        
        
        theta_y1 = np.sum(X[positive],axis=0)+1
        V = n
        d_positive = np.sum(np.count_nonzero(X_positive.toarray(),axis=1))
        self.theta_y_one = theta_y1/(V+d_positive)


        theta_y0 = np.sum(X[negative],axis=0)+1
        d_negative = np.sum(np.count_nonzero(X_negative.toarray(),axis=1))
        self.theta_y_zero = theta_y0/(V+d_negative)
        return
        
    def predict(self,X):
        nonzero = np.array(np.nonzero(X))
        X[nonzero[0],nonzero[1]] = self.theta_y_one[:,nonzero[1]]
        maska = np.ma.masked_equal(X.toarray(),0)
        prodcuts = np.array([0]).astype(np.longdouble)
        products = maska.product(axis=1).data
        ones_prod = products * self.theta_y


        nonzero = np.array(np.nonzero(X))
        X[nonzero[0],nonzero[1]] = self.theta_y_zero[:,nonzero[1]]
        maska = np.ma.masked_equal(X.toarray(),0)
        prodcuts = np.array([0]).astype(np.longdouble)
        products = maska.product(axis=1).data
        zero_prod = products * (1-self.theta_y)

        
        with np.errstate(divide='ignore', invalid='ignore'):
            denominator = ones_prod + zero_prod
            output  = np.true_divide(ones_prod,denominator)
            output[output== np.inf] = 0
            output = np.nan_to_num(output)

        output= np.round(output)
        return output



In [5]:
import re

import nltk
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer 


df = pd.read_csv('./IMDB.csv')
df = df[:3000]

def get_text(text):
    soup = BeautifulSoup(text,'html.parser')
    return soup.get_text()

def rm_special(text):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
df['review'] = df['review'].apply(get_text)
df['review'] = df['review'].apply(rm_special)
df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

stop = set(stopwords.words('english'))

def lemma(text):
    lemmatizer = WordNetLemmatizer() 
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop]
    text = " ".join(text)
    return text
df['review'] = df['review'].apply(lemma)



def stop_words_token(text):
    tokenizer = ToktokTokenizer()
    tokened = tokenizer.tokenize(text)
    text = [token for token in tokened if token not in stop]
    text = ' '.join(text) 
    return text
df['review'] = df['review'].apply(stop_words_token)

y = df['sentiment']
df.drop(['sentiment'],axis=1,inplace=True)

X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.2)




In [6]:
tv=TfidfVectorizer(min_df=0,max_df=1,ngram_range=(1,3))
#tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True)
train_data = tv.fit_transform(X_train['review'])
test_data = tv.transform(X_test['review'])

In [7]:
nb = MultiNB()
nb.fit(train_data,y_train)
pred = nb.predict(test_data)
accuracy_score(y_test,pred)

0.685

In [8]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_data,y_train)
pred = nb.predict(test_data)
accuracy_score(y_test,pred)

0.5033333333333333