In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-positive-train-reviews/reviews-pos.csv
/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv
/kaggle/input/imdb-negative-train-reviews/reviews-neg.csv


In [3]:
train_pos_df = pd.read_csv('/kaggle/input/imdb-positive-train-reviews/reviews-pos.csv')
train_neg_df = pd.read_csv('/kaggle/input/imdb-negative-train-reviews/reviews-neg.csv')

In [4]:
train_pos_labels = np.ones(len(train_pos_df.index))
train_neg_labels = np.zeros(len(train_neg_df.index))

In [5]:
reviews = pd.concat([train_pos_df, train_neg_df]).to_numpy()

In [6]:
labels = np.concatenate((train_pos_labels, train_neg_labels))

In [7]:
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

class ReviewProcessor:
    def remove_html_tags(self, text):
        return BeautifulSoup(text).get_text()
    
    def remove_non_alpha_or_space_characters(self, text):        
        return re.sub(r'[^a-zA-Z\s]', '', text)
        
    def remove_short_words(self, text):
        return re.sub(r'\b\w{1,2}\b', '', text)
        
    def remove_stop_words(self, text):
        pattern = re.compile(r'\b(' + r'|'.join(stopwords.words("english")) + r')\b\s*')
        return pattern.sub('', text)
    
    def lemmatize(self, text):                
        lemmatizer = WordNetLemmatizer()
        split_text = text.split()
        split_text[:] = [lemmatizer.lemmatize(word) for word in split_text]
        split_text[:] = [lemmatizer.lemmatize(word, pos='v') for word in split_text]
        return ' '.join(split_text)                
    
    def process(self, text):
        text = self.remove_html_tags(text)
        text = self.remove_non_alpha_or_space_characters(text)
        text = self.remove_short_words(text).lower()
        text = self.remove_stop_words(text)
        return self.lemmatize(text)

In [8]:
import math
import sys

from collections import Counter, defaultdict
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

class NaiveBayesClassifier(ClassifierMixin, BaseEstimator):
    def __init__(self, review_processor):
        self.review_processor = review_processor        
        
    def fit(self, X, y):                                                       
        X, y = check_X_y(X, y, dtype='str')
        self.classes_, self.y_ = np.unique(y, return_inverse=True)                                                     
        self.X_ = X.reshape(len(X))        
        
        self.__group_data_by_class()
        self.__compute_log_class_priors()
        
        self.vocab_ = set()          
        self.class_total_word_counts_ = defaultdict(lambda: 0)
        for c, data in self.grouped_data_.items():                        
            for index, review in enumerate(data):                                
                processed_review = self.review_processor.process(review)
                data[index] = processed_review
                
                split_review = processed_review.split()
                for word in split_review:
                    self.vocab_.add(word) 
                    
                word_counts = Counter(split_review)
                for word, count in word_counts.items():                    
                    self.class_total_word_counts_[c] += count
                    
        self.tf_idf_matrices_ = {}
        vectorizer = TfidfVectorizer(vocabulary=self.vocab_)
        for c, data in self.grouped_data_.items():                                                
            self.tf_idf_matrices_[c] = vectorizer.fit_transform(data).toarray()            
            
        self.tf_idf_matrix_feature_names_ = vectorizer.get_feature_names()
                                    
        return self
    
    def predict(self, X):        
        check_is_fitted(self)                
        X = check_array(X, dtype='str')        
        vocab_size = len(self.vocab_)        
        predictions = np.empty(len(X))        
        for index, review in enumerate(X.reshape(len(X))):            
            predictions[index] = self.__compute_maximum_a_posteriori(review, vocab_size)                    
        
        return predictions
    
    def __group_data_by_class(self):
        self.grouped_data_ = {} 
        for index, c in enumerate(self.classes_):
            self.grouped_data_[c] = self.X_[np.asarray(self.y_ == index).nonzero()]                    
    
    def __compute_log_class_priors(self):
        self.log_class_priors_ = {}
        number_of_samples = len(self.X_)
        for c in self.classes_:            
            self.log_class_priors_[c] = math.log(len(self.grouped_data_[c]) / number_of_samples)        
            
        
    def __compute_maximum_a_posteriori(self, review, vocab_size):
        max_posterior = -sys.maxsize
        most_likely_class = -sys.maxsize
        for c in self.classes_:                
            posterior = self.log_class_priors_[c]
            processed_review = self.review_processor.process(review)                
            word_counts = Counter(processed_review.split())
            total_words_in_class_reviews = self.class_total_word_counts_[c]  
            tf_idf_matrix_column_sums = self.tf_idf_matrices_[c].sum(axis=0)
            for index, word in enumerate(self.vocab_):
                word_count = word_counts[word]
                if word_count == 0: 
                    continue
                tf_idf_matrix_word_column_index = self.tf_idf_matrix_feature_names_.index(word)
                tf_idf_matrix_column_sum = tf_idf_matrix_column_sums[tf_idf_matrix_word_column_index]
                laplace_probability = (tf_idf_matrix_column_sum + 1) / (total_words_in_class_reviews + vocab_size)                
                posterior += (word_count * math.log(laplace_probability))
            if posterior > max_posterior:
                max_posterior = posterior
                most_likely_class = c
        
        return most_likely_class        

In [23]:
def scorer(estimator, X, y):
    predictions = estimator.predict(X)
    correct_predictions = (predictions == y)    
    return correct_predictions.astype(int).mean()

In [None]:
from sklearn.model_selection import cross_validate

naive_bayes_classifier = NaiveBayesClassifier(ReviewProcessor())
cv_model = cross_validate(
    naive_bayes_classifier, reviews, labels, cv=2,
    scoring=scorer, return_estimator=True, return_train_score=True
)

In [None]:
# print(sorted(test_scores.keys()))

In [None]:
# key = 'train_score'
# print(type(test_scores[key]))
# print(test_scores[key])

In [14]:
# # Tests
# # =====

# test_review_1 = "This movie was exceptional!"
# test_review_2 = "I really, really enjoyed 'The Rising'."
# test_review_3 = "I didn't like this film."
# test_review_4 = "I'd recommend this show to anyone."
# test_review_5 = "You'd be silly to spend money on this."
# test_review_6 = "The show was fine sometimes but mostly boring."
# test_reviews_df = pd.DataFrame({'col': [test_review_1, test_review_2, test_review_3, test_review_4, test_review_5, test_review_6]})
# test_X = test_reviews_df.to_numpy()
# test_labels = [1, 1, 0, 1, 0, 0]
# test_y = np.array(test_labels)
# # test_clf = NaiveBayesClassifier(ReviewProcessor())
# # test_clf = test_clf.fit(test_X, test_y)

In [33]:
# from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate

# # Might be simpler to use cross_val_score
# test_naive_bayes_classifier = NaiveBayesClassifier(ReviewProcessor())
# test_scores = cross_validate(
#     test_naive_bayes_classifier, test_X, test_y, scoring=scorer, 
#     cv=RepeatedStratifiedKFold(n_splits=2, n_repeats=2),
#     return_estimator=True, return_train_score=True
# )

In [34]:
# print(sorted(test_scores.keys()))

['estimator', 'fit_time', 'score_time', 'test_score', 'train_score']


In [38]:
# key = 'train_score'
# print(type(test_scores[key]))
# print(test_scores[key])

<class 'numpy.ndarray'>
[1. 1. 1. 1.]


In [31]:
# test_predictions = test_clf.predict(test_X)
# print(test_predictions)

[1. 1. 0. 1. 0. 0.]


In [32]:
# # More tests
# # ==========

# test_test_review_1 = "The movie is exceptional!"
# test_test_review_2 = "I really liked 'The Rising'."
# test_test_review_3 = "I didn't like this show."
# test_test_review_4 = "I'd recommend this show to friends."
# test_test_review_5 = "You'd be stupid to spend money on this."
# test_test_review_6 = "The show was okay sometimes but mostly dull."
# test_test_list = [test_test_review_1, test_test_review_2,
#                   test_test_review_3, test_test_review_4,
#                   test_test_review_5, test_test_review_6]
# test_test_df = pd.DataFrame(test_test_list)
# test_test_X = test_test_df.to_numpy()

# test_test_predictions = test_clf.predict(test_test_X[3:])
# print(test_test_predictions)

[1. 0. 0.]


In [33]:
# # More tests
# # ==========

# test_test_review_1 = "Far away is a quality piece of showmanship!"
# test_test_review_2 = "I really enjoyed 'The Parachute'."
# test_test_review_3 = "This one wasn't for me."
# test_test_review_4 = "I'd tell any person to check this one out."
# test_test_review_5 = "Unfortunately, this was dry, very dry."
# test_test_review_6 = "I hate these kinds of movies!"
# test_test_list = [test_test_review_1, test_test_review_2,
#                   test_test_review_3, test_test_review_4,
#                   test_test_review_5, test_test_review_6]
# test_test_df = pd.DataFrame(test_test_list)
# test_test_X = test_test_df.to_numpy()

# test_test_predictions = test_clf.predict(test_test_X)
# print(test_test_predictions)

[0. 1. 0. 0. 0. 1.]


In [50]:
# # Check if estimator passes sklearn tests
# from sklearn.utils.estimator_checks import check_estimator

# # pass generate_only=True to run all checks instead of failing at the first error
# clf = NaiveBayesClassifier(ReviewProcessor())
# generator = check_estimator(clf, generate_only=True) 
# for estimator, check in generator:
#     try:
#         check(estimator)
#     except Exception as e:
#         print(e, '\n')

cannot reshape array of size 100 into shape (20,) 

cannot reshape array of size 90 into shape (30,) 

cannot reshape array of size 42 into shape (21,) 

cannot reshape array of size 42 into shape (21,) 

"Complex data not supported" does not match "empty vocabulary passed to fit" 

cannot reshape array of size 400 into shape (40,) 

cannot reshape array of size 90 into shape (30,) 

Estimator doesn't check for NaN and inf in fit. NaiveBayesClassifier(review_processor=<__main__.ReviewProcessor object at 0x7f9187b5e950>) cannot reshape array of size 30 into shape (10,)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/estimator_checks.py", line 1463, in check_estimators_nan_inf
    estimator.fit(X_train, y)
  File "<ipython-input-48-2c8ae059d053>", line 16, in fit
    self.X_ = X.reshape(len(X))
ValueError: cannot reshape array of size 30 into shape (10,)
cannot reshape array of size 30 into shape (10,) 

cannot reshape array of size 42 into

