# **download data**

# Naive Bayes text classification
 

## **Importing the required libraries**

In [6]:
import re
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

## Data preprocessing

In [7]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The substitutions are applied in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes a space;
      2. the clitics ``'s 've n't 're 'd 'll`` are split off with a leading space;
      3. ``,`` and ``!`` are padded with spaces;
      4. ``(``, ``)`` and ``?`` are padded with spaces and prefixed with a
         literal backslash;
      5. runs of two or more whitespace characters collapse to one space.

    Args:
        string: the raw text to clean (a ``str``).

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and lower-cased.
    """
    # Patterns and replacement templates are written as raw strings wherever
    # they contain a backslash, so Python never sees an invalid escape sequence
    # such as "\(" in a normal string literal.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        # re.sub leaves a backslash followed by a non-letter untouched in the
        # replacement template, so these three emit the backslash literally.
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
 


In [16]:

def load_data_and_labels(positive_data_file, negative_data_file, seed=None):
    """
    Loads MR polarity data from files, cleans each sentence and generates labels.

    Each file is read as ISO-8859-1 text with one example per line. Lines are
    stripped, passed through ``clean_str``, labelled 1 (positive file) or
    0 (negative file), and the combined data set is shuffled.

    Args:
        positive_data_file: path of the file holding the positive examples.
        negative_data_file: path of the file holding the negative examples.
        seed: optional integer seed for the shuffle. When ``None`` (default)
            the shuffle draws from NumPy's global random state, exactly as
            before, so ``np.random.seed`` still controls it.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays in the same shuffled order:
        ``x`` holds the cleaned sentences and ``y`` the 0/1 labels.
    """
    def _read_stripped_lines(path):
        # The context manager closes the handle even if reading fails.
        with open(path, "r", encoding='iso-8859-1') as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_stripped_lines(positive_data_file)
    negative_examples = _read_stripped_lines(negative_data_file)

    # Clean every sentence; positives come first, negatives second.
    x = np.array([clean_str(sent) for sent in positive_examples + negative_examples])

    # Generate labels in the same positive-then-negative order.
    positive_labels = [1] * len(positive_examples)
    negative_labels = [0] * len(negative_examples)
    y = np.concatenate([positive_labels, negative_labels], 0)

    # Shuffle sentences and labels with one shared permutation so they stay aligned.
    if seed is None:
        shuffle_indices = np.random.permutation(np.arange(len(y)))
    else:
        shuffle_indices = np.random.RandomState(seed).permutation(np.arange(len(y)))
    return x[shuffle_indices], y[shuffle_indices]


Load data:

In [17]:

# Seed NumPy's global random state before loading: load_data_and_labels
# shuffles the examples, so without a fixed seed the train/test split and
# every reported metric would change from run to run.
SEED = 42
np.random.seed(SEED)

positive_data_file = 'data/rt-polarity.pos'
negative_data_file = 'data/rt-polarity.neg'
x, y = load_data_and_labels(positive_data_file, negative_data_file)

Show the first five cleaned sentences:

In [18]:
# Preview the first five entries of x (the cleaned, shuffled sentences).
x[:5]


array(['flounders due to the general sense that no two people working on the production had exactly the same thing in mind',
       "cusack 's just brilliant in this",
       'there are a couple of things that elevate glory above most of its ilk , most notably the mere presence of duvall',
       'too silly to take seriously',
       'the only way to tolerate this insipid , brutally clueless film might be with a large dose of painkillers'],
      dtype='<U266')

Show data labels: 


In [19]:
# Preview the first five labels (1 = from the positive file, 0 = from the negative file).
y[:5]


array([0, 1, 1, 0, 0])

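Split the shuffled data into a training set and a held-out test set (the last 2,000 examples), and define the mapping from numeric labels to class names.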

In [20]:
# Number of examples held out for testing; the data is already shuffled,
# so the last `test_size` rows serve as the test set.
test_size = 2000
assert 0 < test_size < len(x), "test_size must leave at least one training example"

x_train, y_train = x[:-test_size], y[:-test_size]
x_test, y_test = x[-test_size:], y[-test_size:]

# Human-readable names for the numeric class labels.
label_map = {0: 'negative', 1: 'positive'}


Define the classifier class, with one method that trains and evaluates the model and another that predicts the label of a single sentence.

In [26]:
class NB_Classifier(object):
    """Multinomial naive Bayes text classifier on top of TF-IDF features.

    Args:
        label_map: optional mapping from a predicted class label to its
            display name. When ``None`` (default), ``single_predict`` looks up
            the module-level ``label_map`` at call time, as before.
    """

    def __init__(self, label_map=None):
        # Naive Bayes with Laplace (add-one) smoothing.
        self.model = MultinomialNB(alpha=1)
        # TF-IDF is used to extract features from raw text.
        self.feature_processor = TfidfVectorizer()
        self.label_map = label_map

    def fit(self, x_train, y_train, x_test, y_test):
        """Train on the training split and print train/test metrics.

        The vectorizer is fitted on ``x_train`` only; ``x_test`` is merely
        transformed with it. Prints the training accuracy, the test accuracy
        and a classification report for the test set. Returns ``None``.

        Args:
            x_train: training texts.
            y_train: training labels.
            x_test: test texts.
            y_test: test labels.
        """
        # Learn the vocabulary / IDF weights from the training texts only.
        x_train_fea = self.feature_processor.fit_transform(x_train)
        self.model.fit(x_train_fea, y_train)

        train_accuracy = self.model.score(x_train_fea, y_train)
        print("Training Accuracy：{}".format(round(train_accuracy, 3)))

        # Predict once and reuse the result for both the accuracy and the report.
        x_test_fea = self.feature_processor.transform(x_test)
        y_predict = self.model.predict(x_test_fea)
        test_accuracy = accuracy_score(y_test, y_predict)
        print("Test Accuracy：{}".format(round(test_accuracy, 3)))

        print('Test set evaluate：')
        print(classification_report(y_test, y_predict, target_names=['0', '1']))

    def single_predict(self, text):
        """Classify one text and report the probability of the predicted class.

        NOTE(review): ``text`` is vectorized as given; in this notebook the
        training sentences went through ``clean_str`` first - confirm whether
        the same cleaning should be applied here.

        Args:
            text: a single raw text string.

        Returns:
            A tuple ``(label_name, probability)`` where ``label_name`` comes
            from the label mapping and ``probability`` is the model's
            probability for the predicted class.
        """
        text_fea = self.feature_processor.transform([text])
        predict_idx = self.model.predict(text_fea)[0]

        # predict_proba columns follow the order of model.classes_, so find the
        # predicted label's column instead of using the label value as an index
        # (that only works when the labels happen to be 0..n-1).
        column = int(np.flatnonzero(self.model.classes_ == predict_idx)[0])
        predict_prob = self.model.predict_proba(text_fea)[0][column]

        # Prefer the per-instance mapping; otherwise use the module-level one.
        mapping = self.label_map if self.label_map is not None else label_map
        predict_label = mapping[predict_idx]

        return predict_label, predict_prob


## Initialize and train the classifier.



In [27]:
# Build the classifier, then train it and print the train/test metrics.
nb_classifier = NB_Classifier()
nb_classifier.fit(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

Training Accuracy：0.928
Test Accuracy：0.774
Test set evaluate：
              precision    recall  f1-score   support

           0       0.78      0.77      0.78      1016
           1       0.77      0.78      0.77       984

    accuracy                           0.77      2000
   macro avg       0.77      0.77      0.77      2000
weighted avg       0.77      0.77      0.77      2000



### Single sentence test

In [28]:
# Classify one raw sentence; the result is (label name, probability of that label).
nb_classifier.single_predict("beautiful actors, great movie")


('positive', 0.7148445135187691)

In [29]:
# Classify a second raw sentence; the result is (label name, probability of that label).
nb_classifier.single_predict("it's really boring")


('negative', 0.8787393394531633)