#The Multinomial Naive Bayes algorithm is a Bayesian learning approach. 
 
 In this problem, the program uses Bayes' theorem to make an educated guess at the country tag of a name, i.e. the 
 country where the given name is popular. 
 
 It determines the likelihood of each tag for a particular sample and outputs the tag with the highest likelihood.

##Working

The Multinomial Naive Bayes method works well for examining text input and for solving problems that involve several
 classes. 
 
On the basis of prior knowledge of the event's conditions, it calculates the likelihood of occurrence. 
 
 Here, we determine the country tag given the name.

Merge all the names and split them into training (70%) and testing (30%) sets with shuffle = True.

In [None]:
# Read one name per line from each country file; the file name without its
# ".txt" extension is used as the country tag for every name in that file.
file_of_names = ["arabic.txt","us.txt","greek.txt","japan.txt"]
country = []
names = []
for name in file_of_names:
    # The names are Arabic, Greek and Japanese text, so decode explicitly
    # instead of relying on the platform's default encoding.
    # NOTE(review): assumes the files are UTF-8 encoded - confirm.
    with open("/content/sample_data/"+name, encoding="utf-8") as file:
        for line in file:
            # Strip only the line terminator; slicing off the last character
            # would truncate a final line that has no trailing newline.
            names.append(line.rstrip("\n"))
            country.append(name[0:-4])

In [None]:
import pandas as pd

# Pair every name with the country tag it was loaded under.
combined_names = pd.DataFrame(data=dict(names=names, country=country))
# Display ten randomly chosen rows.
combined_names.sample(n=10)

Unnamed: 0,names,country
478,الأستاذ عدوي العجمان,arabic
3245,吉田 千代,japan
2894,Ερμόλαος Νικουλής,greek
207,بهية الشهابي,arabic
1134,Amanda Lamb,us
3944,井上 明美,japan
2752,Βαλεντίνα-Κωνσταντίνα Παπακωνσταντίνου,greek
3652,佐々木 太一,japan
1963,Judy Dean MD,us
652,كسّاب قليبو,arabic


Unnamed: 0,names,country
490,جلنار فراهيد,arabic
2617,Παρέσσα-Πολυξένη Ιωακειμίδου,greek
233,الأستاذ نجم الدّين الخزرج,arabic
3509,山崎 美加子,japan
1357,Steven Murphy,us
3930,加藤 太郎,japan
992,عبد العزيز جار الله,arabic
3307,中村 美加子,japan
2748,Πούλια Παυλή,greek
159,الدكتورة اصيل مرازيق البقوم,arabic


In [None]:
# Build a dataframe from the texts and their labels in a single step.
trainDF = pd.DataFrame({'text': names, 'label': country})

In [None]:
from sklearn import preprocessing
import numpy as np

# Preprocessing approach taken from https://towardsdatascience.com/name-classification-with-naive-bayes-7c5e1415788a
# The encoder maps each distinct label text to its own numeric code.
encoder = preprocessing.OrdinalEncoder()

# Encode the country column of the whole dataset; selecting it with a list
# yields the single-feature 2-D array the encoder expects.
y = encoder.fit_transform(combined_names[['country']].to_numpy())

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into training (70%) and test (30%) sets.
# A fixed random_state makes the shuffle reproducible, so the class boundaries
# and accuracies computed later are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    combined_names['names'],
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def count_vectorize_label(names):
    """Turn the given names into a word-count matrix.

    A fresh CountVectorizer learns its vocabulary from ``names`` and encodes
    the same ``names`` with it. The vocabulary size is printed as a side
    effect.

    Parameters:
        names: the text documents (names) to vectorize.

    Returns:
        The word-count matrix (sparse) with one row per entry of ``names``.
    """
    bag_of_words = CountVectorizer()
    # Learn the vocabulary and produce the count vectors in one call.
    counts = bag_of_words.fit_transform(names)
    print(len(bag_of_words.get_feature_names_out()))
    return counts

In [None]:
# Pair each training name with its encoded country label in one frame.
# y_train is flattened so that the label column is built from a 1-D array.
train_data = {'names': np.array(x_train), 'country': np.array(y_train.flatten())}
train_df = pd.DataFrame(data=train_data)
#train_df = train_df.sort_values(by=['country'], ascending=True)
# Move 'country' into the index as a second level, sort on that level, then
# turn it back into a column, so the rows end up ordered by country code.
train_df = train_df.set_index('country', append=True).sort_index(level=1).reset_index(level=1)
# Names and labels taken in the country-sorted row order.
x_train_sorted = train_df['names']
y_train_sorted = train_df['country']

Count Vectorizing training and test names:

In [None]:
# Slice the country-sorted training names into one array per class and record
# the cumulative row offsets at which each class starts and ends.
train_names = []
names_index = [0]
index = 0
for i in range(4):
    rows = np.where(y_train_sorted == i)[0]
    first_row = rows.min()
    last_row = rows.max()
    train_names.append(np.array(x_train_sorted)[first_row:last_row + 1])
    index += len(train_names[i])
    names_index.append(index)

# Vectorize training and test names together so both share one vocabulary,
# then cut the matrix back into its training rows and test rows.
combined_names = np.concatenate([np.array(x_train_sorted), np.array(x_test)])
word_mat = count_vectorize_label(combined_names)
train_word_mat = word_mat[:len(train_df)]
test_word_mat = word_mat[len(train_df):]
names_index

3043


[0, 732, 1424, 2108, 2800]

3043


[0, 698, 1410, 2106, 2800]

Calculating probability of each class:

In [None]:
# Prior probability of each class: the share of training labels that equal
# the class code.
p_country = {
    label: np.count_nonzero(y_train == [label]) / len(y_train)
    for label in range(4)
}

In [None]:
#logic for this function is referred through youtube video (https://www.youtube.com/watch?v=mqYa0LaA9WI)
def calc_probability(a, country, nonzero_columns, test_name_arr):
    """Return the unnormalized posterior P(country) * prod P(word | country).

    The word likelihoods use additive smoothing:
    ``(count(word, country) + a) / (count(all words, country) + a * vocabulary size)``.

    Parameters:
        a: smoothing constant added to every word count.
        country: integer class code; the training rows of this class are
            ``names_index[country]`` up to (not including)
            ``names_index[country + 1]`` in ``train_word_mat``.
        nonzero_columns: vocabulary column indices that occur in the test name.
        test_name_arr: 1-D array of word counts of the test name, indexed by
            vocabulary column.

    Returns:
        The product of the smoothed word likelihoods, each raised to the
        word's count in the test name, multiplied by the class prior
        ``p_country[country]``.
    """
    # names_index holds cumulative offsets, so a class occupies the half-open
    # range [begin, end); adding 1 to begin would drop its first training row.
    begin = names_index[country]
    end = names_index[country + 1]

    unique_columns = train_word_mat.shape[1]

    # Per-word counts of this class, computed once for the whole vocabulary.
    class_rows = train_word_mat[begin:end]
    word_counts = np.asarray(class_rows.sum(axis=0)).ravel()

    # The denominator is the same for every word: total word count of the
    # class plus the smoothing mass.
    denominator = word_counts.sum() + (a * unique_columns)

    # Words absent from the test name have exponent 0 and contribute a factor
    # of 1, so only the columns present in the name need to be visited.
    likelihood = 1.0
    for column in nonzero_columns:
        word_probability = (word_counts[column] + a) / denominator
        likelihood *= word_probability ** test_name_arr[column]

    posterior_prob = likelihood * p_country[country]
    return posterior_prob

In [None]:
def predict_country(a, index, test_word_mat):
    """Predict the class code of one test name.

    Parameters:
        a: smoothing constant forwarded to ``calc_probability``.
        index: row of ``test_word_mat`` holding the name to classify.
        test_word_mat: word-count matrix of the test names.

    Returns:
        The class code (0-3) with the highest score; the lowest code wins
        when several classes share the highest score.
    """
    test_name_arr = test_word_mat.getrow(index).toarray().flatten()
    nonzero_columns = np.nonzero(test_name_arr)[0]

    # Score the name against each of the four classes.
    scores = [
        calc_probability(a, label, nonzero_columns, test_name_arr)
        for label in range(4)
    ]

    best_score = max(scores)
    return scores.index(best_score)

Prediction and calculation of accuracy:

In [None]:
from sklearn.metrics import confusion_matrix

def calc_accuracy(a, test_word_mat, start, end):
    """Predict the test rows ``start`` .. ``end - 1`` and score them.

    Parameters:
        a: smoothing constant forwarded to ``predict_country``.
        test_word_mat: word-count matrix of the test names.
        start: first test row to predict (inclusive).
        end: last test row to predict (exclusive).

    Returns:
        A tuple ``(y_preds, accuracy)``: the predicted class codes for the
        requested rows and the fraction of them that match ``y_test``.
        ``accuracy`` is NaN when the range is empty.
    """
    y_preds = [predict_country(a, row, test_word_mat) for row in range(start, end)]

    # y_test is a 2-D column of labels; flatten it and take the same rows that
    # were predicted, so each prediction is compared with its own label rather
    # than with the labels at the start of the test set.
    y_true = np.ravel(y_test)[start:end]

    total_predictions = len(y_preds)
    if total_predictions == 0:
        # Accuracy is undefined for an empty range; avoid dividing by zero.
        return y_preds, float("nan")

    correct_predictions = sum(
        int(actual) == predicted for actual, predicted in zip(y_true, y_preds)
    )
    accuracy = correct_predictions / total_predictions
    return y_preds, accuracy

Calculating the accuracy on only 10 records, to demonstrate the prediction and the accuracy calculation.

In [None]:
# Smoothing constant and the half-open range of test rows to score.
a, start, end = 0.001, 0, 10
y_preds, accuracy_10 = calc_accuracy(a, test_word_mat, start, end)
print("Accuracy of 10 records: %", accuracy_10)

Accuracy of 10 records: % 0.9


Prediction of a single name:

In [None]:
# Smoothing constant and a one-row range selecting test row 399.
a, start, end = 0.001, 399, 400
y_preds, accuracy_1 = calc_accuracy(a, test_word_mat, start, end)

def find_country(num):
    """Translate a class number into its country tag.

    Parameters:
        num: integer class code used as an index into the tag list.

    Returns:
        The country tag stored at position ``num``.
    """
    # Tags are listed alphabetically; position in the tuple is the class code.
    tags = ("arabic", "greek", "japan", "us")
    return tags[num]

# Show the selected test name with its true and predicted country tags.
print("Name: ", x_test.iloc[start])
print("Actual country: ", find_country(int(y_test[start, 0])))
print("Predicted country: ", find_country(y_preds[0]))

Name:  鈴木 千代
Actual country:  japan
Predicted country:  japan


Calculation of accuracy for the entire test data set.

In [None]:
# Smoothing constant and a range that covers every test record.
a, start, end = 0.001, 0, len(y_test)
y_preds, accuracy = calc_accuracy(a, test_word_mat, start, end)
print("Accuracy of all the test records: %", accuracy)