## Importing required libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only) so the dataset folder under MyDrive is reachable.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Work from the project data folder so the relative CSV path used below resolves.
%cd /content/drive/MyDrive/nlpprojectdata/

/content/drive/MyDrive/nlpprojectdata


## 1. Loading Dataset

In [None]:
# The first CSV column is a saved row index (it otherwise shows up as a stray
# "Unnamed: 0" column), so read it as the index rather than as data.
df_lang_token = pd.read_csv('nlpdataset.csv', index_col=0)
df_lang_token.head()

Unnamed: 0,names,language
0,Абзагә,Abkhaz
1,Адамыр,Abkhaz
2,Ажьгьери,Abkhaz
3,Аҟәлангьери,Abkhaz
4,Алиас,Abkhaz


In [None]:
# Number of samples per language label, to see how balanced the classes are.
df_lang_token['language'].value_counts()

Arabic              1332
japanese            1000
english              999
greek                999
Ancient Greek        270
Albanian             179
Afrikaans             33
Alemannic German      19
Abkhaz                12
Afar                   8
Akan                   2
Adyghe                 1
Name: language, dtype: int64

Here, we can observe the sample count of each label in the dataset.

## 2. Preprocessing

In [None]:
# Replace missing values with a single blank string. This is applied to every
# column, not only 'names'.
# NOTE(review): presumably done so the vectorizer below never sees NaN — confirm.
df_lang_token = df_lang_token.fillna(' ')

In [None]:
# Turn each name into a vector of token counts.
vect = CountVectorizer()
vec_names = vect.fit_transform(df_lang_token['names'])
# toarray() returns a dense ndarray directly; todense() returns an np.matrix,
# which then needed an extra np.array() conversion.
vec_names_array = vec_names.toarray()
vec_names_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Applying CountVectorizer as a preprocessing step gives a vectorized (token-count) form of the input names as a sparse matrix, which is then converted to a dense NumPy array.

## 3. Splitting the dataset using train_test_split

In [None]:
# Features are the dense count vectors; targets are the language names.
X_vecinput = vec_names_array
Y_labels = df_lang_token['language']

# 70/30 shuffled split with a fixed seed so reruns give the same partition.
x_train, x_test, y_train, y_test = train_test_split(
    X_vecinput,
    Y_labels,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

In [None]:
# (rows, columns) of the training feature matrix.
x_train.shape

(3397, 3542)

In [None]:
# (rows, columns) of the test feature matrix.
x_test.shape

(1457, 3542)

The split has been performed using a 70 (training) : 30 (testing) ratio with shuffle=True. We have also set random_state in this split so that repeated executions on the same dataset shuffle identically and produce the same train and test sets.

## 4. Implementing NB classifier from scratch

In [None]:
class NB:
    """Multinomial naive Bayes classifier for non-negative count features.

    Parameters
    ----------
    smoothing : float, default 1
        Additive (Laplace) smoothing constant added to every per-class feature
        count before normalising. ``0`` disables smoothing.

    Attributes (set by ``fit``)
    ---------------------------
    labels : ndarray
        Sorted unique class labels seen during training.
    prior_probab : ndarray of shape (n_labels,)
        Fraction of training rows belonging to each label.
    probab_likelehood : ndarray of shape (n_labels, n_features)
        Smoothed per-label feature probabilities; each row sums to 1 unless the
        label has no counts at all and smoothing is 0 (then the row is all 0).
    """

    def __init__(self, smoothing=1):
        self.smoothing = smoothing

    def prediction(self, xtest1):
        """Return the predicted label for every row of ``xtest1`` as a list."""
        rows = np.asarray(xtest1)
        if rows.ndim >= 1 and rows.shape[0] == 0:
            return []
        scores = self._joint_log_probab(rows)
        return list(self.labels[np.argmax(scores, axis=1)])

    def fit(self, xtrain, ytrain):
        """Estimate priors and likelihoods from ``xtrain`` / ``ytrain``.

        ``xtrain`` is a 2-D array of counts (n_samples, n_features) and
        ``ytrain`` is any 1-D sequence of n_samples labels (list, ndarray,
        pandas Series, ...).

        Raises
        ------
        ValueError
            If ``xtrain`` is not 2-D or the number of labels does not match
            the number of rows.
        """
        counts = np.asarray(xtrain)
        # Converting to an ndarray makes the label mask below purely positional,
        # whatever container (or pandas index) the labels arrived in.
        targets = np.asarray(ytrain)
        if counts.ndim != 2:
            raise ValueError("xtrain must be 2-D: (n_samples, n_features)")
        n_samples, n_features = counts.shape
        if targets.shape != (n_samples,):
            raise ValueError("ytrain must hold exactly one label per xtrain row")

        self.labels = np.unique(targets)
        n_labels = len(self.labels)

        # Initialising likelihood and prior probabilities
        self.prior_probab = np.zeros(n_labels)
        self.probab_likelehood = np.zeros((n_labels, n_features))

        # Calculating likelihood and prior probabilities
        for idx, label in enumerate(self.labels):
            class_rows = counts[targets == label]
            self.prior_probab[idx] = class_rows.shape[0] / n_samples
            # Laplace smoothing
            smoothed = class_rows.sum(axis=0) + self.smoothing
            total = smoothed.sum()
            # With smoothing=0 a label whose rows are all empty has total == 0;
            # leave its likelihood row at 0 instead of producing 0/0 = NaN.
            if total > 0:
                self.probab_likelehood[idx, :] = smoothed / total

    def likelihood_probab_cal(self, likelihood_clss, xtest2):
        """Return the element-wise ``xtest2 * log(likelihood_clss)`` terms.

        Features with a zero count contribute exactly 0, even when their
        likelihood is 0 (the ``0 * log 0 = 0`` convention), so unsmoothed
        models do not produce NaN.
        """
        counts = np.asarray(xtest2, dtype=float)
        with np.errstate(divide="ignore", invalid="ignore"):
            terms = counts * np.log(likelihood_clss)
        return np.where(counts == 0, 0.0, terms)

    # Helper function for the prediction function
    def prediction_helper(self, xtest2):
        """Return the most probable label for a single feature vector."""
        row = np.asarray(xtest2).reshape(1, -1)
        scores = self._joint_log_probab(row)[0]
        return self.labels[np.argmax(scores)]

    def _joint_log_probab(self, rows):
        """Return ``log P(c) + sum_j x_j * log P(j | c)`` per (row, label)."""
        counts = np.asarray(rows, dtype=float)
        if counts.ndim != 2:
            raise ValueError("expected a 2-D array: (n_samples, n_features)")

        # Logs are taken once per call rather than once per row and label.
        with np.errstate(divide="ignore"):
            log_prior = np.log(self.prior_probab)
            log_likelihood = np.log(self.probab_likelehood)

        # Zero likelihoods (only possible without smoothing) are handled
        # separately so that 0 * -inf never turns a score into NaN.
        impossible = np.isneginf(log_likelihood)
        finite_log_likelihood = np.where(impossible, 0.0, log_likelihood)
        scores = counts @ finite_log_likelihood.T + log_prior

        if impossible.any():
            # A row that contains a feature the label assigns probability 0
            # cannot belong to that label.
            blocked = (counts > 0).astype(float) @ impossible.T.astype(float)
            scores[blocked > 0] = -np.inf
        return scores

    # Calculating the accuracy
    def test_accuracy(self, xtest1, ytest):
        """Return the fraction of rows in ``xtest1`` predicted as ``ytest``.

        Raises
        ------
        ValueError
            If ``ytest`` does not hold exactly one label per row of ``xtest1``.
        """
        # Object arrays force an element-wise comparison; comparing the raw
        # prediction list with a list/tuple would yield a single boolean.
        predicted = np.asarray(self.prediction(xtest1), dtype=object)
        actual = np.asarray(ytest, dtype=object)
        if predicted.shape != actual.shape:
            raise ValueError("ytest must hold exactly one label per xtest1 row")
        return np.sum(predicted == actual) / len(actual)

Here we have implemented the multinomial naive Bayes algorithm to achieve the desired classification.

In [None]:
# Classifier with the default Laplace smoothing (smoothing=1).
model2 = NB()

In [None]:
# Estimate class priors and per-class feature likelihoods from the training split.
model2.fit(x_train,y_train)

In [None]:
# Predicted language label for every row of the test split.
ypred1 = model2.prediction(x_test)

In [None]:
# Preview the first few predictions instead of dumping the whole list.
ypred1[:10]

['Arabic',
 'greek',
 'greek',
 'Arabic',
 'greek',
 'japanese',
 'Arabic',
 'Arabic',
 'Arabic',
 'japanese',
 'Arabic',
 'japanese',
 'english',
 'greek',
 'Arabic',
 'japanese',
 'japanese',
 'english',
 'japanese',
 'greek',
 'Arabic',
 'Arabic',
 'english',
 'Arabic',
 'greek',
 'japanese',
 'Arabic',
 'Arabic',
 'japanese',
 'greek',
 'greek',
 'Arabic',
 'japanese',
 'Arabic',
 'japanese',
 'english',
 'japanese',
 'Arabic',
 'japanese',
 'Arabic',
 'Arabic',
 'japanese',
 'Arabic',
 'japanese',
 'Arabic',
 'english',
 'Arabic',
 'Arabic',
 'Arabic',
 'greek',
 'greek',
 'greek',
 'Arabic',
 'Arabic',
 'japanese',
 'Arabic',
 'greek',
 'Arabic',
 'Arabic',
 'english',
 'Arabic',
 'Arabic',
 'japanese',
 'english',
 'Arabic',
 'japanese',
 'greek',
 'Arabic',
 'Arabic',
 'greek',
 'Arabic',
 'japanese',
 'greek',
 'japanese',
 'Arabic',
 'Arabic',
 'Arabic',
 'Arabic',
 'english',
 'Arabic',
 'greek',
 'Arabic',
 'english',
 'Arabic',
 'Arabic',
 'greek',
 'japanese',
 'Arabic',


In [None]:
# Fraction of test rows whose predicted label matches the true label.
model2.test_accuracy(x_test,y_test)

0.8442004118050789

We created the naive Bayes classifier from scratch, applied it to the test dataset after performing feature extraction with CountVectorizer, and achieved an accuracy of 84.4%.

## The code and concept for this notebook have been adapted from:
https://stackoverflow.com/questions/60969884/multinomial-naive-bayes-for-python-from-scratch