### File read-in

In [20]:
names = []
nationality = []

In [24]:
arabic_names = []
with open('arabic.txt', 'r', encoding = "utf8") as f:
    for line in f:
        an = f.readline().replace('\n', '')
        arabic_names.append(an)
        names.append(an)
        nationality.append('Arabic')
len(set(arabic_names))

500

In [25]:
greek_names = []
with open('greek.txt', 'r', encoding = "utf8") as f:
    for line in f:
        gn = f.readline().replace('\n', '')
        greek_names.append(gn)
        names.append(gn)
        nationality.append('Greek')
len(set(greek_names))

500

In [26]:
japanese_names = []
with open('japan.txt', 'r', encoding = "utf8") as f:
    for line in f:
        jn = f.readline().replace('\n', '')
        japanese_names.append(jn)
        names.append(jn)
        nationality.append('Japanese')
len(set(japanese_names))

439

In [27]:
us_names = []
with open('us.txt', 'r', encoding = "utf8") as f:
    for line in f:
        un = f.readline().replace('\n', '')
        us_names.append(un)
        names.append(un)
        nationality.append('American')
len(set(us_names))

500

### Combining names with nationalities

In [28]:
import pandas as pd
import numpy as np

people_data = pd.DataFrame(list(zip(names, nationality)), columns = ['Name', 'Nationality'])
people_data.sample(n = 10)

Unnamed: 0,Name,Nationality
1728,Eric Wilson,American
513,Ήβη Πίσπα,Greek
1502,Nancy Johns,American
756,Περιστέρα Προύβα,Greek
22,المهندسة جهراء ارناؤوط,Arabic
882,Αδάμ Κρεμμύδας,Greek
397,وفاء بنو شعبة,Arabic
1036,佐々木 直人,Japanese
1089,前田 七夏,Japanese
1442,佐藤 治,Japanese


### Vectorizing the inputs

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

X = people_data['Name']
Y = people_data['Nationality']

vec = CountVectorizer()
X_vec = vec.fit_transform(X)
X_vec.shape

(2000, 2061)

In [30]:
X_vec = X_vec.todense()
X_features = pd.DataFrame(X_vec)
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2051,2052,2053,2054,2055,2056,2057,2058,2059,2060
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
max_el = []
for x in X_features.columns:
    sx = set(X_features[x])
    max_el.append(sx)
len(max_el)

2061

### Splitting into training and testing

In [42]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_features, Y, stratify = Y, test_size = 0.3, shuffle = True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((1400, 2061), (1400,), (600, 2061), (600,))

### Calculating likelihoods

In [43]:
import itertools
import math
from collections import Counter


def likelihood(X, Y, L):

    log_likelihood_dict = {}
    classes = set(Y)

    for fn in X.columns:
        f_dict = {}
        f = set(X[fn])
        l = list(itertools.product(f, classes))
        for item in l:
            f_dict[item] = 0
        t = [(a, b) for a, b in zip(X[fn], Y)]
        tc = dict(Counter(t))
        for d in list(f_dict.keys()):
            if d in list(tc.keys()):
                f_dict[d] = math.log(tc[d]/len(Y), 10)
        log_likelihood_dict[fn] = f_dict

    return log_likelihood_dict

likelihood_probability = likelihood(x_train, y_train, max_el)

### Calculating prior probabilities of each class

In [44]:
def prior_y(C, Y):

    label_count = 0
    for y in Y:
        if y == C:
            label_count += 1
            
    return label_count / len(Y)

prior_class_probability = {}
for y in set(Y):
    prior_class_probability[y] = math.log(prior_y(y, y_train), 10)

### Testing for one sample

In [45]:
x_test = x_test.to_numpy()
y_test = y_test.to_numpy()

In [46]:
test_sample = x_test[1]
test_class = y_test[1]

In [47]:
def make_preds(x_in):

    pred_scores = {}

    for c in set(Y):
        score = 0
        for x in range(len(x_in)):
            attrib_val = x_in[x]
            lpx = likelihood_probability[x]
            #print(likelihood_probability[x], x)
            if (attrib_val, c) in list(lpx.keys()):
                pval = lpx[(attrib_val, c)]
            else:
                pval = 0
            #print((attrib_val, c), "-> ", pval)
            score += pval
        score += prior_class_probability[c]
        pred_scores[score] = c

    final_pred = pred_scores[min(list(pred_scores.keys()))]
    return final_pred

In [48]:
test_pred = make_preds(test_sample)
test_pred

'American'

In [49]:
test_class

'American'

### Testing for all

In [50]:
y_pred = []

for i in x_test:
    y_pred.append(make_preds(i))

In [51]:
acc_list = y_pred == y_test
accuracy = (acc_list.sum()/len(acc_list)) * 100
accuracy

86.16666666666667