# Preliminaries

In [1]:
import re
import operator
import numpy as np

## Loading data in

In [2]:
EN_full = (open("./data/EN.txt","r+")).read() #these are strings
FR_full = (open("./data/FR.txt","r+")).read() 
GR_full = (open("./data/GR.txt","r+")).read() 

lang_id_g = (open("./data/LangID.gold.txt","r+")).readlines() # these are lists
lang_id_t = (open("./data/LangID.test.txt","r+")).readlines()

lang_id_labels = []  # actual test set labels
lang_id_test_letter = []    # test set
regex = re.compile('[^a-zA-Z]')

EN_train_letter = (regex.sub('', EN_full)).lower()
FR_train_letter = (regex.sub('', FR_full)).lower()
GR_train_letter = (regex.sub('', GR_full)).lower()

EN_train_word = (re.sub('[^A-z0-9 -]', ' ', EN_full)).lower().replace('  ', ' ')
FR_train_word = (re.sub('[^A-z0-9 -]', ' ', FR_full)).lower().replace('  ', ' ')
GR_train_word = (re.sub('[^A-z0-9 -]', ' ', GR_full)).lower().replace('  ', ' ')

# regex.sub('', 'ab3d*E')
for i in range(len(lang_id_g)):
    if i == 0:
        continue
    l = regex.sub('', lang_id_g[i])
    lang_id_labels.append(l)
for i in range(len(lang_id_t)):
    s = regex.sub('', lang_id_t[i])
    lang_id_test_letter.append(s.lower())
    
lang_id_test_word = []
for s in lang_id_t:
    s = re.sub('[^A-z0-9 -]', '', s)
    s = re.sub(r'[0-9]+', '', s)
    s = (s.lstrip()).rstrip().lower()
    lang_id_test_word.append(s)

# Question 7

In [3]:
def convert_bigram(text):
    letter_bigram = []
    for i in range(len(text)-n+1):
        letter_bigram.append(text[i:i+n])
    return letter_bigram

In [4]:
n = 2
letter_bigram_en = convert_bigram(EN_train_letter)
letter_bigram_fr = convert_bigram(FR_train_letter)
letter_bigram_gr = convert_bigram(GR_train_letter)

p(c1 | c0) = count(c0+c1) / count(c0)

In [6]:
def calc_probabilities_letter(data):
    counts = {}
    probs = {}
    for let_bigram in data:
        c0 = let_bigram[0]
        c1 = let_bigram[1]
        if let_bigram not in counts:
            counts[let_bigram] = 0
        if c0 not in counts:
            counts[c0] = 0
        if c1 not in counts:
            counts[c1] = 0
        counts[c0] += 1
        counts[c1] += 1
        counts[let_bigram] += 1
    for let_bigram in data:
        probs[let_bigram] = counts[let_bigram]/counts[let_bigram[0]]
    return probs

In [7]:
letter_bi_probs_en = calc_probabilities_letter(letter_bigram_en)
letter_bi_probs_fr = calc_probabilities_letter(letter_bigram_fr)
letter_bi_probs_gr = calc_probabilities_letter(letter_bigram_gr)

In [8]:
letter_bi_probs_en

{'el': 0.02465339739190117,
 'ls': 0.02582534611288605,
 'si': 0.04260009532888465,
 'it': 0.0593594058256934,
 'th': 0.1996068046862462,
 'he': 0.2550204918032787,
 'eu': 0.0017295813315030884,
 'un': 0.0611764705882353,
 'nu': 0.004250968575118382,
 'us': 0.06143790849673202,
 'su': 0.019184938036224976,
 'ua': 0.011895424836601307,
 'al': 0.034953234982161796,
 'lf': 0.010117145899893504,
 'fa': 0.05341246290801187,
 'ar': 0.05168257641500337,
 'rm': 0.015839556733317273,
 'mm': 0.00982256020278834,
 'ma': 0.09188846641318124,
 'ai': 0.021164786423681418,
 'id': 0.027039572937217128,
 'dj': 0.0015291688967046411,
 'je': 0.11513157894736842,
 'er': 0.07330130404941661,
 're': 0.12039267646350277,
 'em': 0.016554564172958135,
 'mi': 0.05307351077313054,
 'ia': 0.00800742717883254,
 'as': 0.05679298042618841,
 'sg': 0.002859866539561487,
 'go': 0.053286658887592375,
 'ot': 0.03893338740748251,
 'tt': 0.028847696998876585,
 'fs': 0.006923837784371909,
 'sw': 0.021151096282173498,
 'wi':

In [9]:
def test(test, en_probs, fr_probs, gr_probs, bigram_method):
    langs = []
    for sent in test:
        bigram = bigram_method(sent)
        en_ps = []
        fr_ps = []
        gr_ps = []
        for b in bigram:
            try:
                en_ps.append(en_probs[b])
            except:
                pass
            try:
                fr_ps.append(fr_probs[b])
            except:
                pass
            try:
                gr_ps.append(gr_probs[b])
            except:
                pass
        if len(en_ps) == 0:
            en_p = 0
        else:
            en_p = np.prod(en_ps)
        if len(fr_ps) == 0:
            fr_p = 0
        else:
            fr_p = np.prod(fr_ps)
        if len(gr_ps) == 0:
            gr_p = 0
        else:
            gr_p = np.prod(gr_ps)
        p_list = {"EN": en_p, "FR": fr_p, "GR": gr_p}
        p_list = sorted(p_list.items(), key=operator.itemgetter(1), reverse=True)
        langs.append(p_list[0][0])
    return langs

In [10]:
def calc_accuracy(labels, results):
    pos = 0
    for l,r in zip(labels, results):
        if l == r:
            pos += 1
    return (pos/len(labels)) * 100.0

In [11]:
test_results = test(lang_id_test_letter, letter_bi_probs_en, letter_bi_probs_fr, letter_bi_probs_gr, convert_bigram)

In [12]:
def output_lang(results):
    i = 1
    print("ID\tLANG")
    for r in results:
        print(str(i) + ".\t" + r)
        i += 1

In [13]:
calc_accuracy(lang_id_labels, test_results)

89.33333333333333

In [14]:
output_lang(test_results)

ID	LANG
1.	EN
2.	GR
3.	FR
4.	EN
5.	EN
6.	FR
7.	FR
8.	FR
9.	EN
10.	GR
11.	EN
12.	GR
13.	GR
14.	FR
15.	FR
16.	EN
17.	EN
18.	EN
19.	FR
20.	GR
21.	GR
22.	GR
23.	GR
24.	EN
25.	FR
26.	FR
27.	FR
28.	EN
29.	EN
30.	FR
31.	GR
32.	FR
33.	FR
34.	GR
35.	GR
36.	GR
37.	GR
38.	FR
39.	FR
40.	GR
41.	GR
42.	EN
43.	FR
44.	GR
45.	GR
46.	FR
47.	GR
48.	GR
49.	GR
50.	GR
51.	GR
52.	GR
53.	FR
54.	FR
55.	GR
56.	EN
57.	FR
58.	EN
59.	GR
60.	EN
61.	GR
62.	FR
63.	EN
64.	EN
65.	FR
66.	EN
67.	GR
68.	EN
69.	FR
70.	GR
71.	GR
72.	GR
73.	GR
74.	GR
75.	GR
76.	EN
77.	EN
78.	FR
79.	FR
80.	FR
81.	FR
82.	FR
83.	FR
84.	FR
85.	FR
86.	GR
87.	FR
88.	FR
89.	FR
90.	FR
91.	FR
92.	FR
93.	FR
94.	FR
95.	FR
96.	FR
97.	FR
98.	EN
99.	EN
100.	FR
101.	FR
102.	GR
103.	GR
104.	EN
105.	EN
106.	EN
107.	GR
108.	FR
109.	FR
110.	FR
111.	EN
112.	EN
113.	EN
114.	EN
115.	FR
116.	GR
117.	EN
118.	GR
119.	GR
120.	FR
121.	EN
122.	GR
123.	FR
124.	GR
125.	GR
126.	EN
127.	FR
128.	EN
129.	EN
130.	FR
131.	EN
132.	EN
133.	EN
134.	EN
135.	EN
136.	FR
137.	FR
138.

# Question 8

In [15]:
def convert_word_bigram(data):
    bigrams = []
    arr = data.split()
    for i in range(len(arr) - 1):# zip(arr, arr[1:]):
        bigrams.append((arr[i], arr[i + 1]))
    return bigrams

In [16]:
word_bigram_en = convert_word_bigram(EN_train_word)
word_bigram_fr = convert_word_bigram(FR_train_word)
word_bigram_gr = convert_word_bigram(GR_train_word)

### Probabilities + Add-One (Laplace) Smoothing

p(w1 | w0) = count(w0+c1) + 1 / count(w0) + V

V = count for all words in the vocabulary

In [17]:
def calc_probabilities_word(data):
    counts = {}
    probs = {}
    V = 0
    words = {}
    for w_bigram in data:
        w0 = w_bigram[0]
        w1 = w_bigram[1]
        if w_bigram not in counts:
            counts[w_bigram] = 0
        if w0 not in words:
            words[w0] = 0
        if w1 not in words:
            words[w1] = 0

        counts[w_bigram] += 1
        words[w0] += 1
        words[w1] += 1
    V = len(words)
    max_v = 0.0
    gr = None
    for w_bigram in data:
        probs[w_bigram] = (counts[w_bigram] + 1) / float(words[w_bigram[0]] + V)
        if probs[w_bigram] > max_v:
            max_v = probs[w_bigram]
            gr = w_bigram
    print(gr, max_v)
    return probs

In [18]:
word_bi_probs_en = calc_probabilities_word(word_bigram_en)
word_bi_probs_fr = calc_probabilities_word(word_bigram_fr)
word_bi_probs_gr = calc_probabilities_word(word_bigram_gr)

('of', 'the') 0.045192958700067704
('de', 'la') 0.029160739687055477
('in', 'der') 0.009577015163607342


In [19]:
word_bi_probs_gr

{('elsi', 'die'): 0.0003475842891901286,
 ('die', 'seltsame'): 0.00027766208524226016,
 ('seltsame', 'magd'): 0.00035887313834559486,
 ('magd', 'jeremias'): 0.0003577177606868181,
 ('jeremias', 'gotthelf'): 0.0003591309032142216,
 ('gotthelf', 'reich'): 0.0003591309032142216,
 ('reich', 'an'): 0.00035835871707579287,
 ('an', 'sch'): 0.0003383522246658772,
 ('sch', 'nen'): 0.005684754521963824,
 ('nen', 't'): 0.00035366931918656057,
 ('t', 'lern'): 0.0006979584714709475,
 ('lern', 'ist'): 0.00035887313834559486,
 ('ist', 'die'): 0.0012050266827336891,
 ('die', 'schweiz'): 0.00027766208524226016,
 ('schweiz', 'wer'): 0.0003591309032142216,
 ('wer', 'z'): 0.0003546728143287817,
 ('z', 'hlte'): 0.0005352363960749331,
 ('hlte', 'sie'): 0.0012511170688114387,
 ('sie', 'wohl'): 0.0003023431594860166,
 ('wohl', 'auf'): 0.0003550505947097461,
 ('auf', '-'): 0.00033129037601457677,
 ('-', 'in'): 0.00035810205908683976,
 ('in', 'keinem'): 0.0003192338387869114,
 ('keinem', 'lehrbuch'): 0.00035861

In [20]:
word_bi_probs_en[('good', 'reason')]

0.000679040289723857

In [21]:
test_word = test(lang_id_test_word, word_bi_probs_en, word_bi_probs_fr, word_bi_probs_gr, convert_word_bigram)

In [22]:
calc_accuracy(lang_id_labels, test_word)

84.66666666666667

In [23]:
output_lang(test_word)

ID	LANG
1.	EN
2.	EN
3.	EN
4.	EN
5.	EN
6.	FR
7.	FR
8.	FR
9.	FR
10.	GR
11.	FR
12.	EN
13.	GR
14.	EN
15.	EN
16.	EN
17.	EN
18.	EN
19.	FR
20.	GR
21.	GR
22.	EN
23.	GR
24.	EN
25.	FR
26.	FR
27.	FR
28.	EN
29.	FR
30.	FR
31.	GR
32.	FR
33.	FR
34.	GR
35.	EN
36.	GR
37.	GR
38.	FR
39.	FR
40.	GR
41.	EN
42.	EN
43.	FR
44.	GR
45.	GR
46.	FR
47.	GR
48.	GR
49.	GR
50.	GR
51.	GR
52.	GR
53.	FR
54.	FR
55.	GR
56.	EN
57.	EN
58.	GR
59.	GR
60.	EN
61.	EN
62.	EN
63.	EN
64.	EN
65.	FR
66.	EN
67.	GR
68.	EN
69.	FR
70.	GR
71.	GR
72.	GR
73.	GR
74.	EN
75.	GR
76.	FR
77.	FR
78.	EN
79.	EN
80.	FR
81.	EN
82.	FR
83.	FR
84.	FR
85.	FR
86.	FR
87.	FR
88.	FR
89.	FR
90.	FR
91.	EN
92.	FR
93.	FR
94.	FR
95.	FR
96.	FR
97.	FR
98.	EN
99.	EN
100.	FR
101.	FR
102.	GR
103.	GR
104.	EN
105.	EN
106.	EN
107.	EN
108.	FR
109.	EN
110.	FR
111.	EN
112.	EN
113.	FR
114.	EN
115.	FR
116.	GR
117.	EN
118.	GR
119.	GR
120.	FR
121.	EN
122.	EN
123.	FR
124.	GR
125.	GR
126.	EN
127.	EN
128.	EN
129.	EN
130.	EN
131.	FR
132.	EN
133.	EN
134.	EN
135.	EN
136.	FR
137.	FR
138.

# Question 9

### Probabilities + Good Turing Smoothing

Very good explanation: https://www.youtube.com/watch?v=GwP8gKa-ij8

In [206]:
def turing_probs(counts, ns):
    N = sum(counts.values())
    turing = {}
    for c in counts:
        count = counts[c]
        try:
            NC1 = ns[count + 1]
        except:
            NC1 = 0
        NC = ns[count]
        turing[c] = (((count + 1) * NC1)/NC)
    return turing

In [207]:
def calc_probabilities_word_gt(data):
    counts = {}
    for w_bigram in data:
        if w_bigram not in counts:
            counts[w_bigram] = 0
        counts[w_bigram] += 1
    return counts

In [208]:
def count_ns(data):
    ns = {}
    for bigram, count in data.items():
        if count not in ns:
            ns[count] = 0
        ns[count] += 1
    return ns

In [209]:
word_gt_probs_en = calc_probabilities_word_gt(word_bigram_en)
word_gt_probs_fr = calc_probabilities_word_gt(word_bigram_fr)
word_gt_probs_gr = calc_probabilities_word_gt(word_bigram_gr)

In [210]:
word_gt_probs_fr

{('honor', 'de'): 1,
 ('de', 'balzac'): 1,
 ('balzac', 'la'): 1,
 ('la', 'fille'): 4,
 ('fille', 'aux'): 4,
 ('aux', 'yeux'): 12,
 ('yeux', 'd'): 12,
 ('d', 'or'): 9,
 ('or', '1834-35'): 1,
 ('1834-35', 'chapitre'): 1,
 ('chapitre', 'physionomies'): 1,
 ('physionomies', 'parisiennes'): 1,
 ('parisiennes', 'un'): 1,
 ('un', 'des'): 8,
 ('des', 'spectacles'): 1,
 ('spectacles', 'o'): 1,
 ('o', 'se'): 6,
 ('se', 'rencontre'): 4,
 ('rencontre', 'le'): 1,
 ('le', 'plus'): 17,
 ('plus', 'd'): 5,
 ('d', 'pouvantement'): 1,
 ('pouvantement', 'est'): 1,
 ('est', 'certes'): 1,
 ('certes', 'l'): 1,
 ('l', 'aspect'): 2,
 ('aspect', 'g'): 1,
 ('g', 'n'): 53,
 ('n', 'ral'): 23,
 ('ral', 'de'): 5,
 ('de', 'la'): 286,
 ('la', 'population'): 6,
 ('population', 'parisienne'): 2,
 ('parisienne', 'peuple'): 1,
 ('peuple', 'horrible'): 1,
 ('horrible', 'voir'): 1,
 ('voir', 'h'): 1,
 ('h', 've'): 1,
 ('ve', 'jaune'): 1,
 ('jaune', 'tann'): 1,
 ('tann', 'paris'): 1,
 ('paris', 'n'): 2,
 ('n', 'est-il'): 4,


In [211]:
ns_en = count_ns(word_gt_probs_en)
ns_fr = count_ns(word_gt_probs_fr)
ns_gr = count_ns(word_gt_probs_gr)

In [212]:
turing_en = turing_probs(word_gt_probs_en, ns_en)
turing_fr = turing_probs(word_gt_probs_fr, ns_fr)
turing_gr = turing_probs(word_gt_probs_gr, ns_gr)

In [219]:
def gt_unseen_calc(counts, ns):
    N = sum(counts.values())
    N1 = ns[1]
    p_unseen = N1/N
    print(p_unseen)
    return p_unseen

In [220]:
en_unseen = gt_unseen_calc(word_gt_probs_en, ns_en)
fr_unseen = gt_unseen_calc(word_gt_probs_fr, ns_fr)
gr_unseen = gt_unseen_calc(word_gt_probs_gr, ns_gr)

0.5386257073565077
0.5282321626999509
0.6366572466693581


In [221]:
turing_en

{('elsi', 'the'): 0.984779299847793,
 ('the', 'unusual'): 0.23665726121150268,
 ('unusual', 'farm'): 0.23665726121150268,
 ('farm', 'maid'): 2.8459119496855347,
 ('maid', 'jeremias'): 0.23665726121150268,
 ('jeremias', 'gotthelf'): 0.23665726121150268,
 ('gotthelf', 'switzerland'): 0.23665726121150268,
 ('switzerland', 'is'): 0.23665726121150268,
 ('is', 'so'): 1.9659969088098919,
 ('so', 'bountifully'): 0.23665726121150268,
 ('bountifully', 'blessed'): 0.23665726121150268,
 ('blessed', 'with'): 0.23665726121150268,
 ('with', 'beautiful'): 0.23665726121150268,
 ('beautiful', 'valleys'): 0.23665726121150268,
 ('valleys', 'that'): 0.23665726121150268,
 ('that', 'no'): 3.4806629834254146,
 ('no', 'one'): 0.0,
 ('one', 'could'): 8.076923076923077,
 ('could', 'possibly'): 0.23665726121150268,
 ('possibly', 'name'): 0.23665726121150268,
 ('name', 'them'): 0.23665726121150268,
 ('them', 'all'): 0.23665726121150268,
 ('all', 'no'): 0.23665726121150268,
 ('no', 'schoolbook'): 0.2366572612115026

In [222]:
def test_gturing(test, en_probs, en_unseen, fr_probs, fr_unseen, gr_probs, gr_unseen):
    langs = []
    for sent in test:
        bigram = convert_word_bigram(sent)
        en_ps = []
        fr_ps = []
        gr_ps = []
        for b in bigram:
            try:
                en_ps.append(en_probs[b])
            except:
                en_ps.append(en_unseen)
            try:
                fr_ps.append(fr_probs[b])
            except:
                fr_ps.append(fr_unseen)
            try:
                gr_ps.append(gr_probs[b])
            except:
                gr_ps.append(gr_unseen)
        en_p = np.prod(en_ps)
        fr_p = np.prod(fr_ps)
        gr_p = np.prod(gr_ps)
        p_list = {"EN": en_p, "FR": fr_p, "GR": gr_p}
        p_list = sorted(p_list.items(), key=operator.itemgetter(1), reverse=True)
        print(p_list)
        langs.append(p_list[0][0])
    return langs

In [223]:
test_gt_t = test_gturing(lang_id_test_word, turing_en, en_unseen, turing_fr, fr_unseen, turing_gr, gr_unseen)

[('EN', 0.5439051949889717), ('GR', 0.16429439480947974), ('FR', 0.077857304336227)]
[('GR', 0.40533244973660787), ('EN', 0.2901176526252983), ('FR', 0.27902921771066735)]
[('GR', 0.06659384952611985), ('EN', 0.024418695801653335), ('FR', 0.021724462721998773)]
[('EN', 0.4037921388612982), ('GR', 0.0001197053979781182), ('FR', 2.8608613799859766e-06)]
[('GR', 0.017185065061561332), ('EN', 0.006976466780634762), ('FR', 0.0032020165092606057)]
[('GR', 0.000463870414913334), ('FR', 6.565663160993683e-05), ('EN', 2.7032133058087e-05)]
[('FR', 7.874155570768634e-05), ('GR', 1.3699320807785903e-07), ('EN', 3.9359331213358743e-10)]
[('FR', 0.06090588269503242), ('GR', 0.0001880217316371584), ('EN', 7.842498988266927e-06)]
[('GR', 5.5527792627475455e-08), ('FR', 1.0229230275333399e-10), ('EN', 0.0)]
[('GR', 0.0018697517295481545), ('EN', 0.0005962727046536822), ('FR', 0.00047195228055951435)]
[('GR', 5.308624117602565e-07), ('FR', 5.667060202388414e-10), ('EN', 0.0)]
[('GR', 0.0044347407947074

In [224]:
calc_accuracy(lang_id_labels, test_gt_t)

56.666666666666664

In [156]:
output_lang(test_gt_t)

ID	LANG
1.	EN
2.	GR
3.	GR
4.	EN
5.	GR
6.	GR
7.	FR
8.	FR
9.	GR
10.	GR
11.	GR
12.	GR
13.	GR
14.	GR
15.	EN
16.	EN
17.	EN
18.	GR
19.	GR
20.	GR
21.	GR
22.	GR
23.	GR
24.	EN
25.	FR
26.	FR
27.	GR
28.	GR
29.	GR
30.	FR
31.	GR
32.	GR
33.	GR
34.	GR
35.	GR
36.	GR
37.	GR
38.	GR
39.	GR
40.	GR
41.	GR
42.	GR
43.	FR
44.	GR
45.	GR
46.	FR
47.	GR
48.	GR
49.	GR
50.	EN
51.	EN
52.	GR
53.	FR
54.	FR
55.	GR
56.	EN
57.	GR
58.	GR
59.	GR
60.	GR
61.	GR
62.	GR
63.	EN
64.	GR
65.	GR
66.	EN
67.	GR
68.	GR
69.	FR
70.	GR
71.	GR
72.	GR
73.	GR
74.	GR
75.	GR
76.	EN
77.	EN
78.	GR
79.	EN
80.	GR
81.	GR
82.	GR
83.	GR
84.	GR
85.	GR
86.	FR
87.	FR
88.	GR
89.	GR
90.	GR
91.	GR
92.	GR
93.	GR
94.	GR
95.	GR
96.	GR
97.	FR
98.	GR
99.	EN
100.	GR
101.	GR
102.	GR
103.	GR
104.	GR
105.	GR
106.	EN
107.	GR
108.	GR
109.	GR
110.	FR
111.	GR
112.	GR
113.	GR
114.	EN
115.	GR
116.	EN
117.	EN
118.	GR
119.	GR
120.	FR
121.	GR
122.	GR
123.	FR
124.	GR
125.	GR
126.	EN
127.	GR
128.	EN
129.	GR
130.	GR
131.	GR
132.	EN
133.	EN
134.	GR
135.	GR
136.	GR
137.	GR
138.

# Question 10

In [106]:
def convert_word_trigram(data):
    bigrams = []
    arr = data.split()
    for i in range(len(arr) - 2):# zip(arr, arr[1:]):
        bigrams.append((arr[i], arr[i + 1], arr[i + 2]))
    return bigrams

In [107]:
word_trigram_en = convert_word_trigram(EN_train_word)
word_trigram_fr = convert_word_trigram(FR_train_word)
word_trigram_gr = convert_word_trigram(GR_train_word)