In [62]:
import pandas as pd

def read_train_file(train_file_path, output_path=None, encoding="utf-8"):
    """Load a ``word tag`` training corpus into a two-column DataFrame.

    Each line of the input is expected to hold exactly two fields separated by
    a single space. Any line that does not split into exactly two fields
    (in practice: the blank lines between sentences) is recorded as a sentence
    boundary, i.e. word ``" "`` with tag ``"<S>"``. One extra boundary row is
    placed at the very top so the first real token also has a predecessor.

    Parameters
    ----------
    train_file_path : str or path-like
        Location of the training corpus.
    output_path : str or path-like, optional
        When given, the resulting frame is also written there as CSV
        (without the index). When ``None`` nothing is written.
    encoding : str, default "utf-8"
        Text encoding used to decode the corpus. Passed explicitly so the
        result does not depend on the platform's default locale.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``tag`` with a fresh 0..n-1 index.
    """
    with open(train_file_path, "r", encoding=encoding) as f:
        lines = f.read().splitlines()

    # Seed both columns with the start-of-corpus boundary marker.
    words, tags = [" "], ["<S>"]
    for line in lines:
        fields = line.split(" ")
        if len(fields) == 2:
            words.append(fields[0])
            tags.append(fields[1])
        else:
            # Blank (or otherwise malformed) line: treat as a sentence boundary.
            words.append(" ")
            tags.append("<S>")

    df = pd.DataFrame({"word": words, "tag": tags})
    if output_path is not None:
        df.to_csv(output_path, index=False)
    return df

In [59]:
x = read_train_file('/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/Train.txt')
x.head()

Unnamed: 0,word,tag
0,,<S>
1,ميشوند,V
2,.,DELM
3,,<S>
4,نتيجهي,N


In [224]:
df.tag.value_counts()

N          109310
P           32580
ADJ         29832
DELM        25844
V           22327
CON         22094
<S>          8722
PRO          5714
DET          4130
ADV          3646
QUA          1820
AR           1175
IF            396
SPEC          350
MS            198
MORP          132
PP             85
MQUA           81
PS             31
DEFAULT        20
OH             12
NP             10
OHH             5
INT             2
Name: tag, dtype: int64

In [4]:
df.shape

(268516, 2)

In [215]:
tag_count

{'N': 109310,
 'P': 32580,
 'ADJ': 29832,
 'DELM': 25844,
 'V': 22327,
 'CON': 22094,
 '<S>': 8722,
 'PRO': 5714,
 'DET': 4130,
 'ADV': 3646,
 'QUA': 1820,
 'AR': 1175,
 'IF': 396,
 'SPEC': 350,
 'MS': 198,
 'MORP': 132,
 'PP': 85,
 'MQUA': 81,
 'PS': 31,
 'DEFAULT': 20,
 'OH': 12,
 'NP': 10,
 'OHH': 5,
 'INT': 2}

In [8]:
len(tag_count)

24

In [9]:
tag_seq = df.tag.to_list()
len(tag_seq)

268516

In [None]:
tag_count = df.tag.value_counts().to_dict()
all_tags = sorted(tag_count.keys())

In [97]:
import numpy as np
def create_transition_matrix(df, alpha): # df : tag_word_dataframe
    """Estimate add-alpha smoothed tag-bigram probabilities.

    Entry ``[i, j]`` of the intermediate square matrix is
    ``(count(tag_i -> tag_j) + alpha) / (count(tag_i) + alpha * n_tags)``,
    with tags ordered alphabetically. Column 0 is then dropped, so the
    returned array has shape ``(n_tags, n_tags - 1)``.

    NOTE: dropping column 0 removes the ``<S>`` column only because ``<S>``
    sorts before the other tags in this corpus.
    """
    tag_count = df.tag.value_counts().to_dict()
    all_tags = sorted(tag_count.keys())
    n_tags = len(all_tags)
    position = {tag: idx for idx, tag in enumerate(all_tags)}

    # Count every adjacent (previous tag, next tag) pair in corpus order.
    tag_seq = df.tag.to_list()
    bigram_counts = np.zeros((n_tags, n_tags))
    for prev_tag, next_tag in zip(tag_seq, tag_seq[1:]):
        bigram_counts[position[prev_tag], position[next_tag]] += 1

    # Row-wise normalisation by the unigram count of the previous tag.
    prev_totals = np.array([tag_count[tag] for tag in all_tags], dtype=float).reshape(-1, 1)
    smoothed = (bigram_counts + alpha) / (prev_totals + alpha * n_tags)

    # Drop the first column (transitions *into* <S>).
    return np.delete(smoothed, 0, axis=1)

In [13]:
x = create_transition_matrix(df)
x.shape

(24, 24)

In [90]:
# np.save('/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/transition_matrix.npy', transition_matrix)

In [26]:
tags_df1 = pd.DataFrame(x, columns = list(all_tags1), index=list(all_tags))
display(tags_df1)

Unnamed: 0,ADJ,ADV,AR,CON,DEFAULT,DELM,DET,IF,INT,MORP,...,NP,OH,OHH,P,PP,PRO,PS,QUA,SPEC,V
<S>,0.021096,0.038867,0.000344,0.104449,0.00149,0.082091,0.043797,0.012153,0.000115,0.0,...,0.0,0.0,0.000115,0.175533,0.000917,0.039555,0.000229,0.011121,0.001032,0.007108
ADJ,0.085981,0.011732,0.0,0.115581,3.4e-05,0.080853,0.006604,0.000503,0.0,0.000302,...,0.0,0.0,0.0,0.189528,0.000201,0.0178,0.0,0.003553,0.000402,0.233072
ADV,0.172518,0.02249,0.0,0.027153,0.0,0.041141,0.018925,0.005485,0.0,0.0,...,0.0,0.0,0.0,0.213933,0.000274,0.024136,0.0,0.007131,0.000549,0.065551
AR,0.001702,0.0,0.797447,0.012766,0.0,0.175319,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000851,0.0,0.000851,0.000851,0.0,0.0,0.001702
CON,0.093012,0.038744,0.000407,0.039966,0.0,0.012311,0.024305,0.006427,0.0,0.0,...,4.5e-05,9.1e-05,4.5e-05,0.183987,0.000498,0.024803,0.000136,0.017742,0.000996,0.026885
DEFAULT,0.0,0.05,0.0,0.05,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05,0.0,0.1,0.0,0.0,0.0,0.0
DELM,0.033315,0.018341,0.0077,0.057924,7.7e-05,0.043182,0.008513,0.002554,3.9e-05,3.9e-05,...,0.0,0.000271,7.7e-05,0.061407,0.00031,0.009325,0.000929,0.004721,0.00031,0.013581
DET,0.002906,0.0,0.0,0.012833,0.0,0.000969,0.000242,0.0,0.0,0.0,...,0.0,0.0,0.0,0.002906,0.001453,0.001211,0.0,0.000242,0.019128,0.008717
IF,0.055556,0.020202,0.0,0.010101,0.0,0.002525,0.042929,0.002525,0.0,0.0,...,0.0,0.0,0.0,0.136364,0.0,0.025253,0.0,0.022727,0.002525,0.060606
INT,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
tags_df = pd.DataFrame(transition_matrix, columns = list(all_tags), index=list(all_tags))
display(tags_df)

Unnamed: 0,<S>,ADJ,ADV,AR,CON,DEFAULT,DELM,DET,IF,INT,...,NP,OH,OHH,P,PP,PRO,PS,QUA,SPEC,V
<S>,0.0,0.021096,0.038867,0.000344,0.104449,0.00149,0.082091,0.043797,0.012153,0.000115,...,0.0,0.0,0.000115,0.175533,0.000917,0.039555,0.000229,0.011121,0.001032,0.007108
ADJ,0.0,0.085981,0.011732,0.0,0.115581,3.4e-05,0.080853,0.006604,0.000503,0.0,...,0.0,0.0,0.0,0.189528,0.000201,0.0178,0.0,0.003553,0.000402,0.233072
ADV,0.0,0.172518,0.02249,0.0,0.027153,0.0,0.041141,0.018925,0.005485,0.0,...,0.0,0.0,0.0,0.213933,0.000274,0.024136,0.0,0.007131,0.000549,0.065551
AR,0.0,0.001702,0.0,0.797447,0.012766,0.0,0.175319,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000851,0.0,0.000851,0.000851,0.0,0.0,0.001702
CON,0.0,0.093012,0.038744,0.000407,0.039966,0.0,0.012311,0.024305,0.006427,0.0,...,4.5e-05,9.1e-05,4.5e-05,0.183987,0.000498,0.024803,0.000136,0.017742,0.000996,0.026885
DEFAULT,0.0,0.0,0.05,0.0,0.05,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05,0.0,0.1,0.0,0.0,0.0,0.0
DELM,0.337448,0.033315,0.018341,0.0077,0.057924,7.7e-05,0.043182,0.008513,0.002554,3.9e-05,...,0.0,0.000271,7.7e-05,0.061407,0.00031,0.009325,0.000929,0.004721,0.00031,0.013581
DET,0.0,0.002906,0.0,0.0,0.012833,0.0,0.000969,0.000242,0.0,0.0,...,0.0,0.0,0.0,0.002906,0.001453,0.001211,0.0,0.000242,0.019128,0.008717
IF,0.0,0.055556,0.020202,0.0,0.010101,0.0,0.002525,0.042929,0.002525,0.0,...,0.0,0.0,0.0,0.136364,0.0,0.025253,0.0,0.022727,0.002525,0.060606
INT,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
import operator

def extract_vocab(df):
    """Return the sorted vocabulary used by the tagger.

    The vocabulary contains every word of ``df.word`` that occurs more than
    once, plus the placeholder ``"UNK"`` that stands in for all words seen
    exactly once (and, later, for unseen words). The sentence-boundary marker
    ``" "`` is never part of the vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``word`` column.

    Returns
    -------
    list
        Vocabulary entries in ascending sort order, without duplicates.
    """
    word_count = df.word.value_counts().to_dict()
    # Boundary marker may be absent (e.g. a corpus without blank lines);
    # pop with a default instead of `del` so that case does not raise.
    word_count.pop(" ", None)

    # Words seen exactly once are folded into the single UNK entry.
    vocab = {word for word, count in word_count.items() if count != 1}
    vocab.add("UNK")
    return sorted(vocab)

In [64]:
vocab_counts = extract_vocab(df)

In [70]:
# vocab_counts
vocabs = sorted(vocab_counts.keys())
len(vocabs)

10218

In [74]:
vocab = extract_vocab(df)
# vocab[0]
import pickle

with open("/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/vocabs", "wb") as f:   #Pickling
    pickle.dump(vocab, f)

In [75]:
with open("/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/vocabs", "rb") as f:   # Unpickling
    lexicon = pickle.load(f)

In [56]:
count_vocab = len(new_word_count)
tags = len(all_tags)
print(f'count vocab : {count_vocab}')
print(f'tags len : {tags}')

count vocab : 10218
tags len : 24


# observation matrix

In [95]:
def create_observation_matrix(df, alpha, vocabs=None):
    """Estimate add-alpha smoothed emission probabilities P(word | tag).

    For every vocabulary entry ``w`` and tag ``t`` the value is
    ``(count(w, t) + alpha) / (count(t) + alpha * len(vocabs))``. Words that
    are not in the vocabulary are counted under ``"UNK"``; rows whose word is
    the boundary marker ``" "`` are ignored. Tags are ordered alphabetically
    and the first tag column is dropped before transposing.

    NOTE: dropping column 0 removes the ``<S>`` column only because ``<S>``
    sorts before the other tags in this corpus.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``word`` and ``tag`` columns.
    alpha : float
        Additive smoothing constant.
    vocabs : sequence of str, optional
        Vocabulary (unique entries, including ``"UNK"``) defining the column
        order of the result. When omitted it is derived with
        ``extract_vocab(df)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_tags - 1, len(vocabs))``.
    """
    if vocabs is None:
        vocabs = extract_vocab(df)
    vocabs = list(vocabs)

    tag_count = df.tag.value_counts().to_dict()
    all_tags = sorted(tag_count.keys())

    # Hash lookups instead of repeated list scans: the corpus has hundreds of
    # thousands of rows and the vocabulary thousands of entries.
    row_of = {word: row for row, word in enumerate(vocabs)}
    col_of = {tag: col for col, tag in enumerate(all_tags)}

    counts = np.zeros((len(vocabs), len(all_tags)))
    for word, tag in zip(df.word.to_list(), df.tag.to_list()):
        if word == " ":  # sentence-boundary rows carry no observation
            continue
        row = row_of.get(word)
        if row is None:
            row = row_of["UNK"]  # out-of-vocabulary word
        counts[row, col_of[tag]] += 1

    tag_totals = np.array([tag_count[tag] for tag in all_tags], dtype=float)
    observation_matrix = (counts + alpha) / (tag_totals + alpha * len(vocabs))

    # Drop the first tag column (<S>) and put tags on the rows.
    observation_matrix = np.delete(observation_matrix, 0, axis=1)
    return observation_matrix.T

In [39]:
x = create_observation_matrix(df, vocab_counts, 0)

In [54]:
__df = pd.DataFrame(observation_matrix, columns = list(all_tags1), index=list(vocabs))
display(__df)

Unnamed: 0,ADJ,ADV,AR,CON,DEFAULT,DELM,DET,IF,INT,MORP,...,NP,OH,OHH,P,PP,PRO,PS,QUA,SPEC,V
!,0.000000,0.0,0.0,0.0,0.00,0.002476,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
"""",0.000000,0.0,0.0,0.0,0.00,0.027086,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
#,0.000000,0.0,0.0,0.0,0.60,0.033741,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
%,0.000000,0.0,0.0,0.0,0.00,0.002670,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
(,0.000000,0.0,0.0,0.0,0.05,0.065508,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
‏در,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000092,0.0,0.0,0.0,0.0,0.0,0.0
‏مربوط,0.000067,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
“,0.000000,0.0,0.0,0.0,0.00,0.000426,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
”,0.000000,0.0,0.0,0.0,0.00,0.000464,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
# np.save('/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/observation_matrix.npy', observation_matrix)

TODO:
- elimination for both matrices
- smoothing
- calculate pi
- Viterbi algorithm
- test data?

In [63]:
def initial_probabilities(transition_matrix):
    """Return the first row of ``transition_matrix``.

    The Viterbi functions in this notebook use that row as the initial
    state distribution (pi).
    """
    first_row = transition_matrix[0, :]
    return first_row

In [99]:
def read_test_file(train_file_path, encoding="utf-8"):
    """Read a ``word tag`` file into two parallel lists.

    Lines that do not split into exactly two space-separated fields (the
    blank lines between sentences) are skipped, so the result is one flat
    token stream without boundary markers.

    Parameters
    ----------
    train_file_path : str or path-like
        Location of the file to read (the name is kept for backward
        compatibility; the notebook passes the test corpus here).
    encoding : str, default "utf-8"
        Text encoding used to decode the file. Passed explicitly so the
        result does not depend on the platform's default locale.

    Returns
    -------
    tuple of (list, list)
        ``(words, tags)`` with one entry per well-formed line.
    """
    with open(train_file_path, "r", encoding=encoding) as f:
        lines = f.read().splitlines()

    words, tags = [], []
    for line in lines:
        fields = line.split(" ")
        if len(fields) != 2:
            continue  # blank / malformed line: not a token
        words.append(fields[0])
        tags.append(fields[1])
    return words, tags

In [69]:
vocab

NameError: name 'vocab' is not defined

In [79]:
vocab.index("UNK")

376

In [86]:
def vocab_index_observv(o, t, vocab):
    """Return the position in ``vocab`` of the observation ``o[t]``.

    If ``o[t]`` is not present in ``vocab``, the position of ``"UNK"`` is
    returned instead.
    """
    token = o[t]
    try:
        return vocab.index(token)
    except ValueError:
        # Token is not in the vocabulary: fall back to the UNK entry.
        return vocab.index("UNK")

In [87]:

vocab_index_observv(o, 0, vocab)

376

In [90]:
x = np.zeros((2, 3))
x[1, 2]

0.0

In [88]:
x = float('-inf')
-1 > x

True

In [91]:
tag_count = df.tag.value_counts().to_dict()
all_tags = sorted(tag_count.keys())
all_tags.remove("<S>")

In [111]:
observation_matrix.shape

(23, 10218)

In [154]:
x = np.ones((3, 4))
y = x[0, :]
y.shape

(4,)

In [155]:
b = np.ones((4, 4))*2

In [156]:
m = y *  b[:, 0]
m.shape

(4,)

In [173]:
viterb = np.zeros((4, 4))
transition_matri = np.zeros((4, 4))
observation_matri = np.zeros((4, 4))

In [175]:
z = viterb[:, -1] * transition_matri[:, 0] * observation_matri[0,1]

In [176]:
np.argmax(z)

0

In [193]:
def viterbi(observ, observation_matrix, transition_matrix, vocab):
    """Decode the most probable state sequence with the Viterbi algorithm.

    Works directly on probabilities, so it underflows to all zeros on long
    inputs; use ``viterbi_log`` for those.

    Parameters
    ----------
    observ : sequence of str
        Observed words.
    observation_matrix : array-like, shape (N, V)
        ``observation_matrix[s, v]`` is the emission probability of vocabulary
        entry ``v`` in state ``s``.
    transition_matrix : array-like, shape (N + 1, N)
        Row 0 is used as the initial distribution; rows ``1..N`` hold the
        state-to-state transition probabilities ``[from, to]``.
    vocab : sequence of str
        Vocabulary giving the column order of ``observation_matrix``. Words
        not found in it are mapped to the ``"UNK"`` entry (same rule as
        ``vocab_index_observv``).

    Returns
    -------
    bestpath : numpy.ndarray of int, shape (T,)
        Most probable state index for each observation, in input order.
    prob : float
        Probability of that path (``1.0`` for an empty input).

    Raises
    ------
    ValueError
        If a word is out of vocabulary and ``vocab`` has no ``"UNK"`` entry.
    """
    observation_matrix = np.asarray(observation_matrix, dtype=float)
    transition_matrix = np.asarray(transition_matrix, dtype=float)
    pi = transition_matrix[0, :]
    trans = transition_matrix[1:, :]
    N, T = observation_matrix.shape[0], len(observ)
    if T == 0:
        return np.zeros(0, dtype=int), 1.0

    # Resolve every observation to its vocabulary column once, up front,
    # instead of scanning the vocabulary list N times per time step.
    column_of = {}
    for col, word in enumerate(vocab):
        column_of.setdefault(word, col)  # keep first occurrence, like list.index
    obs_cols = []
    for word in observ:
        col = column_of.get(word)
        if col is None:
            if "UNK" not in column_of:
                raise ValueError("'UNK' is not in vocab")
            col = column_of["UNK"]
        obs_cols.append(col)

    trellis = np.zeros((N, T))
    backpointer = np.zeros((N, T), dtype=int)
    trellis[:, 0] = pi * observation_matrix[:, obs_cols[0]]

    for t in range(1, T):
        # scores[i, s]: best path ending in state i at t-1, then moving to s.
        scores = trellis[:, t - 1][:, None] * trans * observation_matrix[:, obs_cols[t]][None, :]
        backpointer[:, t] = np.argmax(scores, axis=0)
        trellis[:, t] = np.max(scores, axis=0)

    prob = np.max(trellis[:, T - 1])
    bestpath = np.zeros(T, dtype=int)
    bestpath[T - 1] = np.argmax(trellis[:, T - 1])
    # backpointer[s, t] is the predecessor (at t-1) of state s at time t, so
    # the state at t-1 is read from column t, not from column t-1.
    for t in range(T - 1, 0, -1):
        bestpath[t - 1] = backpointer[bestpath[t], t]
    return bestpath, prob
                   

In [194]:
def viterbi_log(observ, observation_matrix, transition_matrix, vocab):
    """Decode the most probable state sequence with Viterbi in log space.

    Same model layout as ``viterbi`` but sums log-probabilities instead of
    multiplying probabilities, so it does not underflow on long inputs.

    Parameters
    ----------
    observ : sequence of str
        Observed words.
    observation_matrix : array-like, shape (N, V)
        ``observation_matrix[s, v]`` is the emission probability of vocabulary
        entry ``v`` in state ``s``.
    transition_matrix : array-like, shape (N + 1, N)
        Row 0 is used as the initial distribution; rows ``1..N`` hold the
        state-to-state transition probabilities ``[from, to]``.
    vocab : sequence of str
        Vocabulary giving the column order of ``observation_matrix``. Words
        not found in it are mapped to the ``"UNK"`` entry (same rule as
        ``vocab_index_observv``).

    Returns
    -------
    bestpath : numpy.ndarray of int, shape (T,)
        Most probable state index for each observation, in input order.
    prob : float
        Natural-log probability of that path (``0.0`` for an empty input).

    Raises
    ------
    ValueError
        If a word is out of vocabulary and ``vocab`` has no ``"UNK"`` entry.
    """
    observation_matrix = np.asarray(observation_matrix, dtype=float)
    transition_matrix = np.asarray(transition_matrix, dtype=float)
    N, T = observation_matrix.shape[0], len(observ)
    if T == 0:
        return np.zeros(0, dtype=int), 0.0

    # Take the logs once; zero probabilities legitimately become -inf, so
    # silence the divide-by-zero warning numpy would emit for log(0).
    with np.errstate(divide="ignore"):
        log_pi = np.log(transition_matrix[0, :])
        log_trans = np.log(transition_matrix[1:, :])
        log_obs = np.log(observation_matrix)

    # Resolve every observation to its vocabulary column once, up front,
    # instead of scanning the vocabulary list N times per time step.
    column_of = {}
    for col, word in enumerate(vocab):
        column_of.setdefault(word, col)  # keep first occurrence, like list.index
    obs_cols = []
    for word in observ:
        col = column_of.get(word)
        if col is None:
            if "UNK" not in column_of:
                raise ValueError("'UNK' is not in vocab")
            col = column_of["UNK"]
        obs_cols.append(col)

    trellis = np.zeros((N, T))
    backpointer = np.zeros((N, T), dtype=int)
    trellis[:, 0] = log_pi + log_obs[:, obs_cols[0]]

    for t in range(1, T):
        # scores[i, s]: best path ending in state i at t-1, then moving to s.
        scores = trellis[:, t - 1][:, None] + log_trans + log_obs[:, obs_cols[t]][None, :]
        backpointer[:, t] = np.argmax(scores, axis=0)
        trellis[:, t] = np.max(scores, axis=0)

    prob = np.max(trellis[:, T - 1])
    bestpath = np.zeros(T, dtype=int)
    bestpath[T - 1] = np.argmax(trellis[:, T - 1])
    # backpointer[s, t] is the predecessor (at t-1) of state s at time t, so
    # the state at t-1 is read from column t, not from column t-1.
    for t in range(T - 1, 0, -1):
        bestpath[t - 1] = backpointer[bestpath[t], t]
    return bestpath, prob
                   

In [146]:
# End-to-end setup for the tagger; the cells below depend on these names.
# Paths are machine-specific absolute paths.
# Parse the training corpus and also dump it as CSV.
df = read_train_file('/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/Train.txt', '/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/tag_word.csv')
# Smoothed tag-bigram probabilities (row 0 is later used as the initial distribution).
transition_matrix = create_transition_matrix(df, alpha=0.01)
# Vocabulary list; its order defines the columns of observation_matrix.
vocab = extract_vocab(df)
# Smoothed emission probabilities, one row per tag (the vocabulary is re-derived inside the call).
observation_matrix = create_observation_matrix(df, alpha=0.01)
# Flat token and gold-tag lists for evaluation.
observ, gold_tag = read_test_file('/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/Test.txt')

In [232]:
gold_tag[10:53]

['N',
 'N',
 'DELM',
 'P',
 'N',
 'DELM',
 'N',
 'DELM',
 'N',
 'DELM',
 'PRO',
 'V',
 'PRO',
 'V',
 'DELM',
 'DELM',
 'N',
 'P',
 'N',
 'PRO',
 'V',
 'DELM',
 'N',
 'QUA',
 'N',
 'N',
 'CON',
 'N',
 'P',
 'N',
 'ADJ',
 'V',
 'DELM',
 'PRO',
 'P',
 'P',
 'N',
 'N',
 'CON',
 'N',
 'N',
 'V',
 'DELM']

In [208]:
len(observ)

259794

In [160]:
observation_matrix.shape

(23, 10218)

In [184]:
path , prob = viterbi(observ, observation_matrix, transition_matrix, vocab)
# tags = [all_tags[i] for i in path] 


KeyboardInterrupt: 

In [105]:
observation_matrix.shape

(23, 10218)

In [147]:
O = observ[:10]
O

['هجري', 'شمسي', 'فوت', 'كرد', 'و', 'در', 'قم', 'دفن', 'شد', '.']

In [141]:
T = gold_tag[:10]

In [142]:
T

['ADJ', 'ADJ', 'N', 'V', 'CON', 'P', 'N', 'N', 'V', 'DELM']

In [190]:
path , prob = viterbi(O, observation_matrix, transition_matrix, vocab)

In [192]:
prob

8.017610903233123e-29

In [148]:
path , prob = viterbi(O, observation_matrix, transition_matrix, vocab)

best path pointer : 5
lst before loop : [5]
last idx : 5, t : 8
lst in loop : [5, 12]
last idx : 12, t : 7
lst in loop : [5, 12, 12]
last idx : 12, t : 6
lst in loop : [5, 12, 12, 16]
last idx : 16, t : 5
lst in loop : [5, 12, 12, 16, 3]
last idx : 3, t : 4
lst in loop : [5, 12, 12, 16, 3, 22]
last idx : 22, t : 3
lst in loop : [5, 12, 12, 16, 3, 22, 12]
last idx : 12, t : 2
lst in loop : [5, 12, 12, 16, 3, 22, 12, 12]
last idx : 12, t : 1
lst in loop : [5, 12, 12, 16, 3, 22, 12, 12, 0]
last idx : 0, t : 0
lst in loop : [5, 12, 12, 16, 3, 22, 12, 12, 0, 0]


In [168]:
path , prob = viterbi2(O, observation_matrix, transition_matrix, vocab)

In [169]:
path

array([ 0., 12., 12., 22.,  3., 16., 12., 12., 22.,  5.])

In [199]:
path = path.astype(np.int16)

In [201]:
import time
start_time = time.time()


In [205]:
print(f'wall time : {(time.time() - start_time)} seconds')

wall time : 46.90720868110657 seconds


In [206]:
BASE_PATH = '/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/'
with open(BASE_PATH + "result.txt", "w") as f:
  f.write(f'wall time : {(time.time() - start_time)} seconds\n')
  f.write(f'hello')

In [200]:
path

array([ 0,  0, 12, 12, 22,  3, 16, 12, 12,  5], dtype=int16)

In [178]:
prob

-64.69332721169258

In [180]:
from scipy.special import logsumexp
x = logsumexp(prob)

In [181]:
x

-64.69332721169258

In [149]:
prob

8.017610903233123e-29

In [150]:
tags = [all_tags[i] for i in path] 


In [153]:
T

['ADJ', 'ADJ', 'N', 'V', 'CON', 'P', 'N', 'N', 'V', 'DELM']

In [151]:
tags

['DELM', 'N', 'N', 'P', 'CON', 'V', 'N', 'N', 'ADJ', 'ADJ']

In [152]:
# Token-level accuracy of the predicted tags against the gold tags T.
true = sum(1 for i in range(len(tags)) if tags[i] == T[i])

acc = true / len(tags)
print(acc)

0.4


In [216]:
len(observ)

259794

In [218]:
observation_matrix[all_tags.index("N"), vocab.index("UNK")]

0.05029613704799594

In [223]:
for i in range(len(observation_matrix[:, vocab.index("UNK")])):
    tag = all_tags[i]
    print(f'P("UNK"|{tag}) => {observation_matrix[i, vocab.index("UNK")]}')

P("UNK"|ADJ) => 0.05645753449735386
P("UNK"|ADV) => 0.03682053690057575
P("UNK"|AR) => 0.3742698758201663
P("UNK"|CON) => 0.003289304736220377
P("UNK"|DEFAULT) => 0.03282042887542969
P("UNK"|DELM) => 0.00027017464613287965
P("UNK"|DET) => 0.0011837870789994753
P("UNK"|IF) => 0.002027379661969569
P("UNK"|INT) => 0.009694759070838933
P("UNK"|MORP) => 0.04274489708771031
P("UNK"|MQUA) => 0.010972813625941696
P("UNK"|MS) => 0.07332267306282897
P("UNK"|N) => 0.05029613704799594
P("UNK"|NP) => 0.00900338741308611
P("UNK"|OH) => 0.00884568225608688
P("UNK"|OHH) => 0.01875349878708714
P("UNK"|P) => 0.0004898694028366529
P("UNK"|PP) => 0.0374505823271717
P("UNK"|PRO) => 0.001721060902516772
P("UNK"|PS) => 0.015092356209641085
P("UNK"|QUA) => 0.004167143555754403
P("UNK"|SPEC) => 0.002233623778141448
P("UNK"|V) => 0.033439028979213685


In [220]:
len(observation_matrix[:, vocab.index("UNK")])

23

In [226]:
observ[12:55]

['،',
 'از',
 'داستان',
 '"',
 'ويلانالدوله',
 '"',
 'مجموعه',
 '"',
 'يكي',
 'بود',
 'يكي',
 'نبود',
 '"',
 '،',
 'نمونهاي',
 'از',
 'نثر',
 'وي',
 'است',
 ':',
 'ويلانالدوله',
 'هر',
 'روز',
 'صبح',
 'كه',
 'چشمش',
 'از',
 'خواب',
 'باز',
 'ميشود',
 '،',
 'خود',
 'را',
 'در',
 'خانه',
 'غير',
 'و',
 'رختخواب',
 'ناشناسي',
 'ميبيند',
 '.',
 'محض',
 'خالي']

In [207]:
from sklearn.metrics import classification_report
print(classification_report(T, tags))  #########

              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00         2
         CON       1.00      1.00      1.00         1
        DELM       0.00      0.00      0.00         1
           N       0.75      1.00      0.86         3
           P       0.00      0.00      0.00         1
           V       0.00      0.00      0.00         2

    accuracy                           0.40        10
   macro avg       0.29      0.33      0.31        10
weighted avg       0.33      0.40      0.36        10



In [135]:
all_tags

['ADJ',
 'ADV',
 'AR',
 'CON',
 'DEFAULT',
 'DELM',
 'DET',
 'IF',
 'INT',
 'MORP',
 'MQUA',
 'MS',
 'N',
 'NP',
 'OH',
 'OHH',
 'P',
 'PP',
 'PRO',
 'PS',
 'QUA',
 'SPEC',
 'V']

In [138]:
observation_matrix[all_tags.index("ADJ"), vocab.index("UNK")]

0.05665057656208099

In [140]:
observation_matrix[all_tags.index("P"), vocab.index("UNK")]

0.0004910988336402701

In [None]:
# Completed the unfinished import statement (it was a SyntaxError);
# classification_report is the sklearn name the following cells use.
from sklearn.metrics import classification_report

In [210]:
len(gold_tag)

259794

In [211]:
from utils import *
pred_tag = load_list('/mnt/DAE855F7E855D1FD/github_msc/NLP/HW3/predicted_tag')
len(pred_tag)

259794

In [213]:
pred_tag[:10]


['ADJ', 'ADJ', 'N', 'N', 'V', 'CON', 'P', 'N', 'N', 'V']

In [212]:
gold_tag[:10]

['ADJ', 'ADJ', 'N', 'V', 'CON', 'P', 'N', 'N', 'V', 'DELM']

In [214]:
print(classification_report(gold_tag, pred_tag))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         ADJ       0.13      0.69      0.21     32226
         ADV       0.01      0.00      0.00      3096
          AR       0.03      0.65      0.05        75
         CON       0.02      0.01      0.01     20781
     DEFAULT       0.00      0.00      0.00        21
        DELM       0.07      0.03      0.04     27934
         DET       0.00      0.00      0.00      3839
          IF       0.00      0.00      0.00       177
         INT       0.00      0.00      0.00         5
        MORP       0.00      0.00      0.00       176
        MQUA       0.00      0.00      0.00        37
          MS       0.00      0.00      0.00        62
           N       0.39      0.14      0.20    115385
          NP       0.00      0.00      0.00         0
          OH       0.00      0.00      0.00        10
           P       0.03      0.01      0.02     30790
          PP       0.00      0.00      0.00        56
         PRO       0.01    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
