In [1]:
import pandas as pd
import numpy as np

In [2]:
################### generate emission parameters ##############################
# function to load training set to generate emission table
def load_train(train_path):
    f = open(train_path, encoding="utf8")
    lines = []
    for line in f:
        if line != '\n':
            line = line.strip('\n').split(' ')
            lines.append(line)
    df= pd.DataFrame(lines, columns = ['word', 'state'])
    return df

# function to load dev.in to generate emission table
def load_test(test_path):
    f = open(test_path, encoding='utf8')
    lines = []
    for line in f:
        if line != '\n':       
            line = line.strip('\n')
            lines.append(line)
    df= pd.DataFrame(lines, columns = ['word'])
    return df


def gen_emission_param_table_UNK(training_data_path, test_data_path):
    training_data = load_train(training_data_path)
    unique_word_list_training = training_data.word.unique()
    unique_state_list_training = training_data.state.unique()
    
    
    
    test_data = load_test(test_data_path)
    unique_word_list_test = test_data.word.unique()
    
    unk_list = np.setdiff1d(unique_word_list_test, unique_word_list_training) # return the list of words in test data but not in training data
    #non_unk_list_test = np.setdiff1d(unique_word_list_test, unk_list) # return the list of non UNK words in test data
    
    data = {word:(np.zeros(len(unique_state_list_training))) for word in unique_word_list_training}
    data["UNK"] = np.zeros(len(unique_state_list_training))    # add a UNK column to the table
    
    emission_count_table = pd.DataFrame(data, index = unique_state_list_training)           # transform the dictionary into colums with index as name of each state(y),  
                                                                                   # columns as each word of x, all entries are 0
    
    y_count_dic = {state:0 for state in unique_state_list_training}                         # y_count_dic stores Count(y) in a dictionary
    emission_param_table = pd.DataFrame(data, index = unique_state_list_training)           # emission_count_table stores all Count(y -> x) in a dataframe
                                                                                   # emission_param_table stores all the emission parameters in a dataframe
    
    print("updating emission_count_table and y_count_dic")
    for index, row in training_data.iterrows():
        word = row['word']
        state = row['state']
        #print(index, word, state)
        #print(index)
        y_count_dic[state]+=1
        if word not in unk_list:
            emission_count_table[word][state] += 1
        
    
    print("updating emission_param_table")
    k = 0.5
    for index, row in training_data.iterrows():
        word = row['word']
        state = row['state']
        #print(index)
        if word not in unk_list:
            emission_param_table[word][state] = emission_count_table[word][state] / (y_count_dic[state] + k)    
    for state in unique_state_list_training:
        emission_param_table['UNK'][state] = k/(y_count_dic[state] + k)    # compute the UNK value for each state y

    
    print("unl_list is: ",unk_list)
    print("y_count_dic is: ", y_count_dic)
    return emission_param_table, unk_list

em_param_table_UNK , unk_list = gen_emission_param_table_UNK('SG/train', 'SG/dev.in')
print(unk_list)
em_param_table_UNK

updating emission_count_table and y_count_dic
updating emission_param_table
unl_list is:  ['#25-28' '#2503love' '#2k16' ... '的家家家' '老江' '金味']
y_count_dic is:  {'O': 178169, 'B-neutral': 24868, 'I-neutral': 28760, 'B-positive': 3672, 'I-positive': 2558, 'B-negative': 1139, 'I-negative': 501}
['#25-28' '#2503love' '#2k16' ... '的家家家' '老江' '金味']


Unnamed: 0,Welcome,lunch,for,Charlotte,and,Tin,2,.,#DellPinoys,#iworkfordell,...,https://t.co/1LOoAaD3m3,https://t.co/vOaV71uXX2,https://t.co/3JciGQSjPg,https://t.co/BpLrevmNT9,https://t.co/2OyQGc0up3,didn,https://t.co/NFlUTuKIP1,https://t.co/xlI7t51227,https://t.co/AI0MvxyfPX,UNK
O,0.000112,0.000292,0.008284,0.0,0.009715,6e-06,0.001369,0.023826,6e-06,6e-06,...,6e-06,6e-06,6e-06,6e-06,6e-06,6e-06,6e-06,6e-06,6e-06,3e-06
B-neutral,0.0,4e-05,0.0,4e-05,0.0,8e-05,4e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-05
I-neutral,0.0,0.0,0.000139,0.0,0.002017,0.0,0.003442,0.005528,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7e-05
B-positive,0.0,0.0,0.0,0.0,0.0,0.000272,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000136
I-positive,0.0,0.0,0.000391,0.0,0.005081,0.0,0.003518,0.011335,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000195
B-negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000439
I-negative,0.0,0.0,0.0,0.0,0.001994,0.0,0.001994,0.007976,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000997


In [8]:
################### generate transition parameters ##############################

# load_train here is different from part 2
# read the train data line by line, replace each '\n' with STOP and START
# so that it is clear when each sentence end and when each sentence started
def load_train_transition(train_path):
    f = open(train_path, encoding="utf8")
    lines = []
    # add a START to before first sentence
    lines.append(['START'])
    for line in f:
        if line == '\n':
            lines.append(['STOP'])
            lines.append(['START'])
        
        else:
            line = line.strip('\n').split(' ')
            del line[0]
            lines.append(line)
        
    df= pd.DataFrame(lines, columns = ['state'])
    return df

def create_transition_count_table(df_train):
    unique_state_list = df_train.state.unique()
    
    data = {state:(np.zeros(len(unique_state_list))) for state in unique_state_list}
    transition_count_table = pd.DataFrame(data, index = unique_state_list)   # there will one extra STOP row and a START column
    transition_count_table = transition_count_table.drop('STOP')  #drop the extra STOP column
    transition_count_table = transition_count_table.drop(columns=['START']) #drop the extra START column
    return transition_count_table

def create_y_count_dic(df_train):
    unique_state_list = df_train.state.unique()
    y_count_dic = {state:0 for state in unique_state_list}
    y_count_dic.pop('STOP', None)  # remove the extra STOP state since we are inclusing count(STOP) when computing the transition paraameter 
    return y_count_dic

def gen_transition_param_table(train_path):
    input_data = load_train_transition(train_path)
    #transition_count_table,  y_count_dic = create_TransitionCountTable_and_YCountDic(input_data)
    transition_count_table = create_transition_count_table(input_data)
    transition_param_table = transition_count_table.copy(deep=True) # create a empty transition_param_table and store the final value of all the transition parameters
    y_count_dic = create_y_count_dic(input_data)
    print('Generating transition parameter table')
    for i in range(len(input_data) -1):          # len(input_data) -1 coz we iterating from y_i = 0 to y_i = n-1
        y_i = input_data['state'][i]
        y_i_p1 = input_data['state'][i+1]        # y_i_p1 stands for yi_+1 (y_i plus 1)
        if y_i != 'STOP':                        # we do not count the transition from STOP to some other state
            transition_count_table[y_i_p1][y_i] += 1
            y_count_dic[y_i] += 1
            
    cols_list = transition_count_table.columns.values.tolist()      
    index_list = transition_count_table.index.values.tolist()
    for index in index_list:
        for col in cols_list:
            transition_param_table[col][index] = transition_count_table[col][index]/y_count_dic[index]  # a(y_i, y_i+1) = Count(y_i, y_i+1) / Count(y_i)
    return transition_param_table

transition_param_table = gen_transition_param_table('SG/train')
transition_param_table

Generating transition parameter table


Unnamed: 0,O,B-neutral,I-neutral,STOP,B-positive,I-positive,B-negative,I-negative
START,0.899671,0.06148,0.0,0.0,0.029743,0.0,0.009106,0.0
O,0.746185,0.130477,0.0,0.10096,0.017062,0.0,0.005315,0.0
B-neutral,0.481703,0.013672,0.493285,0.010656,0.000483,0.0,0.000201,0.0
I-neutral,0.418115,0.00386,0.57347,0.004416,0.000104,0.0,3.5e-05,0.0
B-positive,0.58061,0.003813,0.0,0.020425,0.01634,0.378813,0.0,0.0
I-positive,0.519937,0.005082,0.0,0.01681,0.001955,0.456216,0.0,0.0
B-negative,0.666374,0.000878,0.0,0.042142,0.0,0.0,0.014047,0.276558
I-negative,0.598802,0.001996,0.0,0.025948,0.0,0.0,0.001996,0.371257


In [3]:
# load dev.in, store each sentence as a n x 1 dataframe , each word is a row of the 'word' column
# return a list storing all such columns
def split_test_sentence(train_path):
    f = open(train_path, encoding="utf8")
    lines = []
    # add a START to before first sentence
    lines.append(['START'])
    for line in f:
        if line == '\n':
            lines.append(['STOP'])
            lines.append(['START'])
        
        else:
            line = line.strip('\n').split(' ')
            lines.append(line)
        
    df= pd.DataFrame(lines, columns = ['word'])
    
    sentence_list = []   # a list of all sentences
    sentence = []        # a list storing a sentence
    for index, row in df.iterrows():
        word = row['word']
        if word != 'STOP':
            sentence.append(word)
        else:
            sentence.append(word)
            df_sentence= pd.DataFrame(sentence, columns = ['word'])
            sentence_list.append(df_sentence)
            sentence = []
            
    return sentence_list
sentence_list = split_test_sentence('SG/dev.in')
for i in range(5):
    print('##########################################')
    display(sentence_list[i])

##########################################


Unnamed: 0,word
0,START
1,Everything
2,sounds
3,better
4,with
5,the
6,Titanic
7,song
8,laa
9,.


##########################################


Unnamed: 0,word
0,START
1,Eat
2,time
3,(
4,@
5,Geylang
6,East
7,Market
8,&
9,Food


##########################################


Unnamed: 0,word
0,START
1,Kinda
2,excited
3,about
4,the
5,Godzilla-Kong
6,film
7,series
8,STOP


##########################################


Unnamed: 0,word
0,START
1,I'm
2,at
3,Chinatown
4,Street
5,Market
6,in
7,Singapore
8,https://t.co/ldW8BH3M8l
9,STOP


##########################################


Unnamed: 0,word
0,START
1,I'm
2,at
3,Junction
4,8
5,Shopping
6,Centre
7,in
8,Singapore
9,https://t.co/xBScjEWswb


In [6]:
sentence = sentence_list[0]
sentence

Unnamed: 0,word
0,START
1,Everything
2,sounds
3,better
4,with
5,the
6,Titanic
7,song
8,laa
9,.


AttributeError: 'DataFrame' object has no attribute 'rows'

In [4]:
# create the 2-D pi(j,u) table, initialize all values as 0 at beginning
# input 
# sentence_df: n x 1 dataframe , each word is a row of the 'word' column
# state_list : a list storing all the states except START and STOP
def create_pi_table(sentence_df, state_list):
    word_list = 
    for index,row in sentence_df.iterrows():
        
    
    return df    

df = create_pi_table('SG/train')
df

Unnamed: 0,word
0,START
1,Everything
2,sounds
3,better
4,with
5,the
6,Titanic
7,song
8,laa
9,.
