In [3]:
import os
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm, trange

In [4]:
# Load the token-level dataset; the preview below shows the columns File_id, Word and Tag.
# NOTE(review): the preview also shows an 'Unnamed: 0' column, presumably an index that
# was written when the CSV was saved -- confirm, and consider reading with index_col=0.
df = pd.read_csv('./data/final/dataset.csv', encoding="latin1")
# Display the first 100 rows as the cell output.
df.head(100)

Unnamed: 0,File_id,Word,Tag
0,105,Name,O
1,105,:,O
2,105,Tan,B-NAME
3,105,Gang,I-NAME
4,105,Lun,I-NAME
...,...,...,...
95,105,2010,O
96,105,CIP,O
97,105,Gold,O
98,105,Award,O


In [5]:
from sklearn.utils import shuffle

class DocumentGetter(object):
    """Group a token-level frame into documents and split them into train/test/dev.

    ``data`` must provide the columns ``File_id``, ``Word`` and ``Tag``. A
    document is the list of ``(word, tag)`` pairs sharing one ``File_id``.

    Typical use: ``split_docs()`` followed by ``create_files()``; the latter
    reads the partitions that ``split_docs`` stores on the instance.
    """

    def __init__(self, data):
        self.n_sent = 1  # 1-based cursor used by get_next
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        # Select the two value columns explicitly so the grouping key is not
        # part of the frame handed to agg_func.
        self.documents = self.data.groupby("File_id")[["Word", "Tag"]].apply(agg_func)

        # Fractions used by split_docs. The dev partition receives whatever
        # remains after train and test, so dev_size is informational only.
        self.train_size = 0.8
        self.test_size = 0.1
        self.dev_size = 0.1

    def get_next(self):
        """Return the next document in ``self.documents`` order, or None when exhausted."""
        position = self.n_sent - 1
        if position >= len(self.documents):
            return None
        s = self.documents.iloc[position]
        self.n_sent += 1
        return s

    def split_docs(self, random_state=None):
        """Shuffle the documents and store the train/test/dev partitions.

        Sets ``tr_documents``, ``te_documents`` and ``de_documents``. The
        boundaries come from ``self.train_size`` and ``self.test_size``.

        Args:
            random_state: optional seed forwarded to ``Series.sample`` so a
                split can be reproduced; ``None`` gives a fresh shuffle.
        """
        df = self.documents.sample(frac=1, random_state=random_state)
        n_docs = len(df)

        train_end = round(self.train_size * n_docs)
        test_end = round(train_end + (self.test_size * n_docs))

        # .iloc keeps the slicing positional even though the index holds
        # integer File_id labels.
        self.tr_documents = df.iloc[:train_end]
        self.te_documents = df.iloc[train_end:test_end]
        self.de_documents = df.iloc[test_end:]

    def dict_to_df(self, d):
        """Flatten ``{file_id: [(word, tag), ...]}`` into a File_id/Word/Tag frame."""
        ids = []
        tokens = []
        tags = []
        for key in d:
            for (token, tag) in d[key]:
                ids.append(key)
                tokens.append(token)
                tags.append(tag)

        return pd.DataFrame({
            'File_id': ids,
            'Word': tokens,
            'Tag': tags
        })

    def create_files(self, out_dir='./data/final'):
        """Write train.csv, test.csv and dev.csv for the current split.

        Prints the number of distinct tags per partition (plus the full tag
        counts for test and dev). Requires ``split_docs`` to have been called.

        Args:
            out_dir: directory receiving the three CSV files; created when missing.
        """
        df_tr = self.dict_to_df(self.tr_documents.to_dict())
        df_te = self.dict_to_df(self.te_documents.to_dict())
        df_de = self.dict_to_df(self.de_documents.to_dict())

        print('Train', len(Counter(df_tr['Tag'].values)))
        print('Test', len(Counter(df_te['Tag'].values)), Counter(df_te['Tag'].values))
        print('Dev', len(Counter(df_de['Tag'].values)), Counter(df_de['Tag'].values))

        os.makedirs(out_dir, exist_ok=True)
        df_tr.to_csv(os.path.join(out_dir, 'train.csv'), index=False)
        df_te.to_csv(os.path.join(out_dir, 'test.csv'), index=False)
        df_de.to_csv(os.path.join(out_dir, 'dev.csv'), index=False)

In [6]:
# Group the loaded frame into documents, shuffle and partition them, then
# write train.csv / test.csv / dev.csv and print the per-partition tag counts.
doc_Getter = DocumentGetter(df)
doc_Getter.split_docs()
doc_Getter.create_files()

Train 16
Test 14 Counter({'O': 17394, 'I-ADDRESS': 67, 'I-PHONE': 33, 'I-NAME': 30, 'B-NAME': 24, 'B-PHONE': 21, 'B-EMAIL': 19, 'B-ADDRESS': 11, 'I-DOB': 10, 'B-NATIONALITY': 8, 'B-DOB': 5, 'B-GENDER': 5, 'B-AGE': 3, 'B-ETHNICITY': 3})
Dev 15 Counter({'O': 16394, 'I-ADDRESS': 79, 'I-PHONE': 32, 'I-NAME': 23, 'B-NAME': 21, 'B-PHONE': 20, 'B-EMAIL': 20, 'B-ADDRESS': 14, 'I-DOB': 11, 'B-DOB': 7, 'B-NATIONALITY': 7, 'B-AGE': 4, 'B-GENDER': 3, 'B-RELIGION': 1, 'B-ETHNICITY': 1})


In [7]:
def create_file(t, s_len, n_dataset, size=1):
    """Turn one partition CSV into a shuffled token-per-line text file.

    Reads ``./data/final/<t>.csv`` through ``prepare_data``, groups it into
    chunks with ``SentenceGetter``, shuffles the chunks, keeps the leading
    ``size`` fraction and hands the result to ``create_txt_file``. The chunk
    count is printed before and after truncation.

    Args:
        t: partition name, used for both the input CSV and the output file.
        s_len: chunk length forwarded to ``prepare_data`` and ``create_txt_file``.
        n_dataset: dataset identifier (string) forwarded to ``create_txt_file``.
        size: fraction of the shuffled chunks to keep; the default keeps all.
    """
    f_path = './data/final/' + t + '.csv'
    d = prepare_data(f_path, s_len)
    getter = SentenceGetter(d)

    # Keep words and tags of a chunk together so one shuffle reorders both.
    pairs = [([word for word, _ in chunk], [tag for _, tag in chunk])
             for chunk in getter.sentences]
    random.shuffle(pairs)

    # Plain comprehensions instead of zip(*pairs): unpacking an empty zip
    # raises ValueError when the partition produced no chunks.
    sentences = [words for words, _ in pairs]
    labels = [tags for _, tags in pairs]

    split = round(len(sentences) * size)
    print(len(sentences), len(labels))
    sentences = sentences[:split]
    labels = labels[:split]

    print(len(sentences), len(labels))
    create_txt_file(sentences, labels, t, s_len, n_dataset)
    
    
def prepare_data(file_path, s_len):
    """Load a partition CSV and label every row with its chunk id.

    Args:
        file_path: path of the CSV to read (latin1 encoded).
        s_len: chunk length forwarded to ``split_in_sentences``.

    Returns:
        The loaded frame without rows containing missing values and with the
        ``'Split #'`` column added by ``split_in_sentences``.
    """
    padding_length = 4  # forwarded as pad_len to split_in_sentences

    # Rows with any missing value are dropped before chunking.
    tokens = pd.read_csv(file_path, encoding="latin1").dropna()

    splits = split_in_sentences(s_len, padding_length, tokens)
    # f=False: keep every chunk, including those tagged 'O' throughout.
    output = filter_splits(splits, False)
    # Defensive second pass, kept from the original flow.
    return output.dropna()


def filter_splits(d, f=True):
    """Optionally drop the chunks whose tags are all ``'O'``.

    Args:
        d: frame with ``'Split #'`` and ``'Tag'`` columns.
        f: when False the frame is returned untouched.

    Returns:
        ``d`` itself when ``f`` is False or ``d`` is empty; otherwise the rows
        of ``d`` belonging to chunks that contain at least one non-'O' tag.
    """
    if not f or d.empty:
        return d

    # One grouped pass instead of re-filtering the whole frame per chunk.
    only_o = d['Tag'].eq('O').groupby(d['Split #']).all()
    droppable = only_o[only_o].index
    return d[~d['Split #'].isin(droppable)]



def split_in_sentences(sen_len, pad_len, d):
    """Label every row of ``d`` with the chunk ("split") it belongs to.

    Each document (rows sharing a ``File_id``) is cut into consecutive chunks
    of at most ``sen_len`` tokens. Chunks are numbered from 1 in order of first
    appearance and stored as ``'split <n>'`` in a new ``'Split #'`` column.

    Labels are computed row by row, so they stay aligned with the rows even
    when the documents are not stored in sorted ``File_id`` order.

    Args:
        sen_len: maximum number of tokens per chunk; must be at least 1.
        pad_len: unused; kept so existing callers keep working.
        d: frame with a ``File_id`` column. It is modified in place.

    Returns:
        The same frame ``d`` with the ``'Split #'`` column added.

    Raises:
        ValueError: if ``sen_len`` is smaller than 1.
    """
    if sen_len < 1:
        raise ValueError('sen_len must be at least 1')

    seen_tokens = {}    # File_id -> tokens of that document seen so far
    chunk_numbers = {}  # (File_id, chunk index in document) -> global chunk number
    splits = []

    for file_id in d['File_id'].tolist():
        position = seen_tokens.get(file_id, 0)
        seen_tokens[file_id] = position + 1

        # Integer division gives chunks of exactly sen_len tokens; only the
        # last chunk of a document may be shorter.
        key = (file_id, position // sen_len)
        number = chunk_numbers.setdefault(key, len(chunk_numbers) + 1)
        splits.append('split ' + str(number))

    d['Split #'] = splits
    return d

def create_txt_file(s, l, name, s_len, n_dataset):
    """Write chunks as ``token label`` lines, one blank line after each chunk.

    The file is written to ``./data/final/s_<s_len>/<n_dataset>/<name>.txt``;
    the directory is created when it does not exist. Tokens containing
    non-ASCII characters are skipped. Tokens are stripped of surrounding
    whitespace before being written.

    Args:
        s: sequence of chunks, each a sequence of token strings.
        l: sequence of label sequences, parallel to ``s``.
        name: output file name without extension.
        s_len: chunk length, used only to build the output path.
        n_dataset: dataset identifier (string), used only to build the output path.

    Returns:
        None. When ``s`` and ``l`` differ in length a message is printed and
        nothing is written.
    """
    if len(s) != len(l):
        print('Not the same length')
        return

    out_path = './data/final/s_' + str(s_len) + '/' + n_dataset + '/' + name + '.txt'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # The context manager closes the handle even if a write raises.
    with open(out_path, "w+") as f:
        for i in range(len(s)):
            sentence = s[i]
            labels = l[i]
            for j, token in enumerate(sentence):
                if is_ascii(token):
                    f.write(token.strip() + ' ' + labels[j] + '\n')
            f.write('\n')
    
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

In [8]:
class SentenceGetter(object):
    """Group a chunk-labelled token frame into lists of ``(word, tag)`` pairs.

    ``data`` must provide the columns ``'Split #'``, ``Word`` and ``Tag``.
    ``grouped`` is a Series keyed by chunk label; ``sentences`` holds the same
    chunks as a plain list in ``grouped`` order.
    """

    def __init__(self, data):
        self.n_sent = 1  # 1-based cursor used by get_next
        self.data = data
        self.empty = False

        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        # Select the two value columns explicitly so the grouping key is not
        # part of the frame handed to agg_func.
        self.grouped = self.data.groupby("Split #")[["Word", "Tag"]].apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next chunk in ``self.sentences`` order, or None when exhausted.

        The lookup is positional: the group keys have the form ``'split <n>'``,
        so a key built from the bare counter never matches.
        """
        position = self.n_sent - 1
        if position >= len(self.sentences):
            return None
        s = self.sentences[position]
        self.n_sent += 1
        return s

# create txt files


In [9]:
# Chunk lengths to export; each value is passed as s_len to create_file.
SENTENCE_LENGTHS = [25, 50, 75, 100, 150, 200]

In [12]:
# Build five independent random document splits (numbered 1-5). For each one,
# rewrite train/test/dev CSVs and export every partition at every chunk length.
for i in range(1, 6):
    doc_Getter = DocumentGetter(df)
    doc_Getter.split_docs()
    doc_Getter.create_files()
    for sl in SENTENCE_LENGTHS:
        for split_name in ('train', 'test', 'dev'):
            create_file(split_name, sl, str(i))
    

Train 16
Test 14 Counter({'O': 15864, 'I-ADDRESS': 58, 'I-PHONE': 53, 'I-NAME': 26, 'B-NAME': 23, 'B-PHONE': 23, 'B-EMAIL': 21, 'I-DOB': 14, 'B-ADDRESS': 10, 'B-NATIONALITY': 9, 'B-DOB': 6, 'B-GENDER': 5, 'B-AGE': 2, 'B-ETHNICITY': 1})
Dev 15 Counter({'O': 11928, 'I-ADDRESS': 75, 'I-PHONE': 59, 'I-NAME': 22, 'B-NAME': 20, 'B-PHONE': 20, 'B-EMAIL': 18, 'B-ADDRESS': 12, 'B-DOB': 6, 'I-DOB': 6, 'B-NATIONALITY': 5, 'B-AGE': 3, 'B-GENDER': 3, 'B-ETHNICITY': 2, 'B-RELIGION': 1})
5340 5340
5340 5340
654 654
654 654
496 496
496 496
2713 2713
2713 2713
332 332
332 332
253 253
253 253
1833 1833
1833 1833
225 225
225 225
172 172
172 172
1398 1398
1398 1398
172 172
172 172
131 131
131 131
956 956
956 956
117 117
117 117
92 92
92 92
739 739
739 739
90 90
90 90
70 70
70 70
Train 16
Test 14 Counter({'O': 13471, 'I-ADDRESS': 58, 'I-PHONE': 52, 'I-NAME': 22, 'B-PHONE': 22, 'B-NAME': 21, 'B-EMAIL': 20, 'B-ADDRESS': 10, 'I-DOB': 6, 'B-DOB': 5, 'B-NATIONALITY': 5, 'B-GENDER': 2, 'B-RELIGION': 1, 'B-AGE': 

In [31]:
# NOTE(review): stale cell -- create_file is defined in this notebook with three
# required parameters (t, s_len, n_dataset) and SENTENCE_LENGTH is not defined in
# any visible cell, so this call fails on a fresh kernel. Update or remove.
create_file('train', SENTENCE_LENGTH)

715 715
715 715


In [32]:
# NOTE(review): stale cell -- create_file is defined in this notebook with three
# required parameters (t, s_len, n_dataset) and SENTENCE_LENGTH is not defined in
# any visible cell, so this call fails on a fresh kernel. Update or remove.
create_file('test', SENTENCE_LENGTH)

97 97
97 97


In [33]:
# NOTE(review): stale cell -- create_file is defined in this notebook with three
# required parameters (t, s_len, n_dataset) and SENTENCE_LENGTH is not defined in
# any visible cell, so this call fails on a fresh kernel. Update or remove.
create_file('dev', SENTENCE_LENGTH)

87 87
87 87
