In [None]:
# This notebook splits the dataset into train/test/dev CSV files and writes the sentence-level .txt files for each split

In [None]:
#import
import os
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm, trange

In [None]:
# load dataset
# Full token-level dataset; later cells read its File_id, Word and Tag columns.
# latin1 is the same encoding the per-split CSVs are read with further down.
df = pd.read_csv('./data/final/dataset.csv', encoding="latin1")
# preview the first 100 rows
df.head(100)

In [None]:
# class to group the documents
from sklearn.utils import shuffle

class DocumentGetter(object):
    """Group a token-level frame into documents and split them into train/test/dev.

    ``data`` must provide the columns ``File_id``, ``Word`` and ``Tag``. Every
    document is stored as a list of ``(word, tag)`` tuples keyed by its
    ``File_id``.
    """

    def __init__(self, data):
        # 1-based position of the document handed out by the next get_next() call
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        # Series indexed by File_id; each value is that document's (word, tag) list
        self.documents = self.data.groupby("File_id").apply(agg_func)

        # Fractions of the documents per split. split_docs() reads train_size and
        # test_size; the dev split receives whatever remains.
        self.train_size = 0.8
        self.test_size = 0.1
        self.dev_size = 0.1

    # get next document
    def get_next(self):
        """Return the next document as a list of ``(word, tag)`` tuples.

        Documents are handed out by position in ``self.documents``; ``None`` is
        returned once every document has been returned.
        """
        try:
            s = self.documents.iloc[self.n_sent - 1]
        except IndexError:
            # ran past the last document
            return None
        self.n_sent += 1
        return s

    # compute where the train and test slices end
    def _split_bounds(self, n_docs):
        """Return ``(train_end, test_end)`` slice positions for ``n_docs`` documents."""
        train_end = round(self.train_size * n_docs)
        test_end = round(train_end + (self.test_size * n_docs))
        return train_end, test_end

    # split documents
    def split_docs(self, random_state=None):
        """Shuffle the documents and store the train/test/dev partitions.

        Sets ``tr_documents``, ``te_documents`` and ``de_documents``. Pass
        ``random_state`` to make the shuffle reproducible; the default keeps the
        previous unseeded behaviour.
        """
        shuffled = shuffle(self.documents, random_state=random_state)
        train_end, test_end = self._split_bounds(len(shuffled))

        # .iloc keeps the slicing positional whatever the File_id dtype is
        self.tr_documents = shuffled.iloc[:train_end]
        self.te_documents = shuffled.iloc[train_end:test_end]
        self.de_documents = shuffled.iloc[test_end:]

    # convert a dict to a df
    def dict_to_df(self, d):
        """Flatten ``{file_id: [(word, tag), ...]}`` into a File_id/Word/Tag frame."""
        ids = []
        tokens = []
        tags = []
        for key, doc in d.items():
            for (token, tag) in doc:
                ids.append(key)
                tokens.append(token)
                tags.append(tag)

        data = {
            'File_id': ids,
            'Word': tokens,
            'Tag': tags
        }

        return pd.DataFrame(data)

    # create the train dev test csv's
    def create_files(self):
        """Write the three partitions to ``./data/final/{train,test,dev}.csv``.

        split_docs() must have been called first. The number of distinct tags
        per split is printed, together with the tag counts for test and dev.
        """
        df_tr = self.dict_to_df(dict(self.tr_documents))
        df_te = self.dict_to_df(dict(self.te_documents))
        df_de = self.dict_to_df(dict(self.de_documents))

        print('Train', len(Counter(df_tr['Tag'].values)))
        print('Test', len(Counter(df_te['Tag'].values)), Counter(df_te['Tag'].values))
        print('Dev', len(Counter(df_de['Tag'].values)), Counter(df_de['Tag'].values))
        df_tr.to_csv('./data/final/train.csv', index=False)
        df_te.to_csv('./data/final/test.csv', index=False)
        df_de.to_csv('./data/final/dev.csv', index=False)

In [None]:
# create the train test and dev files
# Groups the loaded dataset by File_id, shuffles the documents into
# train/test/dev and writes ./data/final/{train,test,dev}.csv.
doc_Getter = DocumentGetter(df)
doc_Getter.split_docs()
doc_Getter.create_files()

In [None]:
# create the .txt file
def create_file(t, s_len, n_dataset, size=1):
    """Write the sentence-level ``.txt`` file for one split.

    Reads ``./data/final/<t>.csv``, cuts it into splits via ``prepare_data``,
    shuffles the resulting sentences and passes them to ``create_txt_file``.

    Parameters
    ----------
    t : str
        Name of the split; used for the CSV that is read and the ``.txt`` written.
    s_len : int
        Sentence length forwarded to ``prepare_data`` and used in the output path.
    n_dataset : str
        Dataset folder name forwarded to ``create_txt_file``.
    size : float, optional
        Fraction of the shuffled sentences to keep. The default of 1 keeps all
        of them, which is what the previously hard-coded ``SIZE = 1`` did.
    """
    f_path = './data/final/' + t + '.csv'
    d = prepare_data(f_path, s_len)
    getter = SentenceGetter(d)
    sentences = [[word for word, _ in sentence] for sentence in getter.sentences]
    labels = [[tag for _, tag in sentence] for sentence in getter.sentences]

    # shuffle sentences and labels together so they stay aligned
    c = list(zip(sentences, labels))
    random.shuffle(c)
    # zip(*[]) yields nothing to unpack, so an empty split is handled explicitly
    sentences, labels = zip(*c) if c else ((), ())

    split = round(len(sentences) * size)
    print(len(sentences), len(labels))
    sentences = sentences[:split]
    labels = labels[:split]

    print(len(sentences), len(labels))
    create_txt_file(sentences, labels, t, s_len, n_dataset)
    
# init the variables and prepare the data
def prepare_data(file_path, s_len):
    """Load one split CSV and label every row with its sentence split.

    Parameters
    ----------
    file_path : str
        CSV to read (latin1 encoded).
    s_len : int
        Sentence length handed to ``split_in_sentences``.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with a ``'Split #'`` column added and rows containing
        missing values removed.
    """
    # load dataframe and drop nan
    # NOTE(review): read_csv's default NA parsing turns tokens such as 'NA' or
    # 'null' into NaN, so those rows are dropped here - confirm this is intended.
    df = pd.read_csv(file_path, encoding="latin1").dropna()

    padding_length = 4  # forwarded to split_in_sentences
    splits = split_in_sentences(s_len, padding_length, df)
    # f=False: no split is filtered out
    output = filter_splits(splits, False)
    return output.dropna()

# filter the sentence splits
def filter_splits(d, f=True):
    """Drop the splits whose tags are all ``'O'``.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with ``'Split #'`` and ``'Tag'`` columns.
    f : bool, optional
        When False the frame is returned untouched.

    Returns
    -------
    pandas.DataFrame
        ``d`` itself when nothing is removed, otherwise a filtered copy with
        the original row order preserved.
    """
    if not f:
        return d

    tags_per_split = d.groupby('Split #')['Tag'].apply(list)
    # splits in which 'O' is the only tag that occurs
    to_remove = [key for key, tags in tags_per_split.items() if set(tags) == {'O'}]
    if not to_remove:
        return d

    # one pass over the frame instead of re-filtering it for every removed split
    return d[~d['Split #'].isin(to_remove)]


# split each document in sentences of length s
def split_in_sentences(sen_len, pad_len, d):
    """Label every row of ``d`` with the sentence split it belongs to.

    Each document (rows sharing a ``File_id``) is cut into consecutive chunks
    of at most ``sen_len`` rows. Every chunk gets a label ``'split <n>'``,
    numbered from 1 in order of first appearance. Labels are computed row by
    row, so they stay attached to the right rows even when the documents are
    not sorted by ``File_id``, and a split never spans two documents.

    Parameters
    ----------
    sen_len : int
        Maximum number of rows per split.
    pad_len : int
        Not used; kept so existing callers keep working.
    d : pandas.DataFrame
        Frame with a ``File_id`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``d`` with the ``'Split #'`` column added.
    """
    tokens_seen = {}    # File_id -> rows of that document labelled so far
    split_numbers = {}  # (File_id, chunk index) -> running split number
    splits = []

    for doc_id in d['File_id'].tolist():
        position = tokens_seen.get(doc_id, 0)
        tokens_seen[doc_id] = position + 1

        # rows 0..sen_len-1 form chunk 0, the next sen_len rows chunk 1, ...
        chunk = (doc_id, position // sen_len)
        if chunk not in split_numbers:
            split_numbers[chunk] = len(split_numbers) + 1
        splits.append('split ' + str(split_numbers[chunk]))

    d['Split #'] = splits
    return d

# function to create .txt file
def create_txt_file(s, l, name, s_len, n_dataset):
    """Write sentences and their labels to ``./data/final/s_<s_len>/<n_dataset>/<name>.txt``.

    One ``"<token> <label>"`` line is written per token, followed by an empty
    line after every sentence. Tokens containing non-ASCII characters are
    skipped together with their label. The target directory is created when it
    does not exist yet.

    Parameters
    ----------
    s : sequence of sequences of str
        Tokens, one inner sequence per sentence.
    l : sequence of sequences of str
        Labels, aligned with ``s``.
    name : str
        File name without extension.
    s_len : int
        Sentence length, used for the ``s_<s_len>`` folder name.
    n_dataset : str
        Dataset folder name.
    """
    if (len(s) != len(l)):
        print('Not the same length')
        return

    out_dir = os.path.join('./data/final', 's_' + str(s_len), str(n_dataset))
    os.makedirs(out_dir, exist_ok=True)

    # the context manager closes the file even if a write fails
    with open(os.path.join(out_dir, name + '.txt'), "w") as f:
        for sentence, labels in zip(s, l):
            for j, token in enumerate(sentence):
                if (is_ascii(token)):
                    f.write(token.strip() + ' ' + labels[j] + '\n')
            f.write('\n')

# check if token is ascii
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

In [None]:
# class for grouping the sentences in the document
class SentenceGetter(object):
    """Group a frame by its ``'Split #'`` column into sentences.

    ``data`` must provide the columns ``'Split #'``, ``Word`` and ``Tag``. Each
    sentence is a list of ``(word, tag)`` tuples.
    """

    def __init__(self, data):
        # number of the split handed out by the next get_next() call
        self.n_sent = 1
        self.data = data
        self.empty = False

        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        # Series indexed by split label; each value is that split's (word, tag) list
        self.grouped = self.data.groupby("Split #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled ``'split <n_sent>'`` and advance the counter.

        ``None`` is returned when no split carries that label.
        """
        try:
            # split labels have the form 'split <number>'
            s = self.grouped["split {}".format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return s

# create txt files


In [None]:
# sentence lengths for which the .txt files are generated; each value is passed
# to create_file as s_len in the loop below
SENTENCE_LENGTHS = [25, 50, 75, 100, 150, 200]

In [None]:
# Five passes, each with a freshly shuffled document split. Every pass
# overwrites ./data/final/{train,test,dev}.csv and then writes the .txt files
# for all sentence lengths into the dataset folder named after the pass number.
for dataset_no in range(1, 6):
    doc_Getter = DocumentGetter(df)
    doc_Getter.split_docs()
    doc_Getter.create_files()
    for sentence_length in SENTENCE_LENGTHS:
        for split_name in ('train', 'test', 'dev'):
            create_file(split_name, sentence_length, str(dataset_no))
    

In [None]:
# NOTE(review): this call raises as written - SENTENCE_LENGTH is not defined in
# the visible notebook (only SENTENCE_LENGTHS is) and create_file also requires
# an n_dataset argument. The loop cell above already writes the train files;
# TODO fix or remove this cell.
create_file('train', SENTENCE_LENGTH)

In [None]:
# NOTE(review): this call raises as written - SENTENCE_LENGTH is not defined in
# the visible notebook (only SENTENCE_LENGTHS is) and create_file also requires
# an n_dataset argument. The loop cell above already writes the test files;
# TODO fix or remove this cell.
create_file('test', SENTENCE_LENGTH)

In [None]:
# NOTE(review): this call raises as written - SENTENCE_LENGTH is not defined in
# the visible notebook (only SENTENCE_LENGTHS is) and create_file also requires
# an n_dataset argument. The loop cell above already writes the dev files;
# TODO fix or remove this cell.
create_file('dev', SENTENCE_LENGTH)