In [1]:
import os
import pandas as pd
from time import perf_counter
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('data/ner_dataset.csv', encoding='latin1')
data.fillna(method='ffill', inplace=True)
data['Sentence #'] = data['Sentence #'].apply(lambda x: str(x).replace('Sentence: ', ''))
data['Sentence #'] = LabelEncoder().fit_transform(data['Sentence #'])
data.rename(columns={'Sentence #': 'sentence_id', 'Word':'words','Tag':'labels'},inplace=True)
data['labels'] = data['labels'].str.upper()
data

Unnamed: 0,sentence_id,words,POS,labels
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O
...,...,...,...,...
1048570,42177,they,PRP,O
1048571,42177,responded,VBD,O
1048572,42177,to,TO,O
1048573,42177,the,DT,O


In [3]:
df = pd.read_excel('./data/MetaData.xlsx')
df

Unnamed: 0,Country,Division,Region,Job_Family,Training_Provider,Generation,Ratings,Gender,Date
0,Albania,Division A,APAC,Finance,Coursera,20-30,Very Bad,Male,2004-5-14
1,Algeria,Division B,EMEA,Audit,Tedx,30-40,Bad,Female,2009-10-25
2,Argentina,Division C,NCSA,Strategy Planing,Udacity,40-55,Average,,2008-12-27
3,Australia,Division D,,General,Udemy,55-70,Good,,2009-2-9
4,Austria,Division E,,Communications,Linkedin Learning,70-90,Very Good,,1988-6-23
...,...,...,...,...,...,...,...,...,...
49995,,,,,,,,,2008-10-1
49996,,,,,,,,,1998-07-17 00:00:00
49997,,,,,,,,,1984-11-23
49998,,,,,,,,,1984-11-23


In [4]:
date = df['Date'].dropna().values.tolist()
gender = df['Gender'].dropna().values.tolist()
ratings = df['Ratings'].dropna().values.tolist()
generation = df['Generation'].dropna().values.tolist()
jf = df['Job_Family'].dropna().values.tolist()
tp = df['Training_Provider'].dropna().values.tolist()
region = df['Region'].dropna().values.tolist()
div = df['Division'].dropna().values.tolist()
cn = df['Country'].dropna().values.tolist()

In [5]:
df.dtypes

Country              object
Division             object
Region               object
Job_Family           object
Training_Provider    object
Generation           object
Ratings              object
Gender               object
Date                 object
dtype: object

In [6]:
date_new = [str(d).split(' ') for i, d in enumerate(date)]
gender_new = [str(d).split(' ') for i, d in enumerate(gender)]
ratings_new = [str(d).split(' ') for i, d in enumerate(ratings)]
generation_new = [str(d).split(' ') for i, d in enumerate(generation)]
jf_new = [str(d).split(' ') for i, d in enumerate(jf)]
tp_new = [str(d).split(' ') for i, d in enumerate(tp)]
region_new = [str(d).split(' ') for i, d in enumerate(region)]
div_new = [str(d).split(' ') for i, d in enumerate(div)]
cn_new = [str(d).split(' ') for i, d in enumerate(cn)]

In [7]:
data.drop(['POS'], axis=1, inplace=True)
data.to_csv('./data/0.csv', index=False)
data

Unnamed: 0,sentence_id,words,labels
0,0,Thousands,O
1,0,of,O
2,0,demonstrators,O
3,0,have,O
4,0,marched,O
...,...,...,...
1048570,42177,they,O
1048571,42177,responded,O
1048572,42177,to,O
1048573,42177,the,O


In [8]:
def update_data(df, col:list, label:str, sub_label:str, filename:str, verbose:bool=True):
    e, itr, rows, done = 0, 0, len(col), 0
    if verbose:
        print(f'DataFrame Shape: {df.shape}. Col Size: {rows}.')
    for x in col:
        if type(x) == list:
            for i, z in enumerate(x):
                new_data = dict()
                prev_id = 0
                if i == 0:
                    if df.shape[0] > 0:
                        prev_id = df['sentence_id'][df.shape[0]-1]
                    new_data['sentence_id'] = prev_id + 1
                    new_data['words'] = z
                    new_data['labels'] = label.upper()
                    df = df.append(new_data, ignore_index=True)
                elif i > 0:
                    new_data['sentence_id'] = df['sentence_id'][df.shape[0]-1]
                    new_data['words'] = z
                    new_data['labels'] = sub_label.upper()
                    df = df.append(new_data, ignore_index=True)
                itr += 1
        else:
            e += 1
        done += 1
        if verbose:
            print(f'Iterations: {itr}s. Done: {done}. {done/rows*100:.2f}%', end='\r')
        else:
            print(f'{done/row*100:.2f}%')
    if verbose:
        print(f'Error: {e} instances                                         ', end='\r')
    if filename.endswith('.csv'):
        df.to_csv(filename, index=False)
    elif filename.endswith('.xls') or filename.endswith('.xlsx'):
        df.to_excel(filename, index=False)
    return df

In [9]:
%%time
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), date_new, 'B-tim', 'I-tim', './data/1.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), gender_new, 'B-gen', 'I-gen', './data/2.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), ratings_new, 'B-rat', 'I-rat', './data/3.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), generation_new, 'B-gen', 'I-gen', './data/4.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), jf_new, 'B-jf', 'I-jf', './data/5.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), tp_new, 'B-tp', 'I-tp', './data/6.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), region_new, 'B-rgn', 'I-rgn', './data/7.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), div_new, 'B-div', 'I-div', './data/8.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')
s = perf_counter()
_ = update_data(pd.DataFrame(columns=['sentence_id','words','labels']), cn_new, 'B-cn', 'I-cn', './data/9.csv')
del _
e = perf_counter()
print(f'Time Taken: {e-s:.2f} seconds                                                      ')

DataFrame Shape: (0, 3). Col Size: 50000.
Time Taken: 456.13 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 2.
Time Taken: 0.01 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 8.
Time Taken: 0.06 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 6.
Time Taken: 0.04 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 16.
Time Taken: 0.06 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 12.
Time Taken: 0.05 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 3.
Time Taken: 0.01 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 7.
Time Taken: 0.04 seconds                                                      
DataFrame Shape: (0, 3). Col Size: 97.
Time Taken: 0.82 