In [1]:
## Imports
import pandas as pd
import numpy as np
import os

In [2]:
# Directory holding the training corpora, given relative to the working
# directory and resolved to an absolute, symlink-free path.
TRAIN_CORPORA_DIR = 'es-corpora/train/'
corpora_path = os.path.realpath(TRAIN_CORPORA_DIR)

In [3]:
# Load the tagged Spanish corpus.
# In this dataframe every sentence except the first one is preceded by a
# separator row whose column values are NaN.
tagged_corpus_csv = corpora_path + '/spanish_tagged_with_features_03.csv'
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single consistently inferred dtype.
df = pd.read_csv(tagged_corpus_csv, low_memory=False)
df.head(15)

Unnamed: 0,token,lemma,pos_tag,sense,tag,emission,transition,category,type
0,Arequipa,arequipa,NP00000,0.0,NP,Arequipa|NP,NP|,N,P
1,es,ser,VSIP3S0,1775973.0,VS,es|VS,VS|NP,V,S
2,el,el,DA0MS0,0.0,DA,el|DA,DA|VS,D,A
3,nombre,nombre,NCMS000,4778525.0,NC,nombre|NC,NC|DA,N,C
4,de,de,SPS00,0.0,SP,de|SP,SP|NC,S,P
5,el,el,DA0MS0,0.0,DA,el|DA,DA|SP,D,A
6,asteroide,asteroide,NCMS000,6755791.0,NC,asteroide|NC,NC|DA,N,C
7,número,número,NCMS000,3990394.0,NC,número|NC,NC|NC,N,C
8,737,737,Z,0.0,Z,737|Z,Z|NC,Z,
9,de,de,SPS00,0.0,SP,de|SP,SP|Z,S,P


In [4]:
# Locate the sentence-separator rows: the rows whose 'token' value is NaN.
# Each one sits between two consecutive sentences, so the next sentence
# starts on the row right after it.
# Row *positions* (not index labels) are collected because these values are
# meant to be used as positional slice bounds; with the default RangeIndex
# the two coincide.
# NOTE(review): if df was read with pandas' default NA handling, literal
# tokens such as 'NA' or 'null' are also parsed as NaN and would be treated
# as separators here -- confirm the corpus contains none.
init_state_indexes = np.flatnonzero(df['token'].isna().to_numpy())
init_state_indexes

array([     13,      71,      74, ..., 5032545, 5032549, 5032557])

In [5]:
# Build a list of numpy arrays, one per sentence, holding that sentence's tokens.
# The 'token' column is pulled out once and sliced by position, instead of
# building a new pandas Series for every sentence.
token_values = df['token'].to_numpy()

# The first sentence has no separator row before it: it runs from the top of
# the column up to (but excluding) the first separator.
token_sentences_array = [token_values[:init_state_indexes[0]]]

# Every other sentence lies strictly between two consecutive separator rows;
# the `+ 1` skips the separator row itself.
# NOTE(review): rows located after the last separator are never collected --
# confirm the data ends with a separator row, otherwise the final sentence is
# missing from this list.
token_sentences_array.extend(
    token_values[start + 1:stop]
    for start, stop in zip(init_state_indexes[:-1], init_state_indexes[1:])
)
token_sentences_array[:10]

[array(['Arequipa', 'es', 'el', 'nombre', 'de', 'el', 'asteroide',
        'número', '737', 'de', 'la', 'serie', '.'], dtype=object),
 array(['Conocido', 'previamente', 'con', 'la', 'designación',
        'provisional', '1912', 'QB', 'y', 'la', 'adicional', '1961', 'JH',
        'el', 'asteroide', '(', '737', ')', 'fue', 'descubierto', 'en',
        'Winchester', 'de', '7 de diciembre de 1912', 'por',
        'J.H. Metcalf', 'y', 'bautizado', 'en', 'honor', 'de', 'la',
        'ciudad', 'peruana', 'en', 'la', 'cual', 'funcionó', 'de', '1889',
        'a', '1927', 'el', 'Observatorio Boyden', ',', 'una',
        'instalación', 'de', 'observación', 'de', 'el', 'hemisferio',
        'sur', 'de', 'el', 'Harvard College Observatory', '.'],
       dtype=object),
 array(['ENDOFARTICLE', '.'], dtype=object),
 array(['La', 'provincia', 'de', 'Teherán', '(', 'Farsi', ':', ')', 'es',
        'una', 'de', 'las', '30', 'provincias', 'de', 'Irán', ',', 'tiene',
        'cerca', 'de', '12 millones', 

In [6]:
# Build a list of numpy arrays, one per sentence, holding that sentence's tags.
# The 'tag' column is pulled out once and sliced by position, instead of
# building a new pandas Series for every sentence.
tag_values = df['tag'].to_numpy()

# The first sentence has no separator row before it: it runs from the top of
# the column up to (but excluding) the first separator.
tag_sentences_array = [tag_values[:init_state_indexes[0]]]

# Every other sentence lies strictly between two consecutive separator rows;
# the `+ 1` skips the separator row itself.
# NOTE(review): rows located after the last separator are never collected --
# confirm the data ends with a separator row, otherwise the final sentence is
# missing from this list.
tag_sentences_array.extend(
    tag_values[start + 1:stop]
    for start, stop in zip(init_state_indexes[:-1], init_state_indexes[1:])
)
tag_sentences_array[:10]

[array(['NP', 'VS', 'DA', 'NC', 'SP', 'DA', 'NC', 'NC', 'Z', 'SP', 'DA',
        'NC', 'Fp'], dtype=object),
 array(['VM', 'RG', 'SP', 'DA', 'NC', 'AQ', 'Z', 'NP', 'CC', 'DA', 'AQ',
        'Z', 'NP', 'DA', 'NC', 'Fp', 'Z', 'Fp', 'VS', 'VM', 'SP', 'NP',
        'SP', 'W', 'SP', 'NP', 'CC', 'VM', 'SP', 'NC', 'SP', 'DA', 'NC',
        'AQ', 'SP', 'DA', 'PR', 'VM', 'SP', 'Z', 'SP', 'Z', 'DA', 'NP',
        'Fc', 'DI', 'NC', 'SP', 'NC', 'SP', 'DA', 'NC', 'NC', 'SP', 'DA',
        'NP', 'Fp'], dtype=object),
 array(['NP', 'Fp'], dtype=object),
 array(['DA', 'NC', 'SP', 'NP', 'Fp', 'NP', 'Fd', 'Fp', 'VS', 'PI', 'SP',
        'DA', 'Z', 'NC', 'SP', 'NP', 'Fc', 'VM', 'RG', 'SP', 'Zd', 'SP',
        'NC', 'CC', 'VS', 'DA', 'NC', 'RG', 'RG', 'VM', 'SP', 'NP', 'Fp'],
       dtype=object),
 array(['VM', 'DI', 'NC', 'SP', 'SP', 'Z', 'RG', 'CC', 'P0', 'VM', 'SP',
        'DA', 'NC', 'SP', 'DA', 'NC', 'AQ', 'AQ', 'Fp'], dtype=object),
 array(['DA', 'NC', 'SP', 'NP', 'VM', 'SP', 'DA', 'NC', 'SP', 'NP'

In [7]:
# Turn the list of per-sentence token arrays into a dataframe:
# one row per sentence, one column per token position.
# Sentences shorter than the longest one are padded with missing values.
token_sentences_df = pd.DataFrame(data=token_sentences_array)
token_sentences_df.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1483,1484,1485,1486,1487,1488,1489,1490,1491,1492
0,Arequipa,es,el,nombre,de,el,asteroide,número,737,de,...,,,,,,,,,,
1,Conocido,previamente,con,la,designación,provisional,1912,QB,y,la,...,,,,,,,,,,
2,ENDOFARTICLE,.,,,,,,,,,...,,,,,,,,,,
3,La,provincia,de,Teherán,(,Farsi,:,),es,una,...,,,,,,,,,,
4,Tiene,una,extensión,de,alrededor de,18.637,km²,y,se,encuentra,...,,,,,,,,,,
5,La,provincia,de,Teherán,limita,con,las,provincias,de,Mazandarán,...,,,,,,,,,,
6,La,ciudad,de,Teherán,es,la,capital,tanto,de,la,...,,,,,,,,,,
7,Otros,distritos,son,:,Shemiranat,",",Rey,",",Islam Shahr,",",...,,,,,,,,,,
8,La,provincia,alcanzó,su,importancia,cuando,Teherán,sustituyó,",",como,...,,,,,,,,,,
9,Hoy,en,día,(,2004,),Teherán,se,encuentra,entre,...,,,,,,,,,,


In [8]:
# Turn the list of per-sentence tag arrays into a dataframe:
# one row per sentence, one column per tag position.
# Sentences shorter than the longest one are padded with missing values.
tag_sentences_df = pd.DataFrame(data=tag_sentences_array)
tag_sentences_df.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1483,1484,1485,1486,1487,1488,1489,1490,1491,1492
0,NP,VS,DA,NC,SP,DA,NC,NC,Z,SP,...,,,,,,,,,,
1,VM,RG,SP,DA,NC,AQ,Z,NP,CC,DA,...,,,,,,,,,,
2,NP,Fp,,,,,,,,,...,,,,,,,,,,
3,DA,NC,SP,NP,Fp,NP,Fd,Fp,VS,PI,...,,,,,,,,,,
4,VM,DI,NC,SP,SP,Z,RG,CC,P0,VM,...,,,,,,,,,,
5,DA,NC,SP,NP,VM,SP,DA,NC,SP,NP,...,,,,,,,,,,
6,DA,NC,SP,NP,VS,DA,NC,RG,SP,DA,...,,,,,,,,,,
7,DI,NC,VS,Fd,NP,Fc,NP,Fc,NP,Fc,...,,,,,,,,,,
8,DA,NC,VM,DP,NC,CS,NP,VM,Fc,CS,...,,,,,,,,,,
9,RG,SP,NC,Fp,Z,Fp,NP,P0,VM,SP,...,,,,,,,,,,
