# Read word tables and write word lists

## Try with the English table  

In [1]:
import pandas as pd

In [2]:
# Load the English word table; the file is tab-separated.
df = pd.read_csv('tables/English.tsv', delimiter='\t')
# Report the row count, then preview the first five rows.
print(df.shape[0])
df.head(n=5)

1475


Unnamed: 0,ID,Language_ID,Parameter_ID,Form,Segments,BorrowedScore,Essential,Borrowability
0,13-1-1-1,13,1-100,world,w ɜː l d,0.0,False,0.401887
1,13-1-21-1,13,1-210,land,l æ n d,0.0,False,0.268285
2,13-1-212-1,13,1-212,soil,s ɔɪ l,1.0,True,0.099935
3,13-1-213-1,13,1-213,dust,d ʌ s t,0.0,False,0.233766
4,13-1-214-1,13,1-214,mud,m ʌ d,0.5,False,0.153974


In [3]:
# Pull the space-separated segment strings out of the table as a plain list.
words = df['Segments'].tolist()
print(len(words))
# words  # uncomment to inspect the full list

1475


In [4]:
# Write every word on its own line, with one leading and one trailing space.
out_path = 'data/English.txt'
# The segments contain IPA characters, so the encoding is fixed to UTF-8
# instead of relying on the platform default.
with open(out_path, 'w', encoding='utf-8') as out:
    out.writelines(' '+word+' \n' for word in words)

## Generalize to all language tables  

In [17]:
import pandas as pd

def makeWordData(language):
    """Write the segmented words of one language table to a word-list file.

    Reads ``tables/<language>.tsv`` (tab-separated), takes its ``Segments``
    column, prints a one-line size summary, and writes each entry on its own
    line of ``data/<language>.txt`` with one leading and one trailing space.

    Args:
        language: Base name of the table file, without directory or extension.
    """
    df = pd.read_csv('tables/'+language+'.tsv', sep='\t')
    words = df.Segments.tolist()
    print('Language =', language,'; size =', len(df), '; len(words) =', len(words))

    # The segments contain non-ASCII (IPA) characters; pin the encoding to
    # UTF-8 so the output does not depend on the platform default.
    with open('data/'+language+'.txt', 'w', encoding='utf-8') as out:
        out.writelines(' '+word+' \n' for word in words)


In [18]:
# Try the generic function on the English table first.
makeWordData('English')

Language = English ; size = 1475 ; len(words) = 1475


In [19]:
# Generate word lists for four more language tables.
for language_name in ('Hup', 'Imbabura Quechua', 'Mapudungun', 'Wichí'):
    makeWordData(language_name)

Language = Hup ; size = 1179 ; len(words) = 1179
Language = Imbabura Quechua ; size = 1319 ; len(words) = 1319
Language = Mapudungun ; size = 1412 ; len(words) = 1412
Language = Wichí ; size = 1361 ; len(words) = 1361


## Split into train, validation, and test sets - English 

In [5]:
# Reload the English table and extract its segmented words.
df = pd.read_csv('tables/'+'English'+'.tsv', delimiter='\t')
words = df['Segments'].tolist()
len(words)

1475

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the words as the test set.
train, test = train_test_split(words, test_size=0.15)
# Carve a validation set with as many words as the test set out of the rest.
train, valid = train_test_split(train, test_size=len(test))
print(*map(len, (train, valid, test)))

1031 222 222


In [7]:
# Save the training split, one space-padded word per line. UTF-8 is set
# explicitly because the segments contain IPA characters.
with open('data/'+'English'+'.train.txt', 'w', encoding='utf-8') as tr_out:
    tr_out.writelines(' '+word+' \n' for word in train)

## Generalize the split to all language tables  

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

def _write_words(path, words):
    """Write each word on its own line of *path*, padded with one space per side."""
    # The segments contain non-ASCII (IPA) characters; pin the encoding to
    # UTF-8 so the output does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as out:
        out.writelines(' '+word+' \n' for word in words)


def makeWordDatasets(language, split=0.15, random_state=None):
    """Write the full word list of a language plus its train/valid/test splits.

    Reads the ``Segments`` column of ``tables/<language>.tsv`` and writes five
    files under ``data/``: ``<language>.txt`` (all words), ``.trainval.txt``,
    ``.test.txt``, ``.train.txt`` and ``.valid.txt``. The validation set is
    taken from the train+validation portion and has as many words as the test
    set. The sizes of the splits are printed.

    Args:
        language: Base name of the table file, without directory or extension.
        split: Value passed as ``test_size`` for the first split (the test
            share of all words).
        random_state: Passed to both ``train_test_split`` calls. The default
            ``None`` keeps the splits random on every run; pass an int to make
            them reproducible.
    """
    df = pd.read_csv('tables/'+language+'.tsv', sep='\t')
    words = df.Segments.tolist()
    prefix = 'data/'+language
    _write_words(prefix+'.txt', words)

    trainval, test = train_test_split(
        words, test_size=split, random_state=random_state)
    print(language+' size of trainval, test =', len(trainval), len(test))
    _write_words(prefix+'.trainval.txt', trainval)
    _write_words(prefix+'.test.txt', test)

    # An integer test_size is an absolute count, so valid matches test in size.
    train, valid = train_test_split(
        trainval, test_size=len(test), random_state=random_state)
    print(language+' size of train, valid =', len(train), len(valid))
    _write_words(prefix+'.train.txt', train)
    _write_words(prefix+'.valid.txt', valid)


In [15]:
# Try the generic split function on the English table first.
makeWordDatasets('English', split=0.15)

English size of trainval, test = 1253 222
English size of train, valid = 1031 222


In [16]:
# Build the word list and splits for four more language tables.
for language_name in ('Hup', 'Imbabura Quechua', 'Mapudungun', 'Wichí'):
    makeWordDatasets(language_name, split=0.15)

Hup size of trainval, test = 1002 177
Hup size of train, valid = 825 177
Imbabura Quechua size of trainval, test = 1121 198
Imbabura Quechua size of train, valid = 923 198
Mapudungun size of trainval, test = 1200 212
Mapudungun size of train, valid = 988 212
Wichí size of trainval, test = 1156 205
Wichí size of train, valid = 951 205


## Verify the generated word lists  
- Read all words of a file into a list.  
- Split each word on whitespace into segments; `str.split()` discards all whitespace, including the leading and trailing spaces.

### Determine the largest segmented word size
- The longest word across all files has 22 segments.  
- A limit of 35 could be kept to handle most other languages (basic vocabulary).  

In [17]:
# Test to read and segment words.
language = 'English'
# Read the list back as UTF-8: the segments contain IPA characters, so the
# platform default encoding must not be relied on.
with open('data/'+language+'.train.txt', 'r', encoding='utf-8') as into:
    words = into.read().splitlines()
#print(words)
# split() with no argument splits on runs of whitespace and drops the
# padding spaces at both ends of the line.
print(words[0].split())

['t', 'iː']


In [18]:
# Longest word in `words`, counted in whitespace-separated segments
# (0 when the list is empty).
maxsz = max((len(word.split()) for word in words), default=0)

print(maxsz)

10


In [20]:
import glob , os
# NOTE(review): chdir changes the working directory for the rest of the
# session, so relative paths such as 'tables/...' stop resolving afterwards
# and re-running this cell fails. Kept unchanged in case later cells rely on
# it; consider glob.glob("data/*.txt") instead.
os.chdir("./data")
files = glob.glob("*.txt")

# Largest number of whitespace-separated segments in any line of any file.
maxsz = 0
for file in files:
    # Read as UTF-8: the segments contain IPA characters, so the platform
    # default encoding must not be relied on.
    with open(file, 'r', encoding='utf-8') as into:
        words = into.read().splitlines()
        for word in words:
            maxsz = max(maxsz, len(word.split()))

print(maxsz)

22
