In [32]:
import io
import pandas as pd

old_nltk_data = '/Users/liling.tan/nltk_data/'

In [33]:
# ABC corpus.
with io.open(old_nltk_data+'corpora/abc/rural.txt') as fin:
    rural_texts = [line.strip() for line in fin if line.strip()]
with io.open(old_nltk_data+'corpora/abc/science.txt', encoding='latin_1') as fin:
    science_texts = [line.strip().encode('utf8').decode('utf8') for line in fin if 
                    line.strip().encode('utf8').decode('utf8')]
    
abc_meta = {'title':'Australian Broadcasting Commission 2006',
            'source': 'http://www.abc.net.au/',
            'subcorpora': {'Rural News': {'source': 'http://www.abc.net.au/rural/news/'},
                           'Science News': {'source': 'http://www.abc.net.au/science/news/'}
                          }
           }

rural_df = pd.DataFrame({'text':rural_texts})
rural_df['subcorpora'] = 'Rural News'

science_df = pd.DataFrame({'text':science_texts})
science_df['subcorpora'] = 'Science News'

df_abc = pd.concat([rural_df, science_df])
df_abc.to_csv('nltk_data/corpora/abc.tsv', sep='\t', index=False)
df_abc = pd.read_csv('nltk_data/corpora/abc.tsv', sep='\t', 
                     dtype={'text':str, 'subcorpora':str})

In [62]:
# Brown
with open(old_nltk_data+'corpora/brown/cats.txt') as fin:
     categories = {line.strip().split(' ')[0]:line.strip().split(' ')[1] 
                   for line in fin}
        
brown_dir = old_nltk_data+'corpora/brown/'

rows = []
for filename in os.listdir(brown_dir):
    if filename in ['CONTENTS', 'cats.txt', 'README']:
        continue
    cat = categories[filename]
    with open(brown_dir+filename) as fin:
        i = -1
        for paragraph in fin.read().split('\n\n'):
            if not paragraph.strip():
                continue
            i += 1
            j = -1
            for sent in paragraph.split('\n'):
                if not sent.strip():
                    continue
                j += 1
                raw = sent.strip()
                text, pos = zip(*[word.split('/') for word in raw.split()])
                rows.append({'filename': filename, 
                              'para_id': i, 
                              'sent_id': j, 
                              'raw_text': raw, 
                              'tokenized_text': ' '.join(text), 
                              'tokenized_pos': ' '.join(pos), 
                              'label': cat})
                
df_brown = pd.DataFrame(rows)[['filename', 'para_id', 'sent_id', 
                              'raw_text', 'tokenized_text', 'tokenized_pos', 'label']]
df_brown.to_csv('nltk_data/corpora/brown.tsv', sep='\t', index=False)
df_brown = pd.read_csv('nltk_data/corpora/brown.tsv', sep='\t', 
                     dtype={'filename':str, 'para_id':int, 'sent_id':int,
                             'raw_text':str, 'tokenized_text':str, 'tokenized_pos':str,
                           'label':str})

In [107]:
# Gazetteers

gazetteers_filename2labels = {'mexstates.txt':'Mexico States',
                              'caprovinces.txt': 'Canada Provinces',
                              'usstateabbrev.txt': 'US State Abbreviations',
                              'uscities.txt': 'US Cities',
                              'countries.txt': 'Countries',
                              'isocountries.txt': 'Countries ISO codes',
                              'nationalities.txt': 'Nationalities',
                              'usstates.txt': 'US States'
                             }

rows = []
for filename in os.listdir(old_nltk_data+'corpora/gazetteers/'):
    if filename in ['LICENSE.txt']:
        continue
    label = gazetteers_filename2labels[filename]
    with io.open(old_nltk_data+'corpora/gazetteers/'+filename, encoding='ISO-8859-2') as fin:
        for line in fin:
            if line.strip():
                text = line.strip()
                if text == 'QuerĂŠtaro':
                    text = 'Querétaro'
                rows.append({'text':text, 'label':label})

df_gazetteers = pd.DataFrame(rows)[['text', 'label']]

#alpabet = list('abcdefghijklmnopqrstuvwxyz. ()-,') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
#alpabet += ["'"]
#[word for word in df_gazetteers['text'] if any(ch for ch in word if ch not in alpabet)]

df_gazetteers.to_csv('nltk_data/corpora/gazetteers.tsv', sep='\t', index=False)
df_gazetteers = pd.read_csv('nltk_data/corpora/gazetteers.tsv', sep='\t', 
                     dtype={'text':str, 'label':str})

Unnamed: 0,text,label
0,Aguascalientes,Mexico States
1,Baja California,Mexico States
2,Baja California Sur,Mexico States
3,Campeche,Mexico States
4,Chiapas,Mexico States
5,Chihuahua,Mexico States
6,Coahuila,Mexico States
7,Colima,Mexico States
8,Distrito Federal,Mexico States
9,Durango,Mexico States


['QuerĂŠtaro', 'Ĺland Islands', "Côte d'Ivoire", 'Réunion', 'Saint Barthélemy']

In [86]:
[w for w in gazetteers.words() if w.startswith('Quer')]

['QuerĂŠtaro']