In [60]:
import io
import json
import os
import pickle
import re
from collections import OrderedDict
from collections import Counter, defaultdict
from shutil import copyfile
try:
    import xml.etree.cElementTree as ET
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree is the same API.
    import xml.etree.ElementTree as ET

import pandas as pd

import nltk

from lazyme import per_section, deduplicate, find_files
from tqdm import tqdm

In [56]:
# Source root: the first entry of NLTK's data search path.
# NOTE(review): presumably this is where the original corpora are installed — confirm.
old_nltk_data = nltk.data.path[0]

# Destination root; later cells append '/corpora/<name>/' to this prefix.
new_nltk_data = "packages/"

# ABC Corpus


In [86]:
# ABC corpus: flatten the two plain-text files into a single TSV and emit
# the accompanying XML package descriptor and JSON metadata.
directory = new_nltk_data+'/corpora/abc/'
if not os.path.exists(directory):
    os.makedirs(directory)

# Keep only non-blank lines, stripped of surrounding whitespace.
with io.open(old_nltk_data+'/corpora/abc/rural.txt') as fin:
    rural_texts = [line.strip() for line in fin if line.strip()]
# science.txt is decoded as Latin-1; the resulting str needs no further
# encode/decode round-trip.
with io.open(old_nltk_data+'/corpora/abc/science.txt', encoding='latin_1') as fin:
    science_texts = [line.strip() for line in fin if line.strip()]

rural_df = pd.DataFrame({'text':rural_texts})
rural_df['subcorpora'] = 'Rural News'

science_df = pd.DataFrame({'text':science_texts})
science_df['subcorpora'] = 'Science News'

# Write the combined frame, then read it back so df_abc reflects the file on disk.
df_abc = pd.concat([rural_df, science_df])
df_abc.to_csv(new_nltk_data+'/corpora/abc/abc.tsv', sep='\t', index=False)
df_abc = pd.read_csv(new_nltk_data+'/corpora/abc/abc.tsv', sep='\t', 
                     dtype={'text':str, 'subcorpora':str})

abc_meta = {'title':'Australian Broadcasting Commission 2006',
            'source': 'http://www.abc.net.au/',
            'subcorpora': {'Rural News': {'source': 'http://www.abc.net.au/rural/news/'},
                           'Science News': {'source': 'http://www.abc.net.au/science/news/'}
                          },
             'xml': {'id':'abc', 'name':"Australian Broadcasting Commission 2006",
                     'webpage':"http://www.abc.net.au/", 'author':"Australian Broadcasting Commission",
                      'unzip':"1"}
           }

# Build the <package .../> element from the attributes already recorded in
# abc_meta so the XML and the JSON cannot drift apart.
abc_xml = ET.Element("package", attrib=abc_meta['xml'])
# The tree must wrap abc_xml; the previous code referenced an undefined `root`.
tree = ET.ElementTree(abc_xml)
tree.write(new_nltk_data+'/corpora/abc/abc.xml')

with open(new_nltk_data+'/corpora/abc/abc-meta.json', 'w') as fout:
    json.dump(abc_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))
    

In [87]:
# Preview the first rows of the ABC frame as read back from abc.tsv.
df_abc.head()

Unnamed: 0,text,subcorpora
0,PM denies knowledge of AWB kickbacks,Rural News
1,The Prime Minister has denied he knew AWB was ...,Rural News
2,Letters from John Howard and Deputy Prime Mini...,Rural News
3,In one of the letters Mr Howard asks AWB manag...,Rural News
4,The Opposition's Gavan O'Connor says the lette...,Rural News


# Brown Corpus

In [85]:
# Brown corpus: one row per tagged sentence, with paragraph/sentence ids,
# the raw "word/tag" line, the de-tagged tokens, the tag sequence and the
# category of the source file.

directory = new_nltk_data+'/corpora/brown/'
if not os.path.exists(directory):
    os.makedirs(directory)

# cats.txt maps each file id to its category: "<fileid> <category>" per line.
with open(old_nltk_data+'/corpora/brown/cats.txt') as fin:
    categories = {}
    for line in fin:
        fields = line.strip().split(' ')
        categories[fields[0]] = fields[1]

brown_dir = old_nltk_data+'/corpora/brown/'

rows = []
for filename in os.listdir(brown_dir):
    if filename in ['CONTENTS', 'cats.txt', 'README']:
        continue
    cat = categories[filename]
    with open(brown_dir+filename) as fin:
        # Paragraphs are separated by blank lines; sentences by single newlines.
        paragraphs = [para for para in fin.read().split('\n\n') if para.strip()]
    for para_id, paragraph in enumerate(paragraphs):
        sentences = [sent.strip() for sent in paragraph.split('\n') if sent.strip()]
        for sent_id, raw in enumerate(sentences):
            # Split each token at its LAST slash: the word itself may contain
            # '/', and a plain split('/') would then put a word fragment in
            # the POS column (zip silently truncates the extra field).
            text, pos = zip(*[word.rsplit('/', 1) for word in raw.split()])
            rows.append({'filename': filename, 
                         'para_id': para_id, 
                         'sent_id': sent_id, 
                         'raw_text': raw, 
                         'tokenized_text': ' '.join(text), 
                         'tokenized_pos': ' '.join(pos), 
                         'label': cat})


df_brown = pd.DataFrame(rows)[['filename', 'para_id', 'sent_id', 
                              'raw_text', 'tokenized_text', 'tokenized_pos', 'label']]
df_brown.to_csv(new_nltk_data+'/corpora/brown/brown.tsv', sep='\t', index=False)

# Read the file back so df_brown reflects what was written to disk.
df_brown = pd.read_csv(new_nltk_data+'/corpora/brown/brown.tsv', sep='\t', 
                     dtype={'filename':str, 'para_id':int, 'sent_id':int,
                             'raw_text':str, 'tokenized_text':str, 'tokenized_pos':str,
                           'label':str})

# One (filename, label) pair per source file, sorted by file id.
df_brown_cats = df_brown[['filename', 'label']].drop_duplicates().sort_values('filename')
df_brown_cats.to_csv(new_nltk_data+'/corpora/brown/cats.tsv', sep='\t', index=False)

brown_readme = """BROWN CORPUS

A Standard Corpus of Present-Day Edited American
English, for use with Digital Computers.

by W. N. Francis and H. Kucera (1964)
Department of Linguistics, Brown University
Providence, Rhode Island, USA

Revised 1971, Revised and Amplified 1979

http://www.hit.uib.no/icame/brown/bcm.html

Distributed with the permission of the copyright holder,
redistribution permitted."""

brown_meta = {'title':'Brown Corpus',
              'description': str('A Standard Corpus of Present-Day Edited American English, '
                               'for use with Digital Computers.'),
              'authors': 'W. N. Francis and H. Kucera (1964)',
              'url': 'http://www.hit.uib.no/icame/brown/bcm.html',
              'readme': brown_readme}

with open(new_nltk_data+'/corpora/brown/brown-meta.json', 'w') as fout:
    json.dump(brown_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

In [89]:
# Preview the first rows of the Brown frame as read back from brown.tsv.
df_brown.head()

Unnamed: 0,filename,para_id,sent_id,raw_text,tokenized_text,tokenized_pos,label
0,cd05,0,0,"Furthermore/rb ,/, as/cs an/at encouragement/n...","Furthermore , as an encouragement to revisioni...","rb , cs at nn in nn nn , pps rb bez jj to vb c...",religion
1,cd05,0,1,The/at Unitarian/jj clergy/nns were/bed an/at ...,The Unitarian clergy were an exclusive club of...,at jj nns bed at jj nn in vbn nns -- cs at nn ...,religion
2,cd05,0,2,"Ezra/np Stiles/np Gannett/np ,/, an/at honorab...","Ezra Stiles Gannett , an honorable representat...","np np np , at jj nn in at nn , vbd ppl rb in a...",religion
3,cd05,0,3,"Even/rb so/rb ,/, Gannett/np judiciously/rb ar...","Even so , Gannett judiciously argued , the Ass...","rb rb , np rb vbd , at nn-tl md rb vb cs np ``...",religion
4,cd05,0,4,We/ppss today/nr are/ber not/* entitled/vbn to...,We today are not entitled to excoriate honest ...,ppss nr ber * vbn to vb jj nns wps vbd np to b...,religion


# Gazetteers

In [96]:
# Gazetteers: merge the per-category word lists into one (text, label) TSV
# and record the provenance of each list in the metadata JSON.

directory = new_nltk_data+'/corpora/gazetteers/'
if not os.path.exists(directory):
    os.makedirs(directory)

# Source file name -> human-readable label used in the 'label' column.
gazetteers_filename2labels = {'mexstates.txt':'Mexico States',
                              'caprovinces.txt': 'Canada Provinces',
                              'usstateabbrev.txt': 'US State Abbreviations',
                              'uscities.txt': 'US Cities',
                              'countries.txt': 'Countries',
                              'isocountries.txt': 'Countries ISO codes',
                              'nationalities.txt': 'Nationalities',
                              'usstates.txt': 'US States'
                             }

rows = []
for filename in os.listdir(old_nltk_data+'/corpora/gazetteers/'):
    if filename in ['LICENSE.txt']:
        continue
    label = gazetteers_filename2labels[filename]
    with io.open(old_nltk_data+'/corpora/gazetteers/'+filename, encoding='ISO-8859-2') as fin:
        for line in fin:
            if line.strip():
                text = line.strip()
                # Repair the one entry that comes out garbled under the
                # ISO-8859-2 decoding used above.
                if text == 'QuerĂŠtaro':
                    text = 'Querétaro'
                rows.append({'text':text, 'label':label})

df_gazetteers = pd.DataFrame(rows)[['text', 'label']]

df_gazetteers.to_csv(new_nltk_data + '/corpora/gazetteers/gazetteers.tsv', sep='\t', index=False)
# keep_default_na=False: entries such as "NA" are legitimate strings here and
# must not be converted to NaN on read-back.
df_gazetteers = pd.read_csv(new_nltk_data + '/corpora/gazetteers/gazetteers.tsv', sep='\t', 
                     dtype={'text':str, 'label':str}, keep_default_na=False)

# 'original_file' values agree with gazetteers_filename2labels above (the
# two US state entries were previously swapped).
gazetteers_meta = {'title':'Geolocation Gazetteers',
                    'subcorpora': {'Mexico States': {'original_file': 'mexstates.txt'},
                                   'Canada Provinces': {'original_file': 'caprovinces.txt'},
                                   'US State Abbreviations': {'original_file': 'usstateabbrev.txt'},
                                   'US States': {'original_file': 'usstates.txt'},
                                   'US Cities': {'original_file':'uscities.txt',
                                                 'source': 'http://en.wikipedia.org/wiki/List_of_cities_in_the_United_States_with_over_100%2C000_people',
                                                 'license': 'GNU Free Documentation License',
                                                 'license_url': 'http://www.gnu.org/copyleft/fdl.html'
                                                },
                                   'Countries': {'original_file':'countries.txt',
                                                 'source':'http://en.wikipedia.org/wiki/List_of_countries',
                                                 'license': 'GNU Free Documentation License',
                                                 'license_url': 'http://www.gnu.org/copyleft/fdl.html'
                                                },
                                   'Countries ISO codes': {'original_file': 'isocountries.txt',
                                                          'source': 'http://www.guavastudios.com/country-list.htm'
                                                          },
                                   'Nationalities': {'original_file': 'nationalities.txt',
                                                    'source': 'http://www.guavastudios.com/nationalities-list.htm'
                                                    },
                                  }
                    }

with open(new_nltk_data+'/corpora/gazetteers/gazetteers-meta.json', 'w') as fout:
    json.dump(gazetteers_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

# Words

In [98]:
# Words: convert the two plain word lists (one word per line) into
# single-column TSV files and write the shared metadata JSON.

directory = new_nltk_data+'/corpora/words/'
if not os.path.exists(directory):
    os.makedirs(directory)

with open(old_nltk_data+'/corpora/words/en') as fin:
    en_words = [line.strip() for line in fin]

with open(old_nltk_data+'/corpora/words/en-basic') as fin:
    basic_en_words = [line.strip() for line in fin]

words_meta = {'title':'Word Lists',
              'subcorpora': {'Unix Words':{'source':'http://en.wikipedia.org/wiki/Words_(Unix)'},
                           'Ogden Basic English': {'title': 'The ABC of Basic English',
                                                   'author':'C.K. Ogden (1932)'}
                          }
            }

unix_words = pd.DataFrame({'text':en_words})
ogden_words = pd.DataFrame({'text':basic_en_words})

unix_words.to_csv(new_nltk_data + '/corpora/words/unix_words.tsv', sep='\t', index=False)
ogden_words.to_csv(new_nltk_data + '/corpora/words/ogden_words.tsv', sep='\t', index=False)

# keep_default_na=False: with the default NA handling, read_csv turns
# dictionary entries such as "null" into NaN, silently corrupting the list.
unix_words = pd.read_csv(new_nltk_data + '/corpora/words/unix_words.tsv', sep='\t',
                         dtype={'text':str}, keep_default_na=False)
ogden_words = pd.read_csv(new_nltk_data + '/corpora/words/ogden_words.tsv', sep='\t',
                          dtype={'text':str}, keep_default_na=False)

with open(new_nltk_data+'/corpora/words/words-meta.json', 'w') as fout:
    json.dump(words_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

# Movie Review

In [107]:

# Movie reviews: one row per line of every review file, tagged with the
# polarity folder it came from.
directory = new_nltk_data+'/corpora/movie_reviews/'
if not os.path.exists(directory):
    os.makedirs(directory)

rows = []

# 'pos' is processed before 'neg', files in sorted order within each, so
# the row order matches the two separate loops this replaces.
for tag in ['pos', 'neg']:
    tag_dir = old_nltk_data+'/corpora/movie_reviews/'+tag+'/'
    for filename in sorted(os.listdir(tag_dir)):
        # File names look like "<cv_tag>_<html_id>.txt"; the digits after the
        # two-letter prefix of cv_tag, floor-divided by 100, give the fold.
        fold, html_id = filename[:-4].split('_')
        fold_id = int(fold[2:]) // 100

        with open(tag_dir+filename) as fin:
            for sent_id, line in enumerate(fin):
                rows.append({'fold_id':fold_id, 
                             'cv_tag':fold, 
                             'html_id':html_id, 
                             'sent_id':sent_id, 
                             'text':line.strip(),
                             'tag':tag
                            })

# NOTE: the variable name keeps its original spelling ("reivews") because
# other cells may refer to it.
df_movie_reivews = pd.DataFrame(rows)[['fold_id', 'cv_tag', 'html_id', 'sent_id', 'text', 'tag']]

df_movie_reivews.to_csv(new_nltk_data + '/corpora/movie_reviews/movie_review.tsv', sep='\t', index=False)
df_movie_reivews = pd.read_csv(new_nltk_data + '/corpora/movie_reviews/movie_review.tsv', sep='\t', 
                     dtype={'fold_id':int, 'cv_tag':str, 'html_id':str, 'sent_id':int,
                            'text':str, 'tag':str})


mr_bibtext = """@InProceedings{Pang+Lee:04a,
  author =       {Bo Pang and Lillian Lee},
  title =        {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle =    "Proceedings of the ACL",
  year =         2004
}"""



mr_meta = {'title': 'Sentiment Polarity Dataset Version 2.0',
           'aka': 'Movie Review Data',
           'source': 'http://www.cs.cornell.edu/people/pabo/movie-review-data/',
           'authors': 'Bo Pang and Lillian Lee',
           'license': 'Distributed with NLTK with permission from the authors.',
            'bibtex':mr_bibtext}

with open(new_nltk_data+'/corpora/movie_reviews/movie_reviews-meta.json', 'w') as fout:
    json.dump(mr_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

# Webtext

In [113]:
# Webtext: one row per line of each .txt file; 'domain' is the file name
# without its extension.
directory = new_nltk_data+'/corpora/webtext/'
if not os.path.exists(directory):
    os.makedirs(directory)

rows = []
for filename in sorted(os.listdir(old_nltk_data+'/corpora/webtext/')):
    if not filename.endswith('.txt'):
        continue

    subcorp = filename.split('.')[0]
    with open(old_nltk_data+'/corpora/webtext/' + filename, encoding='latin-1') as fin:
        for line in fin:
            rows.append({'text': line.strip(), 'domain': subcorp})
df_webtext = pd.DataFrame(rows)[['text', 'domain']]


df_webtext.to_csv(new_nltk_data +'/corpora/webtext/webtext.tsv', sep='\t', index=False)
df_webtext = pd.read_csv(new_nltk_data + '/corpora/webtext/webtext.tsv', sep='\t', 
                     dtype={'text':str, 'domain':str})


# Keys of 'subcorpora' must equal the 'domain' values produced above, i.e.
# the source file names without '.txt'.
webtext_meta = {'title':'Web Text Corpus',
                   'description': str("This is a collection of diverse, contemporary text genres, "
                                      "collected by scraping publicly accessible archives of web postings. "
                                      "This data is disseminated in preference to publishing URLs for "
                                       "individuals to download and clean up (the usual model for web corpora)."),
                   
                    'subcorpora': {'firefox': {'original_file': 'firefox.txt', 
                                               'description': 'Firefox support forum'},
                                   'overheard': {'original_file': 'overheard.txt', 
                                               'description': 'Overheard in New York (partly censored)', 
                                                'source': 'http://www.overheardinnewyork.com/', 
                                                'year': '2006'},
                                   # The NLTK file is pirates.txt (plural). The source URL used to
                                   # duplicate the 'overheard' one.
                                   # NOTE(review): URL taken from the corpus README as recalled — confirm.
                                   'pirates': {'original_file': 'pirates.txt', 
                                               'description': "Movie script from Pirates of the Caribbean: Dead Man's Chest",
                                                'source': 'http://www.imsdb.com/', 
                                                'year': '2006'},
                                   'grail': {'original_file': 'grail.txt', 
                                               'description': 'Movie script from Monty Python and the Holy Grail',
                                                'source': 'http://www.textfiles.com/media/SCRIPTS/grail', 
                                                'year': '2006'},
                                   'singles': {'original_file': 'singles.txt', 
                                               'description': 'Singles ads',
                                                'source': 'http://search.classifieds.news.com.au/',},
                                   'wine': {'original_file': 'wine.txt', 
                                               'description': 'Fine Wine Diary',
                                                'source': 'http://www.finewinediary.com/', 
                                                'year': '2005-6'},
                                  }
                    }

with open(new_nltk_data+'/corpora/webtext/webtext-meta.json', 'w') as fout:
    json.dump(webtext_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

# Names

In [116]:
# Names corpus: every line of each .txt list becomes a (text, gender) row,
# where gender is the list's file name without the extension.
directory = new_nltk_data+'/corpora/names/'
if not os.path.exists(directory):
    os.makedirs(directory)


rows =[]
for filename in sorted(os.listdir(old_nltk_data+'/corpora/names/')):
    if filename.endswith('.txt'):
        label = filename.split('.')[0]
        with open(old_nltk_data+'/corpora/names/' + filename) as fin:
            rows.extend({'text': line.strip(), 'gender': label} for line in fin)

# Fix the column order, write the TSV, then reload it from disk.
names_tsv = new_nltk_data+'/corpora/names/names.tsv'
df_names = pd.DataFrame(rows)[['text', 'gender']]
df_names.to_csv(names_tsv, sep='\t', index=False)
df_names = pd.read_csv(names_tsv, sep='\t', 
                     dtype={'text':str, 'gender':str})

# Original README text, stored verbatim in the metadata.
names_readme = """Names Corpus, Version 1.3 (1994-03-29)
Copyright (C) 1991 Mark Kantrowitz
Additions by Bill Ross

This corpus contains 5001 female names and 2943 male names, sorted
alphabetically, one per line.

You may use the lists of names for any purpose, so long as credit is
given in any published work. You may also redistribute the list if you
provide the recipients with a copy of this README file. The lists are
not in the public domain (I retain the copyright on the lists) but are
freely redistributable.  If you have any additions to the lists of
names, I would appreciate receiving them.

Mark Kantrowitz <mkant+@cs.cmu.edu>
http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/"""

names_meta = {
    'title': 'Names Corpus',
    'version': '1.3 (1994-03-29)',
    'description': 'This corpus contains 5001 female names and 2943 male names',
    'authors': 'Mark Kantrowitz, Bill Ross',
    'license': 'Copyright (C) 1991 Mark Kantrowitz',
    'source': 'http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/',
    'readme': names_readme,
}


with open(new_nltk_data+'/corpora/names/names-meta.json', 'w') as fout:
    json.dump(names_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

# State of the Union

```python
import os

import requests
from bs4 import BeautifulSoup

# Scrape every State of the Union address linked from the index page and
# store one file per address under sotu/ as "<lastname>-<year>".
all_sotu = {}

text_url = 'http://stateoftheunion.onetwothree.net/texts/'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# NOTE(review): no parser is passed to BeautifulSoup, so the result depends
# on which parser is installed; pin one once the expected output is known.
for li in BeautifulSoup(requests.get(text_url + 'index.html').content).find_all('li'):
    # Skip list items without a usable link instead of raising on them.
    anchor = li.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    sotu = text_url + anchor['href']
    # Address pages have an all-digit file stem; anything else is skipped.
    if not sotu.split('/')[-1].split('.')[0].isdigit():
        continue

    year = anchor.text.split(', ')[-1]
    if year in all_sotu:
        continue

    soup = BeautifulSoup(requests.get(sotu, headers=headers).content)
    name = soup.find('h2').text
    date = soup.find('h3').text
    # The year is re-derived from the page's own date heading.
    year = date.split(', ')[1]
    print(year, end=', ')
    texts = [str(p) for p in soup.find_all('p')]
    all_sotu[year] = {'year':year, 'date':date, 'name':name, 'texts':'\n\n'.join(texts).strip()}

# The output directory must exist before the per-address files are written.
os.makedirs('sotu', exist_ok=True)
for year in all_sotu:
    lastname = all_sotu[year]['name'].split()[-1]
    with open(f'sotu/{lastname}-{year}', 'w') as fout:
        print(all_sotu[year]['texts'].strip(), file=fout)
```

In [12]:
# State of the Union: load the scraped table (rows: date / name / texts,
# one column per year), strip markup from the texts and write the result.
directory = new_nltk_data+'/corpora/state_union/'
if not os.path.exists(directory):
    os.makedirs(directory)


df_sotu = pd.read_csv('stateunion.tsv', sep='\t', 
                     dtype={'texts':str, 'date':str, 'name':str},
                     index_col=0)


def _clean_sotu_text(raw):
    """Remove markup tags from one address and normalise its whitespace.

    Paragraphs (separated by blank lines) are preserved; within each
    paragraph, line breaks become spaces and `deduplicate` is applied with
    ' ' as its second argument.
    """
    paragraphs = re.sub('<[^<]+?>', '', raw).strip().split('\n\n')
    return '\n\n'.join([deduplicate(' '.join(para.split('\n')), ' ')
                        for para in paragraphs])


# Assign through .loc on df_sotu itself. The previous `df_sotu.T['texts'] = ...`
# wrote to a temporary transposed frame, so whether the cleaned text reached
# df_sotu depended on pandas internals.
df_sotu.loc['texts'] = [_clean_sotu_text(raw) for raw in df_sotu.loc['texts']]

sotu_meta = {'title': 'State of the Union: Addresses', 
             'author': 'Brad Borevitz', 
             'source': 'http://stateoftheunion.onetwothree.net/texts/',
             'year': '1790-2018'
            }

with open(new_nltk_data+'/corpora/state_union/state_union-meta.json', 'w') as fout:
    json.dump(sotu_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

# NOTE(review): index=False drops the date/name/texts row labels from the
# output file — confirm that consumers rely on row order instead.
df_sotu.to_csv(new_nltk_data+'/corpora/state_union/state_union.tsv', sep='\t', index=False)


In [137]:
# Display the State of the Union frame (rows: date / name / texts; one column per year).
df_sotu

Unnamed: 0,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
date,"January 8, 1790","October 25, 1791","November 6, 1792","December 3, 1793","November 19, 1794","December 8, 1795","December 7, 1796","November 22, 1797","December 8, 1798","December 3, 1799",...,"February 24, 2009","January 27, 2010","January 25, 2011","January 24, 2012","February 12, 2013","January 28, 2014","January 20, 2015","January 12, 2016","February 28, 2017","January 30, 2018"
name,George Washington,George Washington,George Washington,George Washington,George Washington,George Washington,George Washington,John Adams,John Adams,John Adams,...,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Donald J. Trump,Donald J. Trump
texts,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Gentlemen of the Senate and Gentlemen of the H...,Gentlemen of the Senate and Gentlemen of the H...,Gentlemen of the Senate and Gentlemen of the H...,...,"Madame Speaker, Mr. Vice President, Members of...","Madame Speaker, Vice President Biden, Members ...","Mr. Speaker, Mr. Vice President, members of Co...","Mr. Speaker, Mr. Vice President, members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Thank you very much. Mr. Speaker, Mr. Vice Pre...","Mr. Speaker, Mr. Vice President, Members of Co..."


Unnamed: 0,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
date,"January 8, 1790","October 25, 1791","November 6, 1792","December 3, 1793","November 19, 1794","December 8, 1795","December 7, 1796","November 22, 1797","December 8, 1798","December 3, 1799",...,"February 24, 2009","January 27, 2010","January 25, 2011","January 24, 2012","February 12, 2013","January 28, 2014","January 20, 2015","January 12, 2016","February 28, 2017","January 30, 2018"
name,George Washington,George Washington,George Washington,George Washington,George Washington,George Washington,George Washington,John Adams,John Adams,John Adams,...,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Barack Obama,Donald J. Trump,Donald J. Trump
texts,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...,Gentlemen of the Senate and Gentlemen of the H...,Gentlemen of the Senate and Gentlemen of the H...,Gentlemen of the Senate and Gentlemen of the H...,...,"Madame Speaker, Mr. Vice President, Members of...","Madame Speaker, Vice President Biden, Members ...","Mr. Speaker, Mr. Vice President, members of Co...","Mr. Speaker, Mr. Vice President, members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Mr. Speaker, Mr. Vice President, Members of Co...","Thank you very much. Mr. Speaker, Mr. Vice Pre...","Mr. Speaker, Mr. Vice President, Members of Co..."


In [139]:
# Strip markup tags from every scraped address in sotu/ and write the
# result under the same file name in sotu-clean/.
# Create the target directory first; open(..., 'w') does not create it.
os.makedirs('sotu-clean', exist_ok=True)
for filename in os.listdir('sotu'):
    with open('sotu/'+filename) as fin, open('sotu-clean/'+filename, 'w') as fout:
        fout.write(re.sub('<[^<]+?>', '', fin.read()).strip())

City.db
=====

In [45]:
from nltk.sem.chat80 import cities2table, sql_query
from sqlite3 import OperationalError

# Build the city table; an OperationalError from sqlite is ignored on purpose.
# NOTE(review): presumably raised when the table already exists — confirm.
try:
    cities2table('cities.pl', 'city', 'city.db', verbose=True, setup=True)
except OperationalError:
    pass 

directory = new_nltk_data+'/corpora/city_database/'
if not os.path.exists(directory):
    os.makedirs(directory)

# Dump every row of city_table as "city<TAB>country<TAB>population".
with open(directory+'city.tsv', 'w') as fout:
    for city, country, population in sql_query('corpora/city_database/city.db',
                                               "SELECT * FROM city_table"):
        fout.write('\t'.join([city, country, str(population)]) + '\n')

# Placeholder metadata for the chat80 corpus; author and source are not filled in.
chat80_meta = {
    'title': 'Chat80',
    'author': '',
    'source': '',
}


directory = new_nltk_data+'/corpora/chat80/'
if not os.path.exists(directory):
    os.makedirs(directory)

chat80_dir = old_nltk_data + '/corpora/chat80/'


def _export_facts(source, target, pattern, to_row=None, keep=None):
    """Copy the Prolog facts matching `pattern` from `source` into a TSV file.

    source  -- path of the .pl file to read.
    target  -- path of the .tsv file to write (overwritten).
    pattern -- regex with at least two groups; the groups of the first match
               on a line form the row.
    to_row  -- optional callable mapping the tuple of groups to the list of
               fields to write; by default the groups are written as-is.
    keep    -- optional predicate on the raw line; lines for which it is
               false are skipped before matching.

    Lines on which the pattern does not match are skipped.
    """
    with open(source) as fin, open(target, 'w') as fout:
        for line in fin:
            if keep is not None and not keep(line):
                continue
            matches = re.findall(pattern, line)
            # Guard every access to matches[0]: a line may pass the prefix
            # filter and still not be a fact of the expected shape.
            if not matches:
                continue
            fields = matches[0]
            row = to_row(fields) if to_row is not None else fields
            print('\t'.join(row), end='\n', file=fout)


# contain.pl
_export_facts(chat80_dir+'contain.pl', directory+'contain.tsv',
              r'^contains0\((.*),(.*)\)\.$')

# borders.pl
_export_facts(chat80_dir+'borders.pl', directory+'borders.tsv',
              r'borders\((.*),(.*)\)\.$')

# cities.pl
_export_facts(chat80_dir+'cities.pl', directory+'cities.tsv',
              r'city\((.*),(.*),(.*)\)\.$')

# countries.pl
_export_facts(chat80_dir+'countries.pl', directory+'countries.tsv',
              r'country\((.*),(.*),(.*),(.*),(.*),(.*),(.*),(.*)\)\.$')

# rivers.pl: the bracketed list is split on ',' and written as the repr of
# the resulting Python list.
_export_facts(chat80_dir+'rivers.pl', directory+'rivers.tsv',
              r'river\((.*),\[(.*)\]\)\.$',
              to_row=lambda f: [f[0], str(f[1].split(','))])

# world1.pl is exported three times, once per relation family.
_export_facts(chat80_dir+'world1.pl', directory+'world1-circle-of-latitutde.tsv',
              r'(circle_of_latitude)\((.*),(.*)\)\.',
              to_row=lambda f: ['circle_of_latitude', f[1], f[2]],
              keep=lambda line: line.startswith('circle_of_latitude'))

_export_facts(chat80_dir+'world1.pl', directory+'world1-in-continent.tsv',
              r'in_continent\((.*)\,(.*)\)\.',
              to_row=lambda f: ['in_continent', f[0], f[1]],
              keep=lambda line: line.startswith('in_continent'))

# in_continent lines are excluded here so they are not picked up by the
# 'continent' alternative of the pattern.
_export_facts(chat80_dir+'world1.pl', directory+'world1-continent-ocean-sea.tsv',
              r'(continent|ocean|sea)\((.*)\)\.',
              keep=lambda line: not line.startswith('in_continent'))

# Dolch

In [59]:
from lazyme import find_files

# Dolch word list: one (pos, word) row per line of every non-README file,
# where pos is the name of the file the word came from.
dolch_meta = {'title':'Dolch Word List',
            # The first sentence ends with a space so the two adjacent
            # literals do not run together as "speech.These".
            'description': str(
                           "This corpus contains a list of frequently used English words, grouped according to their part of speech. "
                           "These are 220 sight words that make up most of children's reading materials."),
            'cite': 'Dolch, E. W. (1936). A basic sight vocabulary. The Elementary School Journal, 36(6), 456--460.',
           }

directory = new_nltk_data+'/corpora/dolch/'
if not os.path.exists(directory):
    os.makedirs(directory)

with open(directory+'dolch.tsv', 'w') as fout:
    print('\t'.join(['pos', 'word']), end='\n', file=fout)
    for filename in find_files(old_nltk_data+'/corpora/dolch/', '*'):
        if filename.lower().endswith('readme'):
            continue
        # basename is separator-agnostic, unlike splitting on '/'.
        pos = os.path.basename(filename)
        with open(filename) as fin:
            for line in fin:
                print('\t'.join([pos, line.strip()]), end='\n', file=fout)

with open(new_nltk_data+'/corpora/dolch/dolch-meta.json', 'w') as fout:
    json.dump(dolch_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

# Comtrans

In [3]:
from lazyme import find_files, per_chunk

# Plain-text explanation of the three-line record layout; appended to the
# corpus description below.
format_description = """The data is in giza++ format, consisting of triples of
L1, L2, and alignments, e.g.:

English-French:
Resumption of the session
Reprise de la session
0-0 1-1 2-2 3-3

German-English:
Wiederaufnahme der Sitzungsperiode
Resumption of the session
0-0 1-1 1-2 2-3

German-French:
Wiederaufnahme der Sitzungsperiode
Reprise de la session
0-0 1-1 1-2 2-3"""

# Metadata for the COMTRANS sample, stored next to the converted data.
comtrans_meta = {
    'title': 'COMTRANS Corpus Sample',
    'description': "3.3% of the COMTRANS data, distributed with permission.\n\n" + format_description,
    'authors': 'Reinhard Rapp',
    'source': 'http://www.fask.uni-mainz.de/user/rapp/comtrans/',
    'cite': '',
}

directory = new_nltk_data + '/corpora/comtrans/'
if not os.path.exists(directory):
    os.makedirs(directory)

with open(directory + 'comtrans-meta.json', 'w') as fout:
    fout.write(json.dumps(comtrans_meta, sort_keys=True, indent=4, separators=(',', ': ')))

In [26]:
# Convert the COMTRANS sample files (three lines per record: source sentence,
# target sentence, alignment) into a single TSV.
# The inputs are decoded as latin-1; the output is opened explicitly as UTF-8
# so the result does not depend on the platform's default encoding. The former
# `.encode('utf-8').decode('utf-8')` round trips returned the string unchanged
# and have been dropped.
with open(directory + 'comtrans-sample.tsv', 'w', encoding='utf-8') as fout:
    print('\t'.join(['filename', 
                     'src_lang', 'trg_lang', 'idx', 
                    'src', 'trg', 'alignment']), file=fout, end='\n')
    
    for filename in find_files(old_nltk_data + '/corpora/comtrans/', '*.txt'):
        basename = filename.split('/')[-1]
        # The file stem has three '-'-separated fields; the last two are used
        # as the source and target language labels.
        _, src, trg = basename.split('.')[0].split('-')
        with open(filename, encoding='latin-1') as fin:
            for idx, three_lines in enumerate(per_chunk(fin, n=3)):
                srcline, trgline, align = (field.strip() for field in three_lines)
                print('\t'.join([basename, 
                                 src, trg, str(idx), 
                                 srcline, trgline, align]),
                      file=fout, end='\n'
                     )


In [27]:
# Convert the full COMTRANS files (three lines per record: source sentence,
# target sentence, alignment) into a single TSV.
# The inputs are decoded as latin-1; the output is opened explicitly as UTF-8
# so the result does not depend on the platform's default encoding. The former
# `.encode('utf-8').decode('utf-8')` round trips returned the string unchanged
# and have been dropped.
with open(directory + 'comtrans-full.tsv', 'w', encoding='utf-8') as fout:
    print('\t'.join(['filename', 
                     'src_lang', 'trg_lang', 'idx', 
                    'src', 'trg', 'alignment']), file=fout, end='\n')
    
    for filename in find_files(old_nltk_data + '/corpora/comtrans-full/', '*.txt'):
        basename = filename.split('/')[-1]
        # The file stem has five '-'-separated fields; the third and fourth
        # are used as the source and target language labels.
        _, _, src, trg, _ = basename.split('.')[0].split('-')
        with open(filename, encoding='latin-1') as fin:
            for idx, three_lines in enumerate(per_chunk(fin, n=3)):
                srcline, trgline, align = (field.strip() for field in three_lines)
                print('\t'.join([basename, 
                                 src, trg, str(idx), 
                                 srcline, trgline, align]),
                      file=fout, end='\n'
                     )


Crubadan
====

In [39]:
# README text for the Crúbadán corpus, stored under the 'readme' key of
# crubadan_meta. It documents the layout of table.txt (internal code,
# ISO 639-3 code, English description) and carries the GPL notice.
crubadan_readme = """
Language Id Corpus
Kevin Scannell

This directory contains 3-gram frequencies for 449 writing systems 
gathered by the web crawler "An Crúbadán", as of 11 April 2010.
See http://borel.slu.edu/crubadan/ for more information.

The web crawler works at the level of "writing systems" vs. "languages",
so for example Serbian Cyrillic and Serbian Latin are treated
separately, as are Portuguese as spoken in Brazil vs. Portugal, etc.
The 3-gram files are named using 2- or 3-letter "writing system codes"
that were never intended to be exposed to the outside world.
We are working on establishing a mapping between our codes and
the writing systems laid out in Oliver Streiter's XNL-RDF database.  

The file table.txt lists all 449 writing systems.  The first column
contains the internal Crúbadán code, the second column contains the
ISO 639-3 code for the language represented by the writing system, and
the third column is an English language description.

Copyright 2010 Kevin P. Scannell <kscanne at gmail dot com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Metadata for the Crúbadán trigram corpus; dumped as JSON into the new
# corpus directory (created here if it does not exist yet).
crubadan_meta = {
    'title': 'Crúbadán Language Id Corpus',
    'description': (
        "This directory contains 3-gram frequencies for 449 writing systems\n"
        'gathered by the web crawler "An Crúbadán", as of 11 April 2010.\n'
        "See http://borel.slu.edu/crubadan/ for more information.\n"
    ),
    'readme': crubadan_readme,
    'authors': 'Kevin Scannell',
    'source': 'http://borel.slu.edu/crubadan',
    'cite': '',
}

directory = new_nltk_data + '/corpora/crubadan/'
if not os.path.exists(directory):
    os.makedirs(directory)

with open(directory + 'crubadan-meta.json', 'w') as fout:
    fout.write(json.dumps(crubadan_meta, sort_keys=True, indent=4, separators=(',', ': ')))
    
    
# Build the code -> ISO 639-3 lookup from table.txt and save it as a TSV.
# NOTE: according to crubadan_readme, the first column of table.txt is the
# internal Crúbadán writing-system code, not an ISO 639-2 code. The
# 'iso639-2' column label and the `iso6392_*` names are kept unchanged so the
# existing output schema and downstream lookups keep working.
language_mappings = []
iso6392_to_iso6393 = {}
# table.txt is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open(old_nltk_data+'/corpora/crubadan/table.txt', encoding='utf-8') as fin:
    for line in fin:
        line = line.strip()
        # A blank line would otherwise break the three-way unpack below.
        if not line:
            continue
        iso6392, iso6393, lang = line.split('\t')
        language_mappings.append({'iso639-2':iso6392, 'iso639-3':iso6393, 'language':lang})
        iso6392_to_iso6393[iso6392] = iso6393
# Explicit columns keep the header stable even if no rows were read.
df_lang = pd.DataFrame(language_mappings, columns=['iso639-2', 'iso639-3', 'language'])

df_lang.to_csv(new_nltk_data+'/corpora/crubadan/language_mapping.tsv', sep='\t', index=False)



# Read every "<code>-3grams.txt" file and export the counts twice:
#   * crubadan.tsv - one row per input line (lang, trigram, count)
#   * crubadan.pkl - {iso639-3 code: Counter({trigram: count})}
rows = []
trigram_counter = defaultdict(Counter)
for filename in find_files(old_nltk_data+'/corpora/crubadan/', '*3grams.txt'):
    # The file name starts with the writing-system code used as key in
    # iso6392_to_iso6393 (first column of table.txt).
    code = filename.split('/')[-1].split('-')[0]
    lang = iso6392_to_iso6393[code]
    # Opened explicitly as UTF-8 so decoding does not depend on the
    # platform's default encoding.
    with open(filename, encoding='utf-8') as fin:
        for line in fin:
            count, gram = line.strip().split(' ')
            count = int(count)
            # crubadan_readme states that several writing systems can belong
            # to one language (e.g. Serbian Cyrillic / Serbian Latin), so more
            # than one file may map to the same `lang`. Accumulate instead of
            # overwriting, so the pickled Counter equals the per-(lang,
            # trigram) sum of the TSV rows rather than a last-file-wins mix.
            trigram_counter[lang][gram] += count
            rows.append({'lang':lang, 'trigram':gram, 'count':count})
            
df_crubadan = pd.DataFrame(rows)

df_crubadan.to_csv(new_nltk_data +'/corpora/crubadan/crubadan.tsv', sep='\t', index=False)

with open(new_nltk_data +'/corpora/crubadan/crubadan.pkl', 'wb') as fout:
    pickle.dump(trigram_counter, fout)

# Machado

In [57]:
from nltk.corpus import machado
from nltk.data import LazyLoader

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

def _read_para_block(stream):
    paras = []
    for para in read_blankline_block(stream):
        paras.append(
            [
                sent.replace('\n', ' ')
                for sent in sent_tokenize_pt(para)
            ]
        )
    return paras

def machado_paras(fileid):
    """Return the paragraphs of ``fileid`` as one concatenated corpus view.

    For every ``(path, encoding, fileid)`` triple returned by
    ``machado.abspaths(fileid, True, True)``, a ``machado.CorpusView`` is
    built that reads the file with ``_read_para_block``; the views are then
    joined with ``concat``.
    """
    views = []
    for path, enc, _ in machado.abspaths(fileid, True, True):
        views.append(machado.CorpusView(path, _read_para_block, encoding=enc))
    return concat(views)

# Sentence splitter used by _read_para_block: the `tokenize` method of the
# Portuguese Punkt model, obtained through nltk.data.LazyLoader.
_punkt_pt = LazyLoader("tokenizers/punkt/portuguese.pickle")
sent_tokenize_pt = _punkt_pt.tokenize

In [69]:
# Metadata for the Machado de Assis corpus; dumped as JSON into the new
# corpus directory (created here if it does not exist yet).
# The description used to be a verbatim copy of the Crúbadán one (3-gram
# frequencies for 449 writing systems); it now describes this corpus.
machado_meta = {'title':'Machado de Assis -- Obra Completa',
            'description': str(
                            "The complete works (Obra Completa) of Machado de Assis, in Portuguese.\n"
                            "See http://machado.mec.gov.br for more information.\n"
                           ),
            'readme': machado.readme(),
            'authors': '',
            'source': 'http://machado.mec.gov.br',
            'cite': '',
           }

directory = new_nltk_data+'/corpora/machado/'
if not os.path.exists(directory):
    os.makedirs(directory)
    
with open(new_nltk_data+'/corpora/machado/machado-meta.json', 'w') as fout:
    json.dump(machado_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

In [67]:
# Write one TSV row per sentence. `genre` and `filename` are the two parts of
# each fileid; paragraphs and sentences are numbered from 0 within their
# parent.
# The file is opened explicitly as UTF-8 so non-ASCII characters in the text
# are encoded the same way on every platform instead of following the locale
# default.
with open(new_nltk_data +'/corpora/machado/machado.tsv', 'w', encoding='utf-8') as fout:
    print('\t'.join(['genre', 'filename', 'para_idx', 'sent_idx', 'sent']), end='\n', file=fout)
    for filepath in tqdm(machado.fileids()):
        genre, filename = filepath.split('/')
        for para_idx, para in enumerate(machado_paras(filepath)):
            for sent_idx, sent in enumerate(para):
                print('\t'.join(map(str, [genre, filename, para_idx, sent_idx, sent])), end='\n', file=fout)

100%|██████████| 246/246 [00:14<00:00,  4.91it/s]


# Switchboard

In [70]:
from nltk.corpus import switchboard

# Metadata for the Switchboard sample; dumped as JSON into the new corpus
# directory (created here if it does not exist yet).
# The 'source' URL previously ended with a stray ']' character.
switchboard_meta = {'title':'Switchboard Corpus Sample',
            'description': str('Derived from "TalkBank Switchboard Corpus, Version 0.1"'
                           ),
            'readme': switchboard.readme(),
            'authors': 'David Graff & Steven Bird (2000)',
            'source': 'http://www.ldc.upenn.edu/Catalog/LDC93S7.html',
            'cite': str("David Graff & Steven Bird (2000).  Many uses, many annotations for large "
                        "speech corpora: Switchboard and TDT as case studies.  Proceedings of the "
                        "Second International Conference on Language Resources and Evaluation, "
                        "pp. 427-433, Paris: European Language Resources Association, 2000. "
                        "http://arXiv.org/abs/cs/0007024"),
           }

directory = new_nltk_data+'/corpora/switchboard/'
if not os.path.exists(directory):
    os.makedirs(directory)
    
with open(new_nltk_data+'/corpora/switchboard/switchboard-meta.json', 'w') as fout:
    json.dump(switchboard_meta, fout, sort_keys=True, indent=4, separators=(',', ': '))

In [125]:
# Write one TSV row per turn: discourse index, turn index, speaker, turn id,
# the space-joined words, and the space-joined tags.
#
# Words and tags are read directly from the turn objects (list-like sequences
# of words, or of (word, tag) pairs in the tagged variant) instead of being
# recovered by regex-parsing repr(turn). The repr route depended on Python's
# string quoting/escaping rules and used invalid escape sequences in a
# non-raw f-string pattern.
# The file is opened explicitly as UTF-8 so the output encoding does not
# depend on the platform default.
# NOTE(review): unlike the other TSV exports in this notebook, this file has
# no header row; left as-is so existing consumers are not affected.
with open(new_nltk_data +'/corpora/switchboard/switchboard-sample.tsv', 'w', encoding='utf-8') as fout:
    paired_discourses = zip(switchboard.discourses(), switchboard.tagged_discourses())
    for discourse_idx, (discourse, tagged_discourse) in enumerate(paired_discourses):
        for turn_idx, (turn, tagged_turn) in enumerate(zip(discourse, tagged_discourse)):
            text = ' '.join(turn)
            # str() keeps the old rendering of a missing tag ("None").
            tags = [str(tag) for _word, tag in tagged_turn]
            print('\t'.join(map(str, [discourse_idx, turn_idx, turn.speaker, turn.id, text, ' '.join(tags)])), 
                  end='\n', file=fout)

nltk.corpus.reader.switchboard.SwitchboardTurn

[[<A.1: 'Uh/UH ,/, do/VBP you/PRP have/VB a/DT pet/NN Randy/NNP ?/.'>, <B.2: 'Uh/UH ,/, yeah/UH ,/, currently/RB we/PRP have/VBP a/DT poodle/NN ./.'>, <A.3: 'A/DT poodle/NN ,/, miniature/JJ or/CC ,/, uh/UH ,/, full/JJ size/NN ?/.'>, <B.4: "Yeah/UH ,/, uh/UH ,/, it/PRP 's/BES ,/, uh/UH miniature/JJ ./.">, <A.5: 'Uh-huh/UH ./.'>, <B.6: 'Yeah/UH ./.'>, <A.7: 'I/PRP read/VBD somewhere/RB that/IN ,/, the/DT poodles/NNS is/VBZ one/CD of/IN the/DT ,/, the/DT most/RBS intelligent/JJ dogs/NNS ,/, uh/UH ,/, around/RB ./.'>, <B.8: "Well/UH ,/, um/UH ,/, I/PRP would/MD n't/RB ,/, uh/UH ,/, I/PRP definitely/RB would/MD n't/RB dispute/VB that/IN ,/, it/PRP ,/, it/PRP 's/BES actually/RB my/PRP$ wife/NN 's/POS dog/NN ,/, uh/UH ,/, I/PRP ,/, I/PRP became/VBD part/NN owner/NN six/CD months/NNS ago/RB when/WRB we/PRP got/VBD married/VBN ,/, but/CC ,/, uh/UH ,/, it/PRP ,/, uh/UH ,/, definitely/RB responds/VBZ to/IN ,/, uh/UH ,/, to/IN authority/NN and/CC ,/, I/PRP 've/VBP had/VBN dogs/NNS in/IN the/DT pas

In [106]:
import re 
# Scratch cell: stores the string form of `turn`.
# NOTE(review): `turn` is not defined in this cell; it is whatever an earlier
# cell's loop last bound (hidden kernel state), so this fails on a fresh kernel.
s = str(turn)


'Uh , do you have a pet Randy ?'

In [81]:
# Scratch cell: list the attributes of the first element of `discourse`.
# NOTE(review): `discourse` is left over from an earlier cell's loop (hidden
# kernel state); this inspection is not needed to produce any output file.
dir(discourse[0])

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'id',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort',
 'speaker',
 'unicode_repr']

# Problem Reports

In [None]:
# Metadata stub for the Problem Report corpus.
# FIXME: description, authors and source used to be copied verbatim from the
# COMTRANS metadata ("3.3% of the COMTRANS data...", Reinhard Rapp, the
# comtrans URL), which misattributed this corpus. They are left empty, like
# the other unknown fields in this notebook, until the real values are added.
problem_reports_meta = {'title':'Problem Report Corpus',
            'description': '',
            'authors': '',
            'source': '',
            'cite': '',
           }