# F2 - NLP
* **Name:** Aldo Barriente
* **Course:** DS 5001
* **Instructor:** Professor Rafael Alvarado

## Set up

In [34]:
data_in = './data_in'
data_out = './data_out'

In [35]:
OHCO = ['text_id', 'section_num', 'para_num', 'sent_num', 'token_num']

In [36]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk

## Inspecting texts

In [37]:
caps = "[A-Z';, -]+"
sec_pats = {
    1: {
        'start_line': 100, 
        'end_line': 1265,
        'section': re.compile('Section \d+'),
    },
    2: {
        'start_line': 100,
        'end_line': 1267,
        'section': re.compile('Section \d+'),
    },
    3: {
        'start_line': 100,
        'end_line': 1632,
        'section': re.compile('Section \d+'),
    },
    4: {
        'start_line': 100,
        'end_line': 1043,
        'section': re.compile('Section \d+'),
    },
    5: {
        'start_line': 100,
        'end_line': 1578,
        'section': re.compile('Section \d+'),
    },
    6: {
        'start_line': 100,
        'end_line': 831,
        'section': re.compile('Section \d+'),
    },
    7: {
        'start_line': 100,
        'end_line': 1576,
        'section': re.compile('Section \d+'),
    },
    8: {
        'start_line': 100,
        'end_line': 1870,
        'section': re.compile('Section \d+'),
    },
    9: {
        'start_line': 100,
        'end_line': 1000,
        'section': re.compile('Section \d+'),
    },
    10: {
        'start_line': 100,
        'end_line': 915,
        'section': re.compile('Section \d+'),
    },
    11: {
        'start_line': 100,
        'end_line': 2074,
        'section': re.compile('Section \d+'),
    },
    12: {
        'start_line': 100,
        'end_line': 1235,
        'section': re.compile('Section \d+'),
    },
}

## Registering and chunking

In [38]:
def acquire_epubs(epub_list, sec_pats, OHCO=OHCO):
    
    my_lib = []
    my_doc = []

    for epub_file in epub_list:
        
        # Get PG ID from filename
        text_id = int(epub_file.split('-')[1].split('.')[0][2:])
        print("TEXT ID", text_id)
        
        # Import file as lines
        lines = open(epub_file, 'r', encoding='utf-8-sig').readlines()
        df = pd.DataFrame(lines, columns=['line_str'])
        df.index.name = 'line_num'
        df.line_str = df.line_str.str.strip()
        df['text_id'] = text_id
        
        # FIX CHARACTERS TO IMPROVE TOKENIZATION
        df.line_str = df.line_str.str.replace('—', ' — ')
        df.line_str = df.line_str.str.replace('-', ' - ')
        
        # Get book title and put into LIB table -- note problems, though
        text_title = df.loc[sec_pats[text_id]['start_line']].line_str
        text_title = re.sub(r"Chapter \d+:\s", "", text_title).strip()
        
        # Remove cruft
        a = sec_pats[text_id]['start_line'] - 1
        b = sec_pats[text_id]['end_line'] + 1
        df = df.iloc[a:b]
        
        # Chunk by chapter
        section_lines = df.line_str.str.match(sec_pats[text_id]['section'])
        section_nums = [i+1 for i in range(df.loc[section_lines].shape[0])]
        df.loc[section_lines, 'section_num'] = section_nums
        df.section_num = df.section_num.ffill()

        # Clean up
#         df = df[~df.chap_num.isna()] 
        df = df.dropna(subset=['section_num']) # Remove everything before Chapter 1
        df = df.loc[~section_lines] # Remove chapter heading lines
        df['section_num'] = df['section_num'].astype('int')
        
        # Group -- Note that we exclude the book level in the OHCO at this point
        df = df.groupby(OHCO[1:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string
        
        # Split into paragrpahs
        df = df['line_str'].str.split(r'\n\n+', expand=True).stack().to_frame().rename(columns={0:'para_str'})
        df.index.names = OHCO[1:3] # MAY NOT BE NECESSARY UNTIL THE END
        df['para_str'] = df['para_str'].str.replace(r'\n', ' ').str.strip()
        df = df[~df['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs
        
        # Set index
        df['text_id'] = text_id
        df = df.reset_index().set_index(OHCO[:3])

        # Register
        my_lib.append((text_id, text_title, epub_file))
        my_doc.append(df)

    docs = pd.concat(my_doc)
    library = pd.DataFrame(my_lib, columns=['text_id', 'text_title', 'text_file']).set_index('text_id')
    return library, docs

In [39]:
epubs = [epub for epub in sorted(glob(f'{data_in}//*.txt'))]

In [40]:
LIB, DOC = acquire_epubs(epubs, sec_pats)

TEXT ID 1
TEXT ID 10
TEXT ID 11
TEXT ID 12
TEXT ID 2
TEXT ID 3
TEXT ID 4
TEXT ID 5
TEXT ID 6
TEXT ID 7
TEXT ID 8
TEXT ID 9


In [41]:
LIB

Unnamed: 0_level_0,text_title,text_file
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Revolt,./data_in\zapatistas-ch1.txt
10,Returning to the Communities,./data_in\zapatistas-ch10.txt
11,The Consultations,./data_in\zapatistas-ch11.txt
12,Rejecting the Government's Offer,./data_in\zapatistas-ch12.txt
2,The First Days,./data_in\zapatistas-ch2.txt
3,The Cease - Fire,./data_in\zapatistas-ch3.txt
4,Solidarity,./data_in\zapatistas-ch4.txt
5,Broadening the Struggle,./data_in\zapatistas-ch5.txt
6,Building Ties,./data_in\zapatistas-ch6.txt
7,Before the Dialogue,./data_in\zapatistas-ch7.txt


In [42]:
DOC.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
text_id,section_num,para_num,Unnamed: 3_level_1
1,1,0,[The Mexican Awakener [El Despertador Mexicano...
1,1,1,El Despertador Mexicano Declaration of War
1,1,2,[from a photocopy of the original]
1,1,3,"Lacandona Jungle, December 31, 1993"
1,1,4,TODAY WE SAY ENOUGH IS ENOUGH! TO THE PEOPLE O...
1,1,5,We are the product of 500 years of struggle: f...
1,1,6,"But today, we say ENOUGH IS ENOUGH. We are the..."
1,1,7,"To prevent the continuation of the above, and ..."
1,1,8,"""National Sovereignty essentially and original..."
1,1,9,"Therefore, according to our Constitution, we d..."


## TOKEN

In [43]:
def tokenize(doc_df, OHCO=OHCO, remove_pos_tuple=False, ws=False):
    
    # Paragraphs to Sentences
    df = doc_df.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    
    # Sentences to Tokens
    # Local function to pick tokenizer
    def word_tokenize(x):
        if ws:
            s = pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))
        else:
            s = pd.Series(nltk.pos_tag(nltk.word_tokenize(x)))
        return s
            
    df = df.sent_str\
        .apply(word_tokenize)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})
    
    # Grab info from tuple
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        df = df.drop('pos_tuple', 1)
    
    # Add index
    df.index.names = OHCO
    
    return df

In [44]:
TOKEN = tokenize(DOC, ws=True)

In [49]:
TOKEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
text_id,section_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0,0,"([The, JJ)",JJ,[The,the
1,1,0,0,1,"(Mexican, NNP)",NNP,Mexican,mexican
1,1,0,0,2,"(Awakener, NNP)",NNP,Awakener,awakener
1,1,0,0,3,"([El, NNP)",NNP,[El,el
1,1,0,0,4,"(Despertador, NNP)",NNP,Despertador,despertador
...,...,...,...,...,...,...,...,...
9,8,13,2,3,"(if, IN)",IN,if,if
9,8,13,2,4,"(you, PRP)",PRP,you,you
9,8,13,2,5,"(can., VBP)",VBP,can.,can
9,8,13,3,0,"(All, DT)",DT,All,all


## VOCAB

In [45]:
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace('[\W_]', '')

In [46]:
VOCAB = TOKEN.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'

In [47]:
VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')

In [48]:
VOCAB

Unnamed: 0_level_0,term_str,n,num
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,1033,0
1,000001,1,1
2,001,1,1
3,002,1,1
4,029,1,1
...,...,...,...
8136,zonethis,1,0
8137,zoque,2,0
8138,zócalo,5,0
8139,zócalos,1,0


In [50]:
pos_counts = TOKEN.groupby(['term_str','pos']).pos.count().unstack().fillna(0)
VOCAB = VOCAB.reset_index().set_index('term_str')
VOCAB['pos_max'] = pos_counts.idxmax(1)
VOCAB = VOCAB.reset_index().set_index('term_id')

In [51]:
VOCAB

Unnamed: 0_level_0,term_str,n,num,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,1033,0,:
1,000001,1,1,CD
2,001,1,1,CD
3,002,1,1,CD
4,029,1,1,CD
...,...,...,...,...
8136,zonethis,1,0,NN
8137,zoque,2,0,NNP
8138,zócalo,5,0,NNP
8139,zócalos,1,0,NN


In [52]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [53]:
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [54]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
VOCAB['p_stem'] = VOCAB.term_str.apply(stemmer.stem)

In [55]:
VOCAB.sample(10)

Unnamed: 0_level_0,term_str,n,num,pos_max,stop,p_stem
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2985,finding,9,0,VBG,0,find
1189,carranza,4,0,NNP,0,carranza
195,acceptance,1,0,NN,0,accept
2219,dilemma,1,0,NN,0,dilemma
7363,thousand,6,0,CD,0,thousand
6977,still,91,0,RB,0,still
5237,pact,7,0,NN,0,pact
6745,skilled,1,0,JJ,0,skill
3303,grabbing,1,0,VBG,0,grab
1170,captive,1,0,JJ,0,captiv


In [56]:
VOCAB.groupby('p_stem').size().sort_values(ascending=False).to_frame().reset_index().rename(columns={0:'count'})

Unnamed: 0,p_stem,count
0,commun,12
1,gener,9
2,initi,8
3,calcul,7
4,collect,7
...,...,...
5343,myself,1
5344,myriad,1
5345,my,1
5346,mutil,1
