# F2 - NLP
* **Name:** Aldo Barriente
* **Course:** DS 5001
* **Instructor:** Professor Rafael Alvarado

## Set up

In [1]:
data_in = './data_in'
data_out = './data_out'

In [15]:
OHCO = ['text_id', 'section_num', 'para_num', 'sent_num', 'token_num']

In [4]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk

## Inspecting texts

In [6]:
caps = "[A-Z';, -]+"
sec_pats = {
    1: {
        'start_line': 100, 
        'end_line': 1034,
        'section': re.compile('$'),
    },
    2: {
        'start_line': 100,
        'end_line': 1048,
        'section': re.compile('$'),
    },
    3: {
        'start_line': 100,
        'end_line': 1306,
        'section': re.compile('$'),
    },
    4: {
        'start_line': 100,
        'end_line': 875,
        'section': re.compile('$'),
    },
    5: {
        'start_line': 100,
        'end_line': 1296,
        'section': re.compile('$'),
    },
    6: {
        'start_line': 100,
        'end_line': 715,
        'section': re.compile('$'),
    },
    7: {
        'start_line': 100,
        'end_line': 1257,
        'section': re.compile('$'),
    },
    8: {
        'start_line': 100,
        'end_line': 1564,
        'section': re.compile('$'),
    },
    9: {
        'start_line': 100,
        'end_line': 841,
        'section': re.compile('$'),
    },
    10: {
        'start_line': 100,
        'end_line': 718,
        'section': re.compile('$'),
    },
    11: {
        'start_line': 100,
        'end_line': 1758,
        'section': re.compile('$'),
    },
    12: {
        'start_line': 100,
        'end_line': 1005,
        'section': re.compile('$'),
    },
}

## Registering and chunking

In [34]:
def acquire_epubs(epub_list, sec_pats, OHCO=OHCO):
    
    my_lib = []
    my_doc = []

    for epub_file in epub_list:
        
        # Get PG ID from filename
        text_id = int(epub_file.split('-')[1].split('.')[0][2:])
        print("TEXT ID", text_id)
        
        # Import file as lines
        lines = open(epub_file, 'r', encoding='utf-8-sig').readlines()
        df = pd.DataFrame(lines, columns=['line_str'])
        df.index.name = 'line_num'
        df.line_str = df.line_str.str.strip()
        df['text_id'] = text_id
        
        # FIX CHARACTERS TO IMPROVE TOKENIZATION
        df.line_str = df.line_str.str.replace('—', ' — ')
        df.line_str = df.line_str.str.replace('-', ' - ')
        
        # Get book title and put into LIB table -- note problems, though
        text_title = df.loc[sec_pats[text_id]['start_line']].line_str
        text_title = re.sub(r"Chapter \d+:\s", "", text_title).strip()
        
        # Remove cruft
        a = sec_pats[text_id]['start_line'] - 1
        b = sec_pats[text_id]['end_line'] + 1
        df = df.iloc[a:b]
        
        # Chunk by chapter
        section_lines = df.line_str.str.match(sec_pats[text_id]['section'])
        section_nums = [i+1 for i in range(df.loc[section_lines].shape[0])]
        df.loc[section_lines, 'section_num'] = section_nums
        df.section_num = df.section_num.ffill()

        # Clean up
#         df = df[~df.chap_num.isna()] 
        df = df.dropna(subset=['section_num']) # Remove everything before Chapter 1
        df = df.loc[~section_lines] # Remove chapter heading lines
        df['section_num'] = df['section_num'].astype('int')
        
        # Group -- Note that we exclude the book level in the OHCO at this point
        df = df.groupby(OHCO[1:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string
        
        # Split into paragrpahs
        df = df['line_str'].str.split(r'\n\n+', expand=True).stack().to_frame().rename(columns={0:'para_str'})
        df.index.names = OHCO[1:3] # MAY NOT BE NECESSARY UNTIL THE END
        df['para_str'] = df['para_str'].str.replace(r'\n', ' ').str.strip()
        df = df[~df['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs
        
        # Set index
        df['text_id'] = text_id
        df = df.reset_index().set_index(OHCO[:3])

        # Register
        my_lib.append((text_id, text_title, epub_file))
        my_doc.append(df)

    docs = pd.concat(my_doc)
    library = pd.DataFrame(my_lib, columns=['text_id', 'text_title', 'text_file']).set_index('text_id')
    return library, docs

In [9]:
epubs = [epub for epub in sorted(glob(f'{data_in}//*.txt'))]

In [35]:
LIB, DOC = acquire_epubs(epubs, sec_pats)

TEXT ID 1
TEXT ID 10
TEXT ID 11
TEXT ID 12
TEXT ID 2
TEXT ID 3
TEXT ID 4
TEXT ID 5
TEXT ID 6
TEXT ID 7
TEXT ID 8
TEXT ID 9


In [36]:
LIB

Unnamed: 0_level_0,text_title,text_file
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Revolt,./data_in\zapatistas-ch1.txt
10,Returning to the Communities,./data_in\zapatistas-ch10.txt
11,The Consultations,./data_in\zapatistas-ch11.txt
12,Rejecting the Government's Offer,./data_in\zapatistas-ch12.txt
2,The First Days,./data_in\zapatistas-ch2.txt
3,The Cease - Fire,./data_in\zapatistas-ch3.txt
4,Solidarity,./data_in\zapatistas-ch4.txt
5,Broadening the Struggle,./data_in\zapatistas-ch5.txt
6,Building Ties,./data_in\zapatistas-ch6.txt
7,Before the Dialogue,./data_in\zapatistas-ch7.txt


In [38]:
DOC.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
text_id,section_num,para_num,Unnamed: 3_level_1
1,1,0,Chapter 1: The Revolt
1,2,0,[The Mexican Awakener [El Despertador Mexicano...
1,3,0,"Editorial Mexicans: workers, campesinos, stude..."
1,5,0,Revolutionary Laws
1,6,0,Women's Revolutionary Law In the just fight fo...
1,7,0,Urban Reform Law In the urban zones controlled...
1,8,0,Labor Law: Additions to the Present Law The fo...
1,9,0,Industry and Commerce Law First: The prices of...
1,10,0,Social Security Law First: Abandoned children ...
1,11,0,Justice Law First: All prisoners in all prison...
