# Text Models

* **Name:** Aldo Barriente
* **Course:** DS 5001
* **Instructor:** Professor Rafael Alvarado

## Set up

In [14]:
data_in = './data_in'
data_out = './data_out'
data_prefix = 'zapatistas'
OHCO = ['title_name', 'section_num', 'para_num', 'sent_num', 'token_num']

In [15]:
import pandas as pd
import re

In [16]:
lines = ''
with open(f'{data_in}/{data_prefix}-ch1.txt', 'r', encoding='utf8') as chap_one:
    lines = chap_one.readlines()

## Creating dataframe

In [17]:
text = pd.DataFrame(lines, columns=['line_str'])
text.columns = ['line_str']
text.index.name = 'line_num'
text.line_str = text.line_str.str.strip()

## Extracting title and removing cruft 

In [18]:
begin = text.line_str.str.match(r"Chapter \d+:")
end = text.line_str.str.match(r"LANIC Home | Zapatistas!")

In [19]:
begin_index = text.loc[begin].index[0]
end_index = text.loc[end].index[0]

In [20]:
text = text.loc[begin_index - 1 : end_index - 1]

In [21]:
title = text.loc[begin_index].line_str

In [22]:
title = re.sub(r'Chapter \d+:\s', '', title).strip()

## Chunking and group by title

In [23]:
text[OHCO[:1]] = title

In [24]:
text.loc[100]

line_str      Chapter 1: The Revolt
title_name               The Revolt
Name: 100, dtype: object

## Chunking by section

In [39]:
section_lines = text.line_str.str.match(r"$")

In [40]:
text.loc[section_lines]

Unnamed: 0_level_0,line_str,title_name
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
99,,The Revolt
101,,The Revolt
206,,The Revolt
248,,The Revolt
249,,The Revolt
251,,The Revolt
285,,The Revolt
309,,The Revolt
332,,The Revolt
351,,The Revolt


In [41]:
section_num = [i+1 for i in range(text.loc[section_lines].shape[0])]

In [42]:
text.loc[section_lines, 'section_num'] = section_num

In [44]:
text.loc[section_lines]

Unnamed: 0_level_0,line_str,title_name,section_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
99,,The Revolt,1.0
101,,The Revolt,2.0
206,,The Revolt,3.0
248,,The Revolt,4.0
249,,The Revolt,5.0
251,,The Revolt,6.0
285,,The Revolt,7.0
309,,The Revolt,8.0
332,,The Revolt,9.0
351,,The Revolt,10.0


In [45]:
text.section_num = text.section_num.ffill()

In [47]:
text.loc[240:260]

Unnamed: 0_level_0,line_str,title_name,section_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
240,against our class enemies.,The Revolt,3.0
241,In this first issue we present our Declaration...,The Revolt,3.0
242,"Army, and we publish the orders to be followed...",The Revolt,3.0
243,of the EZLN in our advance through national te...,The Revolt,3.0
244,are the Revolutionary Laws that will be enacte...,The Revolt,3.0
245,in order to guarantee their revolutionary cont...,The Revolt,3.0
246,so we can begin the process of building a new ...,The Revolt,3.0
247,To live for our country or die for freedom.,The Revolt,3.0
248,,The Revolt,4.0
249,,The Revolt,5.0


### Section number clean up

In [48]:
text = text.loc[~text.section_num.isna()]
text = text.loc[~section_lines]
text.section_num = text.section_num.astype('int')

## Grouping lines by title and section number

In [52]:
text_section = text.groupby(OHCO[:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame()
text_section['line_str'] = text_section.line_str.str.strip()

In [53]:
text_section.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,line_str
title_name,section_num,Unnamed: 2_level_1
The Revolt,1,Chapter 1: The Revolt
The Revolt,2,[The Mexican Awakener [El Despertador Mexicano...
The Revolt,3,"Editorial\nMexicans: workers, campesinos, stud..."
The Revolt,5,Revolutionary Laws
The Revolt,6,Women's Revolutionary Law\nIn the just fight f...
The Revolt,7,Urban Reform Law\nIn the urban zones controlle...
The Revolt,8,Labor Law: Additions to the Present Law\nThe f...
The Revolt,9,Industry and Commerce Law\nFirst: The prices o...
The Revolt,10,Social Security Law\nFirst: Abandoned children...
The Revolt,11,Justice Law\nFirst: All prisoners in all priso...


## Splitting into paragraphs

In [57]:
text_para = text_section['line_str'].str.split(r'\n\n+', expand=True).stack().to_frame().rename(columns={0:'para_str'})
text_para.index.names = OHCO[:3]

In [58]:
text_para.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
title_name,section_num,para_num,Unnamed: 3_level_1
The Revolt,1,0,Chapter 1: The Revolt
The Revolt,2,0,[The Mexican Awakener [El Despertador Mexicano...
The Revolt,3,0,"Editorial\nMexicans: workers, campesinos, stud..."
The Revolt,5,0,Revolutionary Laws
The Revolt,6,0,Women's Revolutionary Law\nIn the just fight f...


In [59]:
text_para['para_str'] = text_para['para_str'].str.replace(r'\n', ' ').str.strip()
text_para = text_para[~text_para['para_str'].str.match(r'^\s*$')]

In [60]:
text_para.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
title_name,section_num,para_num,Unnamed: 3_level_1
The Revolt,1,0,Chapter 1: The Revolt
The Revolt,2,0,[The Mexican Awakener [El Despertador Mexicano...
The Revolt,3,0,"Editorial Mexicans: workers, campesinos, stude..."
The Revolt,5,0,Revolutionary Laws
The Revolt,6,0,Women's Revolutionary Law In the just fight fo...
The Revolt,7,0,Urban Reform Law In the urban zones controlled...
The Revolt,8,0,Labor Law: Additions to the Present Law The fo...
The Revolt,9,0,Industry and Commerce Law First: The prices of...
The Revolt,10,0,Social Security Law First: Abandoned children ...
The Revolt,11,0,Justice Law First: All prisoners in all prison...


## Splitting into sentences

In [61]:
sent_pat = r'[.?!;:"]+'
text_sent = text_para['para_str'].str.split(sent_pat, expand=True)\
    .stack().to_frame().rename(columns={0:'sent_str'})
text_sent.index.names = OHCO[:4]
text_sent = text_sent[~text_sent['sent_str'].str.match(r'^\s*$')]

In [62]:
text_sent.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
title_name,section_num,para_num,sent_num,Unnamed: 4_level_1
The Revolt,1,0,0,Chapter 1
The Revolt,1,0,1,The Revolt
The Revolt,2,0,0,[The Mexican Awakener [El Despertador Mexicano...
The Revolt,2,0,1,It was the first document released by the Zap...
The Revolt,2,0,2,"It contains their Declaration of War, an edit..."
The Revolt,2,0,3,] El Despertador Mexicano Declaration of War [...
The Revolt,2,0,4,TO THE PEOPLE OF MEXICO
The Revolt,2,0,5,MEXICAN BROTHERS AND SISTERS
The Revolt,2,0,6,We are the product of 500 years of struggle
The Revolt,2,0,7,"first against slavery, then during the War of..."


## Splitting into tokens

In [63]:
token_pat = r"[\s',-]+"
text_token = text_sent['sent_str'].str.split(token_pat, expand=True)\
    .stack().to_frame().rename(columns={0:'token_str'})
text_token.index.names = OHCO[:5]

In [64]:
text_token.sample(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
title_name,section_num,para_num,sent_num,token_num,Unnamed: 5_level_1
The Revolt,2,0,18,32,Article
The Revolt,14,0,2,19,of
The Revolt,2,0,11,10,our
The Revolt,2,0,10,4,work
The Revolt,11,0,1,18,of
The Revolt,12,0,29,17,product
The Revolt,19,0,215,25,you
The Revolt,12,0,5,8,agro/livestock
The Revolt,3,0,4,2,hundreds
The Revolt,15,0,30,20,goods
