# Cleaning text collected from Wikipedia

In [1]:
import pandas as pd
from _datetime import date
import nltk.tokenize as nt
import os
import re
# Update: use spaCy to improve sentence segmentation
# import the spaCy library
import spacy
# load the core English model
nlp = spacy.load("en_core_web_sm")

### 1. Cleaning text

In [3]:
# Collect the name of every entry in the 'text_dataset' folder, skipping
# hidden entries (names that start with a dot).
files_list = list(filter(lambda name: not name.startswith('.'), os.listdir('text_dataset')))
# Wrap the names in a single-column dataframe.
df_files = pd.DataFrame(data=files_list, columns=['file_name'])
# Debugging aid: uncomment to restrict the run to a single biography.
# df_files = df_files.query("file_name=='10085.txt'")
# Persist the list of file names without the dataframe index.
df_files.to_csv('totalBiographies.csv', index=False)

# Summary and first rows, shown as cell output.
df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37155 entries, 0 to 37154
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  37155 non-null  object
dtypes: object(1)
memory usage: 290.4+ KB


Unnamed: 0,file_name
0,10002116.txt
1,1000228.txt
2,10004137.txt
3,1000522.txt
4,1000539.txt


In [5]:
def read_text(filename,folder):
    """Read a text file and return its lines.

    Args:
        filename: name of the file to read.
        folder: prefix joined to ``filename`` by plain string concatenation,
            so it must already end with a path separator
            (e.g. ``"text_dataset/"``).

    Returns:
        A list with one string per line, each keeping its line ending.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even when reading fails; the
    # handle used to be left open, leaking one descriptor per file read.
    with open(folder + filename, "r") as file_object:
        return file_object.readlines()

# save the file after cleaning
def writeTextFile(directory, filename, content):
    """Write ``content`` to ``directory + filename``, replacing any existing file.

    Args:
        directory: prefix joined to ``filename`` by plain string
            concatenation, so it must already end with a path separator
            (e.g. ``"cleanText/"``).
        filename: name of the file to create.
        content: iterable of strings written back to back; no line endings
            are added.

    Raises:
        OSError: if the file cannot be opened for writing.
    """
    # The context manager guarantees the handle is flushed and closed even
    # if writing raises part-way through.
    with open(directory + filename, "w") as f:
        f.writelines(content)

# Clean the lines of a biography: drop blank lines and everything from the
# "== See also ==" heading onwards.
def cleanText(text):
    """Return the cleaned lines of ``text``.

    Args:
        text: iterable of strings (the lines of one biography).

    Returns:
        A list of the non-blank lines that come before the first line
        containing "== See also ==". In each kept line a space is inserted
        wherever a period is directly followed by a capital letter (a
        sentence boundary with the space missing).
    """
    kept = []
    for raw in text:
        # Everything from the "See also" heading onwards is discarded.
        if "== See also ==" in raw:
            return kept
        # Skip lines that hold nothing but whitespace.
        if raw.strip() == "":
            continue
        spaced = re.sub(r'([^\.]\.)([A-Z][^\.,;:])', r'\1 \2', raw)
        kept.append(spaced)
    return kept

def identifySection(txt):
    '''
    Classify a paragraph by the heading marker it starts with.

    args: txt, the paragraph to identify
    return: 2 if txt starts with "===", 1 if it starts with "==" (but not
            "==="), 0 otherwise
    '''
    # Test the longer marker first: every "===" prefix also starts with "==".
    if txt.startswith("==="):
        return 2
    if txt.startswith("=="):
        return 1
    return 0

In [9]:
df = pd.DataFrame()

# Stream the list of biography file names in chunks of 1000 rows and write a
# cleaned copy of every file into 'cleanText/'.
for chunk in pd.read_csv('totalBiographies.csv', chunksize=1000):
    # Keep only the file-name column of the current chunk.
    df_file_name = chunk[['file_name']].copy()
    for file_name in df_file_name['file_name']:
        # NOTE: no check is made for an already-cleaned file; an existing
        # output is simply overwritten.
        raw_lines = read_text(file_name, "text_dataset/")
        # Drop blank lines and everything from "== See also ==" onwards,
        # then save the result under the same file name.
        writeTextFile('cleanText/', file_name, cleanText(raw_lines))

### 2. Adding index

In [11]:
def identifySection(txt):
    '''
    Classify a paragraph by the heading marker it starts with.

    args: txt, the paragraph to identify
    return: 2 if txt starts with "===", 1 if it starts with "==" (but not
            "==="), 0 otherwise
    '''
    # Test the longer marker first: every "===" prefix also starts with "==".
    if txt.startswith("==="):
        return 2
    if txt.startswith("=="):
        return 1
    return 0

# Label every paragraph with the section (and subsection) it belongs to.
def setSection(biography_df):
    """Return one section label per row of ``biography_df``.

    Args:
        biography_df: dataframe with a 'paragraph' column of strings.

    Returns:
        A list of labels in row order:
        - "N/A" for a non-heading paragraph seen before any level-1 heading;
        - the heading text (newlines removed) for a level-1 heading;
        - "<last level-1 heading> | <heading text>" for a level-2 heading;
        - otherwise the label of the previous row is carried forward.
    """
    labels = []
    current_heading = ""   # text of the most recent level-1 heading
    label = ""             # label that applies to the current row
    for paragraph in biography_df['paragraph']:
        level = identifySection(paragraph)
        if level == 1:
            # A new top-level section starts here.
            current_heading = paragraph.replace('\n', '')
            label = current_heading
        elif level == 2:
            # A subsection is reported together with its parent section.
            label = current_heading + " | " + paragraph.replace('\n', '')
        elif level == 0 and current_heading == "":
            # Body text before the first section heading.
            label = "N/A"
        labels.append(label)
    return labels
    # row subsection

In [12]:
# Structure: a biography is a collection of sections, a section is a
# collection of paragraphs and a paragraph is a collection of sentences:
# Biography <- Section(s) <- Paragraph(s) <- Sentence(s)
#
# For every biography listed in 'totalBiographies.csv' this cell writes:
#   indexedParagraphs/<name>.csv - one row per paragraph with its index and section
#   indexedSentences/<name>.csv  - one row per sentence with its sentence index,
#                                  paragraph index, section and wikiId
for chunk in pd.read_csv('totalBiographies.csv', chunksize=500):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    for text_file_name in df_file_name.itertuples():
        # File name with ".txt" removed; used as the wikiId value and as the
        # base name of both output files.
        wiki_id = text_file_name.file_name.replace(".txt","")

        # read the cleaned text, one paragraph per line
        text = read_text(text_file_name.file_name,"cleanText/")
        biography_df = pd.DataFrame({'paragraph':text})
        # index number
        biography_df['paragraphIndex'] = list(range(len(biography_df)))

        section_list = setSection(biography_df)
        # section name
        biography_df['section'] = section_list
        # save information
        biography_df.to_csv('indexedParagraphs/'+wiki_id+'.csv',index=False)

        # One small dataframe per paragraph, concatenated once at the end.
        # DataFrame.append was removed in pandas 2.0 and re-copied the whole
        # accumulated frame on every call.
        sentence_frames = []
        # for each paragraph in the biography
        for paragraph_row in biography_df.itertuples():
            # divide the paragraph into sentences with spaCy
            ss = nlp(paragraph_row.paragraph.strip())
            nlp_text = [sent.text.strip() for sent in ss.sents]
            biography_df_per_sent = pd.DataFrame({'sentences':nlp_text})
            # add an index for sentences
            biography_df_per_sent['sentenceIndex'] = list(range(len(biography_df_per_sent)))
            biography_df_per_sent['paragraphIndex'] = paragraph_row.paragraphIndex
            biography_df_per_sent['section'] = paragraph_row.section
            biography_df_per_sent['wikiId'] = wiki_id

            sentence_frames.append(biography_df_per_sent)

        # pd.concat raises on an empty list, so a biography without any
        # paragraph still yields an (empty) dataframe and output file.
        if sentence_frames:
            df_coded_result = pd.concat(sentence_frames)
        else:
            df_coded_result = pd.DataFrame()

        df_coded_result.to_csv('indexedSentences/'+wiki_id+'.csv',index=False)