## Import Libraries

In [1]:
# Libraries for parsing data
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import os

# Libraries for multiprocessing
import multiprocessing as mp
from multiprocessing import Pool

ModuleNotFoundError: No module named 'lxml'

## Load Data

In [6]:
# Set corpus to the folder of files you want to use
corpus = '/home/ec2-user/SageMaker/data/Police_or_Officer_ALL/'

# Read in files
input_files = os.listdir(corpus)

print("Loaded", len(input_files), "documents.")

Loaded 563599 documents.


## Specify Output File

Define the `output_file` variable to the desired save location and file name. This variable will be used at the end of the script to save the processed data.

In [7]:
# Modify output_file to desired save name
output_file = 'Convert_To_Dataframe_Police_all.csv'

## Check Total Cores

Check the total number of cores on your current device. The following multiprocessing portions will be using this variable.

In [8]:
# Check core count
num_cores = mp.cpu_count()
print(num_cores)

8


## Define Functions

In [9]:
# Function to strip html tags from text portion
def strip_html_tags(text):
    stripped = BeautifulSoup(text).get_text().replace('\n', ' ').replace('\\', '').strip()
    return stripped

In [10]:
# Retrieve metadata from XML document
def getxmlcontent(corpus, file, strip_html=True):
    try:
        tree = etree.parse(corpus + file)
        root = tree.getroot()

        if root.find('.//GOID') is not None:
            goid = root.find('.//GOID').text
        else:
            goid = None

        if root.find('.//Title') is not None:
            title = root.find('.//Title').text
        else:
            title = None

        if root.find('.//NumericDate') is not None:
            date = root.find('.//NumericDate').text
        else:
            date = None
            
        if root.find('.//PublisherName') is not None:
            publisher = root.find('.//PublisherName').text
        else:
            publisher = None

        if root.find('.//FullText') is not None:
            text = root.find('.//FullText').text

        elif root.find('.//HiddenText') is not None:
            text = root.find('.//HiddenText').text

        elif root.find('.//Text') is not None:
            text = root.find('.//Text').text

        else:
            text = None

        # Strip html from text portion
        if text is not None and strip_html == True:
            text = strip_html_tags(text)
    
    except Exception as e:
        print(f"Error while parsing file {file}: {e}")
    
    return goid, title, date, publisher, text

In [11]:
# Function to make lists out of parsed data--on single document scale for multiprocessing
def make_lists(file):
    
    goid, title, date, publisher, text = getxmlcontent(corpus, file, strip_html=True)
    
    return goid, title,  text, date

## Run Multiprocessing to parse XML files

In [12]:
# Test function on single document
make_lists(input_files[10003])

('1812405219',
 'May 22, 1923 (Page 9 of 24)',
 'TIIE DETROIT FREE TRESS, TUESDAY," MAY 192 3. s JLjr .Li SCHOOL PICKED AS SOCIAL UNIT Welfare Conferees to Study Classroom Effect on ! 1 1 Society, JXmlt Free Pr Bureau. m MMrupulllu Kr. lld fir Kr I\'mi rrlTHI Lease Wire. Washington, Way 21. Tuesday will ba almost a Michigan day In the Sfftslons of the National Con-lerence Con-lerence of Social Work, which has been Dieetlng in Washington sine Wednesday last. Th Tuesday program la dovoted to "th school." and tha entire subject subject la under th direction of tt. Helen T. Wooh y. of Detroit, who la tli chairman of th conference for th da. . In on of th rroup meetings, rrofcinor Ouy Montros Whipple, ot the experimental education department department In the university school of education at Ann Arbor, will rend a paper on \'School Provision tor Gifted Children In the United States," while In a eecond group James I\'lunrald. executive secretary secretary of the Huelcty or St. Vincent Lie T

In [13]:
# When using multiple processes, important to eventually close them to avoid memory/resource leaks
try:
    # Define a thread Pool to process multiple XML files simultaneously
    # Default set to num_cores - 1, but may change number of processes depending on instance
    p = Pool(processes=num_cores-1)
    
    # Apply function with Pool to corpus
    processed_lists = p.map(make_lists, input_files)

except Exception as e:
    print(f"Error in processing document: {e}")
    
finally:
    p.close()

In [14]:
# Transform processed data into a dataframe
df = pd.DataFrame(processed_lists, columns=['GOID','Title', 'Text', 'Date'])

In [15]:
# View dataframe
df.head(1)

Unnamed: 0,GOID,Title,Text,Date
0,1815088244,"January 6, 1931 (Page 21 of 26)",THE PETROTT FREE PRESS TUESDAY. JANUARY 8. Hit...,1931-01-06


## Save Dataframe as CSV

Make sure to change the `output_file` variable (defined at the top of script) to desired output file name before running this cell.

In [16]:
# Save output to file
df.to_csv(output_file)

In [31]:
random_sample = df['GOID'].sample(n=150)

In [36]:
random_sample.to_csv("random_sample_output.txt", index=False)

In [37]:
data_to_export = "random_sample_output.txt"

In [38]:
!aws s3 cp $data_to_export s3://pq-tdm-studio-results/tdm-ale-data/a2535/results/

Completed 1.6 KiB/1.6 KiB (24.3 KiB/s) with 1 file(s) remaining
upload: ./random_sample_output.txt to s3://pq-tdm-studio-results/tdm-ale-data/a2535/results/random_sample_output.txt


## Attach Training Data


In [3]:
df_output = pd.read_csv('Convert_To_Dataframe_Police_all.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [25]:
df_train = pd.read_csv('training_data.csv')

In [26]:
df_train.drop_duplicates(inplace=True)

In [28]:
df_merged = pd.merge(df_output, df_train, on='GOID', how='inner')

In [30]:
df_merged.head()

Unnamed: 0.1,Unnamed: 0,GOID,Title,Text,Date,Class
0,411,1820265981,"January 2, 1972 (Page 2 of 178)","2-A Sunday, Jan. 2, '72 DETROIT FREE PRESS 197...",1972-01-02,1
1,4088,1813612520,"May 16, 1925 (Page 10 of 22)",12 THE DETROIT FR E PRESS. SAT l J.JA . m a i ...,1925-05-16,0
2,9596,1821803197,"June 21, 1979 (Page 12 of 72)","1A ctwir vr.it F'Kc:3TNUP?DAy, ju?.t :i, 1973 ...",1979-06-21,0
3,11521,1820095901,"September 22, 1971 (Page 3 of 52)",Free Press Telephones it jfaee Today's Chuckle...,1971-09-22,1
4,13087,1822204708,"July 4, 1982 (Page 41 of 202)","DETROIT FREE PRESSSUNDAY, JULY 4, 1982 11D Riv...",1982-07-04,0


In [31]:
df_merged['Class'].value_counts()

1    76
0    65
Name: Class, dtype: int64

In [32]:
df_merged.to_csv('training_data_merged.csv')