In [1]:
# Imports
import yaml # Converts long string to list
import ast # Convert stringified list to list

import os

import pandas as pd
import numpy as np
from IPython.display import display

# Webscraping
import requests as req
import time

# Extract text from different file types
import doc2text

# Text preprocessing
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import string

# Fetch the NLTK corpora needed below
for nltk_pkg in ('stopwords', 'wordnet'):
    nltk.download(nltk_pkg)

# English stop-word list used when filtering the page text
stopWords = stopwords.words("english")

# WordNet lemmatizer instance shared by lemmatize()
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Functions

file_exts = (".pdf", ".jpg", ".jpeg", ".png", ".doc", ".docx")


def extract_files(html):
    """
    Desc: Find all links that are files
    Input: HTML text as string
    Output: List of hrefs (as written in the page) whose lowercased value ends
            with one of the extensions in the module-level file_exts
    """
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pass one explicitly if results must match across machines.
    soup = BeautifulSoup(html)

    # str.endswith accepts a str or a tuple only. file_exts is rebound to a
    # list later in the notebook, so coerce it here to keep re-runs working.
    wanted = tuple(file_exts)

    # href=True already restricts the search to anchors that carry an href,
    # so no separate None check is needed.
    return [
        link['href']
        for link in soup.find_all('a', href=True)
        if link['href'].lower().endswith(wanted)
    ]


"""
Desc: Lemmatization
Input: text (string) - text to be lemmatized
Output: text (string) - lemmatized text
"""
def lemmatize(text):
    """Return text with every space-separated token passed through wn.lemmatize."""
    # Split on single spaces so runs of spaces survive the round trip unchanged
    tokens = text.split(' ')
    return ' '.join(map(wn.lemmatize, tokens))


"""
Desc: Download files from url
Input: url (string), fpath (string): filepath of downloaded content
Output: file downloaded into specified folder or error message
"""
def dl_from_url(url, fpath, timeout=30):
    """
    Download the content at url and write it to fpath.

    Best effort: a failed request or write is reported with print() and
    swallowed so that a loop over many urls keeps going.

    Input:
        url (string): address to download
        fpath (string): filepath of downloaded content
        timeout (number): seconds handed to requests as its timeout
    Output: None. The file is written to fpath, or a message is printed.

    NOTE(review): the HTTP status is not checked, so an error page returned by
    the server is saved like any other content.
    """
    try:
        # stream=True lets the body be written chunk by chunk; the with-block
        # closes the response even if writing fails part-way.
        with req.get(url, stream=True, timeout=timeout) as r:

            # Download file
            with open(fpath, "wb") as file:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)

    # Exception rather than a bare except, so KeyboardInterrupt can still stop
    # a long download loop.
    except Exception as err:
        print(url, "could not be downloaded.", repr(err))
        
        
"""
Desc: Extract text from files
Input: fpath (string): filepath of downloaded content
Output: text (string): all text from each file concatenated into a single string
"""
def extract_text(fpath):
    """
    Extract the text of one downloaded file, choosing the doc2text helper by
    file extension.

    Input: fpath (string): filepath of downloaded content
    Output: text (string): the extracted text, or "" when the extension is not
            one of the handled types (keeps the rows aligned)
    """
    # Match the extension case-insensitively, as extract_files() does, while
    # passing the original path through to the helper.
    suffix = fpath.lower()

    if suffix.endswith((".jpg", ".jpeg", ".png")):
        extracted = doc2text.img2text(fpath)

    elif suffix.endswith(".pdf"):
        extracted = doc2text.pdf2text(fpath)

    elif suffix.endswith((".doc", ".docx")):
        extracted = doc2text.docx2text(fpath)

    else: # It isn't any of the file extensions
        extracted = ""

    # join() returns a str unchanged and raises TypeError if a helper returned
    # anything else, exactly as before.
    return " ".join([extracted])

In [3]:
# Load data as a single string.
# The encoding is named explicitly so the read does not depend on the
# platform default (which is not UTF-8 on Windows).
with open('../../output/kingston_body.json', 'r', encoding='utf-8') as file:
    data = file.read().replace('\n', '')
    
# Check type of data
print(type(data))
print(len(data))

<class 'str'>
1027530579


In [4]:
# Collapse every run of two or more commas into a single comma
comma_run = re.compile(r'(,){2,}')
data = comma_run.sub(",", data)

print(len(data))

1027524580


In [5]:
# Convert the string to list
# NOTE(review): yaml.CLoader is the C build of PyYAML's full Loader, which can
# construct arbitrary Python objects from tagged input. This file holds scraped
# web content, so consider yaml.CSafeLoader (or json.loads) here; confirm the
# file still parses before switching.
data = yaml.load(data, Loader=yaml.CLoader)

# Sanity checks: number of records, container type, type of one record
print(len(data))
print(type(data))
print(type(data[0]))

10337
<class 'list'>
<class 'dict'>


In [6]:
# Convert list of dict into DataFrame (one row per dict, one column per key)
df = pd.DataFrame(data)
                  
# Display the first rows as a sanity check
df.head()

Unnamed: 0,url,status,lang,updated,text
0,https://www.cityofkingston.ca/,200,[],[],b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,https://www.cityofkingston.ca/residents/commun...,200,[],[],b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
2,https://www.cityofkingston.ca/residents/commun...,200,[],[],b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,https://www.cityofkingston.ca/residents/commun...,200,[],[],b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
4,https://www.cityofkingston.ca/residents/commun...,200,[],[],b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...


In [7]:
# Check basic information: the distinct status codes, and how many rows have
# an empty "lang" / "updated" value
print(df['status'].unique())
print(df["lang"].map(len).eq(0).sum())
print(df["updated"].map(len).eq(0).sum())

# From output, this means these columns are not useful
# Drop these columns. .copy() makes df an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[["url", "text"]].copy()

# Check
df.head()

[200]
10337
10337


Unnamed: 0,url,text
0,https://www.cityofkingston.ca/,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
2,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
4,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...


In [8]:
# For every page, collect the hrefs that point at a downloadable file
df['files'] = df['text'].apply(extract_files)

# Check
df.head()

Unnamed: 0,url,text,files
0,https://www.cityofkingston.ca/,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[]
1,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[]
2,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[]
3,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[]
4,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[]


In [9]:
# Count the urls that link to at least one file
print(df["files"].map(len).gt(0).sum())

76


In [10]:
# Compare length with strip true vs not.
# Parse the first page once and reuse the soup for both measurements.
first_page = BeautifulSoup(df.text[0])
print(len(first_page.get_text()))
print(len(first_page.get_text(strip=True)))

104233
75001


In [11]:
# Filter out any rows that do not contain base url
baseUrl = "https://www.cityofkingston.ca"

# regex=False: baseUrl is a literal string; read as a pattern, each "." would
# match any character. .copy() keeps df an independent frame for the column
# assignments made in later cells.
df = df[df['url'].str.contains(baseUrl, regex=False)].copy()

# Check
print(len(df.index))

10335


In [12]:
# Extract files

# Explode into a separate frame: one row per (page, file link).
# df itself keeps one row per page, so the page-level text cleaning and the
# final CSV do not repeat a page once per linked file.
files = df.explode('files')

# Filter for rows with files (a page with an empty list explodes to NaN)
files = files[files["files"].notna()]

# Check length
print(len(files.index))

# Drop any duplicates
files = files.drop_duplicates()

# Reset index
files = files.reset_index(drop=True)

# Check length
print(len(files.index))
display(files)

430
425


Unnamed: 0,url,text,files
0,https://www.cityofkingston.ca/resident/communi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.mcscs.jus.gov.on.ca/sites/default/...
1,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,/documents/10180/36972598/City-Council_Meeting...
2,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
3,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
4,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
...,...,...,...
420,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,/documents/26467239/38278041/Appendix O - Stor...
421,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,/documents/26467239/38278041/Appendix P - Memo...
422,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,/documents/26467239/38278041/Appendix Q - DRAF...
423,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,/documents/26467239/38278041/Appendix R - Enga...


In [13]:
# Some urls are missing base url, add it back
files["files"] = [baseUrl + x if x.startswith('/') else x for x in files["files"]]

# Remove items that do not contain baseurl (some appear to have made it past scrapy).
# regex=False matches baseUrl literally instead of as a pattern; .copy() keeps
# the assignment below from raising SettingWithCopyWarning.
files = files[files['files'].str.contains(baseUrl, regex=False)].copy()

# Lowercase all the urls
# NOTE(review): URL paths are case-sensitive, so a lowercased link may no
# longer resolve on the server. The download cells match on lowercase
# extensions, so changing this needs those cells updated too; verify whether
# the small "archived" downloads are caused by this step.
files["files"] = files["files"].str.lower()

# Get unique files
files = files.drop_duplicates(ignore_index=True)

# Check
print(len(files))
display(files)

367


Unnamed: 0,url,text,files
0,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
1,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
2,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
3,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
4,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...
...,...,...,...
362,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...
363,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...
364,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...
365,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...


In [14]:
# Get the file extensions for each url
found_exts = files["files"].str.split(".").str[-1].unique()

# Prepend "." to each item.
# Kept as a tuple: this rebinds the same file_exts name that extract_files()
# hands to str.endswith(), and endswith() rejects a list.
file_exts = tuple("." + ext for ext in found_exts)

print(list(file_exts))
# There are only pdfs and pngs on the kingston website

['.pdf', '.png']


In [15]:
# Create folders to store files and separate files by their extensions.
# os.makedirs(..., exist_ok=True) creates the folder only when it is missing,
# without a separate exists() check.
if any(x in [".jpg", ".jpeg", ".png"] for x in file_exts):

    # Image folder
    os.makedirs("./images/", exist_ok=True)

if ".pdf" in file_exts:

    # Pdf folder
    os.makedirs("./pdf/", exist_ok=True)

if any(x in [".doc", ".docx"] for x in file_exts):

    # Doc folder
    os.makedirs("./doc/", exist_ok=True)

In [16]:
# Filename stem (also the counter key) for every extension that is downloaded.
# jpg, jpeg and png share the "image" counter so image numbers never collide.
kind_by_ext = {
    ".jpg": "image", ".jpeg": "image", ".png": "image",
    ".pdf": "pdf",
    ".doc": "doc", ".docx": "doc",
}
folder_by_kind = {"image": "./images/", "pdf": "./pdf/", "doc": "./doc/"}

# Counter variables (starting with index 1)
counters = {"image": 1, "pdf": 1, "doc": 1}

# Empty list to store saved file name of the downloaded content
file_name = []

# Download all files (Note that some files have been archived by the website)
for f in files["files"]:

    # Anything with an unrecognised ending is treated as a .docx document, as
    # before; a .doc link now keeps its own extension instead of being saved
    # under a .docx name.
    ext = next((e for e in kind_by_ext if f.endswith(e)), ".docx")
    kind = kind_by_ext[ext]
    fpath = "{}{}{}{}".format(folder_by_kind[kind], kind, counters[kind], ext)

    dl_from_url(f, fpath)
    file_name.append(fpath)
    counters[kind] += 1

    # Pause between requests
    time.sleep(0.5)

# Add file_name as column on files
files["file_name"] = file_name

# Display
display(files.head())

Unnamed: 0,url,text,files,file_name
0,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf1.pdf
1,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf2.pdf
2,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image1.png
3,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image2.png
4,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image3.png


In [17]:
# Get file size.
# A download that failed before the file was opened leaves nothing on disk,
# so report 0 bytes for a missing path instead of letting getsize raise.
files["file_size"] = files["file_name"].apply(
    lambda path: os.path.getsize(path) if os.path.exists(path) else 0
)

# Display
display(files)

Unnamed: 0,url,text,files,file_name,file_size
0,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf1.pdf,107332
1,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf2.pdf,163607
2,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image1.png,238605
3,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image2.png,253707
4,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image3.png,235907
...,...,...,...,...,...
362,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf195.pdf,5554966
363,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf196.pdf,196805
364,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf197.pdf,754059
365,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf198.pdf,22500441


In [18]:
# Some PDF urls have been archived by the site; those downloads come back at
# roughly 3 KB, so the file size is used to spot and discard them.

# Rows whose saved name contains "pdf" and whose size is 3000 bytes or less
is_pdf = files["file_name"].str.contains("pdf")
is_tiny = files["file_size"] <= 3000
files = files.drop(files[is_pdf & is_tiny].index)

#NOTE: May need to do the same for documents

# Renumber the rows, then discard file_size, which is no longer needed
files = files.reset_index(drop=True).drop(columns=["file_size"])

# Display
display(files)

Unnamed: 0,url,text,files,file_name
0,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf1.pdf
1,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf2.pdf
2,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image1.png
3,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image2.png
4,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image3.png
...,...,...,...,...
250,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf194.pdf
251,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf195.pdf
252,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf196.pdf
253,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf197.pdf


In [21]:
# Extract text from files: run extract_text on every saved path and keep the
# result in a new file_text column
files["file_text"] = files["file_name"].apply(extract_text)

# TODO: Should extract images from doc and pdf, and then run extract text again

In [22]:
# Inspect the frame, now including the file_text column
files

Unnamed: 0,url,text,files,file_name,file_text
0,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf1.pdf,[ \nCity of Kingston \nReport to Council \nRep...
1,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./pdf/pdf2.pdf,[ \nCity of Kingston \nInformation Report to C...
2,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image1.png,[]
3,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image2.png,[fay\n\nool\nKole Joo Ome CSO:\n\ntox\nZa) = X...
4,https://www.cityofkingston.ca/city-hall/projec...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/10180/...,./images/image3.png,[]
...,...,...,...,...,...
250,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf194.pdf,[City of Kingston - Third Crossing of the Cata...
251,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf195.pdf,[City of Kingston - Third Crossing of the Cata...
252,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf196.pdf,[Rob Snetsinger \nEcological Services \n3803 S...
253,https://www.cityofkingston.ca/web/third-crossi...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.cityofkingston.ca/documents/264672...,./pdf/pdf197.pdf,[City of Kingston - Third Crossing of the Cata...


In [None]:
# Next steps: 

# Remove any rows with no text.
# extract_text returns "" (not NaN) when nothing was extracted, so blank
# strings are filtered as well as missing values.
has_text = files["file_text"].fillna("").str.strip().ne("")
files = files[has_text]

# Group by url and concatenate the file_text
test = files.groupby(["url"], as_index=False).agg({'file_text': ' '.join})

# Display test
display(test)
# If it works, replace test with files

# Reset index
#files = files.reset_index()

# Merge files back into df
#df = df.merge(files, how="left")

# Concatenate all text to text column
#df["text"] = df["text"] + " " + df["file_text"]

# Display
#display(df.head())

# Perform text preprocessing

In [64]:
# Text Preprocessing

# Extract all text from html string
df["clean_text"] = df["text"].apply(lambda x: BeautifulSoup(x).get_text(strip=True))
print("Text extracted")

# Lower case
df["clean_text"] = df["clean_text"].str.lower()
print("Text lowered")

# Remove unicode characters (emojis, etc.)
df["clean_text"] = df["clean_text"].str.encode('ascii', 'ignore').str.decode('utf-8')
print("Unicode characters removed")

# Remove multi-character symbols (literal \n, \r, \t and the starting b' or b")
# Done before url removal: these sequences are ordinary non-space characters,
# so the \S+ in the url pattern would otherwise run through them into the next word.
df["clean_text"] = df["clean_text"].str.replace(r'\\[nrt]', ' ', regex=True)
# Anchored to the start so that only the prefix is stripped, not every
# "b'" inside the text (e.g. in "club's").
df["clean_text"] = df["clean_text"].str.replace("^b['\"]", '', regex=True)
print("Newline, carriage return, and tab characters removed")

# Remove urls ("http*" meant "htt" plus any number of "p"; match http:// or https:// instead)
df["clean_text"] = df["clean_text"].str.replace(r'https?://\S+', ' ', regex=True)
print("Urls removed")

# Remove numeric values
df["clean_text"] = df["clean_text"].str.replace(r'[0-9]', ' ', regex=True)
print("Numbers removed")

# Reduce repeated letters (three or more of the same character become two)
df["clean_text"] = df["clean_text"].str.replace(re.compile(r"(.)\1{2,}"), r"\1\1", regex=True)
print("Reduced repeated characters")

# Remove stop words (each word escaped so it is always matched literally)
pat = r'\b(?:{})\b'.format('|'.join(map(re.escape, stopWords)))
df["clean_text"] = df["clean_text"].str.replace(pat, '', regex=True)
print("Stop-words removed")

# Remove punctuation
df["clean_text"] = df["clean_text"].str.replace('[%s]' % re.escape(string.punctuation), ' ', regex=True)
print("Punctuation removed")

# Remove stop words again (in case stop word was next to punctuation)
df["clean_text"] = df["clean_text"].str.replace(pat, '', regex=True)
print("Stop-words removed again, in case stop word was next to punctuation")

# Remove extra blank spaces, including any left at either end
df["clean_text"] = df["clean_text"].str.replace(r'\s{2,}', ' ', regex=True).str.strip()
print("Extra blank spaces removed")

# Lemmatize
df["clean_text"] = df["clean_text"].apply(lemmatize)
print("Lemmatization completed")
print("Text preprocessing is done!")

# Check
df.head()

# Optional: Can drop all other columns, keeping only url and clean_text

Text extracted
Text lowered
Unicode characters removed
Urls removed
Newline, carriage return, and tab characters removed
Numbers removed
Reduced repeated characters
Stop-words removed
Punctuation removed
Stop-words removed again, in case stop word was next to punctuation
Blank spaces removed


Unnamed: 0,url,text,files,clean_text
0,https://www.cityofkingston.ca/,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[],home city kingston navigation skip content sk...
1,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[],poverty reduction city kingston navigation sk...
2,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[],ontario works city kingston navigation skip c...
3,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[],municipal fee assistance program city kingsto...
4,https://www.cityofkingston.ca/residents/commun...,b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,[],housing city kingston navigation skip content...


In [71]:
# Save to csv (index=False leaves the row index out of the file)
# NOTE(review): df still carries the raw "text" column at this point; drop it
# first if only url and clean_text are wanted in the output.
df.to_csv("kingston_clean.csv", index=False)