In [0]:
%pip install autocorrect

In [0]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import time
  
import preprocessing as pp

## Text cleaning

Cleaning is an important part of text preprocessing. Our cleaning function can remove HTML formatting and newline characters, simplify units, remove numbers, and make everything lowercase if desired.

We first have to define which regex pattern we are looking for and what to replace it with. For example, `('\\n', ' ')` will replace each newline character with a single space.

In [0]:
# Ordered (regex pattern, replacement) pairs consumed by pp.clean_text.
# Order matters: the unit rules must run before the final "drop any token
# containing a digit" rule, otherwise the unit would be removed together
# with the number it is attached to.
# Patterns are raw strings so regex escapes such as \d, \S and \[ reach the
# regex engine untouched and do not trigger Python's invalid-escape warnings.
to_replace_paula = [
    (r'\n', ' '),  # remove newline characters which ruin everything
    (r'\r', ' '),  # remove carriage returns which also wreak havoc
    (r'\t', ' '),  # remove tab
    (r'<.*?>', ' '),  # remove everything between <>, HTML formatting
    (r'\[.*?\]', ' '),  # remove everything between []
    (r'\/\*.*?;\}', ' '),  # remove extra stubborn font formatting
    (r'en-gb', ' '),  # language formatting
    (r'x-none', ' '),  # other formatting
    (r'\xa0', ' '),  # remove Latin1 non-breaking space
    (r'&.*?;', ' '),  # remove everything between & and ;
    (r'tt\.', ' '),  # remove instances of tt., HTML related formatting?
    (r'tffs\.', ' '),  # remove instances of tffs.
    # separating units from numbers (the digit directly before the unit and
    # the non-word character directly after it are consumed by the match):
    (r'\d{1}mm\W{1}', ' mm '),
    (r'\d{1}cm\W{1}', ' cm '),
    (r'ω', 'w'),  # normalise omega to 'w' before the kw rule below
    (r'\d{1}kw\W{1}', ' kw '),
    # remove anything that has a number in it:
    (r'\S*\d\S*', ' '),
]

In [0]:
# Pass the 'description' column and the replacement list to pp.clean_text
# and write the result back over the same column.
# NOTE(review): `df` is not created in any cell visible in this notebook;
# it has to be loaded before this cell for a fresh-kernel run to work.
df['description'] = pp.clean_text(df['description'], to_replace_paula)
# Spot-check: show the description at position 5 after cleaning.
df['description'].values[5]

## Stopword removal

Stopwords are common words that do not add any information to the model about specific categories, and removing them can improve performance. It's important to customise this list to your specific problem.

In [0]:
# Custom stopword list passed to pp.remove_stopwords.
# Every entry is separated by an explicit comma: adjacent string literals
# without a comma are silently concatenated by Python (e.g. 'g' 'h' -> 'gh'),
# which would drop both intended words and add a meaningless one.
# NOTE(review): 'y' and 'march' are absent from the original list; left out
# here as well because it is unclear whether that is deliberate - confirm.
stopwords = [
    # single letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z',
    # common function words
    'about', 'also', 'although', 'at', 'all', 'as', 'an', 'any', 'anyway',
    'are', 'and', 'by', 'be', 'but', 'because', 'been', 'being', 'can',
    'cannot', 'could', 'do', 'done', 'else', 'for', 'from', 'get', 'go',
    'had', 'has', 'have', 'here', 'how', 'if', 'in', 'is', 'it', 'no',
    'nor', 'not', 'now', 'of', 'ok', 'on', 'or', 'out', 'so', 'than',
    'that', 'the', 'then', 'there', 'these', 'this', 'to', 'was', 'we',
    'were', 'why', 'will', 'with', 'would',
    # domain-specific tokens and time markers
    'th', 'tfl', 'tms', 'gov', 'uk', 'tel', 'pm', 'am', 'gmt', 'bst', 'hrs',
    # months
    'jan', 'january', 'feb', 'february', 'mar', 'apr', 'april', 'may',
    'jun', 'june', 'jul', 'july', 'aug', 'august', 'sep', 'september',
    'oct', 'october', 'nov', 'november', 'dec', 'december',
    # weekdays
    'mon', 'monday', 'tue', 'tuesday', 'wed', 'wednesday', 'thu',
    'thursday', 'fri', 'friday', 'sat', 'saturday', 'sun', 'sunday',
]

In [0]:
# Pass the 'description' column and the custom stopword list to
# pp.remove_stopwords and write the result back over the same column.
df['description'] = pp.remove_stopwords(df['description'], stopwords)
# Spot-check: show the description at position 5 after stopword removal.
df['description'].values[5]

## Spellcheck

Spellchecking can help decrease the size of your vocabulary by grouping together words that should have been the same, but were accidentally misspelled.

In [0]:
# Run pp.spellcheck on the 'description' column and overwrite that column
# with the result. Unlike the earlier steps, this helper takes the whole
# DataFrame plus the column name rather than the Series itself.
# NOTE(review): the meaning of the third argument (5) is not visible from
# this notebook - confirm against preprocessing.spellcheck and consider
# passing it by keyword.
df['description'] = pp.spellcheck(df, 'description', 5)
# Spot-check: show the description at position 5 after spellchecking.
df['description'].values[5]

## Unpack acronyms

This function helps unpack acronyms into their full meaning. If you would rather keep the acronyms packed, simply skip this step.

In [0]:
# Acronyms
# Ordered (regex pattern, expansion) pairs passed to pp.unpack_acronyms.
# Each pattern only matches the acronym as a whole token: it must be preceded
# by start-of-string or whitespace and followed by whitespace or end-of-string.
# The surrounding whitespace is consumed by the match, which is why every
# expansion is padded with a space on both sides.
# Multi-word patterns are listed before any shorter pattern they contain
# ("t op" before "op"); assuming the pairs are applied in list order, the
# shorter rule would otherwise rewrite the text first and the longer one
# could never match.
acronyms = [
    (r"(\A|\s)l(|\s)c(\s|$)", " lead car "),  # Often seems to be a space in here...
    (r"(\A|\s)tt(\s|$)", " train technician "),
    (r"(\A|\s)dva(\s|$)", " digital voice announcer "),
    (r"(\A|\s)lopl(\s|$)", " loss of pilot light "),
    (r"(\A|\s)cfs(\s|$)", " cockfosters "),
    (r"(\A|\s)nfs(\s|$)", " northfields "),
    (r"(\A|\s)nfd(\s|$)", " northfields depot "),
    (r"(\A|\s)pcm(\s|$)", " pneumatic camshaft mechanism "),
    (r"(\A|\s)vlcv(\s|$)", " variable load control valve "),
    (r"(\A|\s)pb(\s|$)", " parking brake "),
    (r"(\A|\s)fst(\s|$)", " field shunt tips "),
    (r"(\A|\s)act(\s|$)", " acton "),
    (r"(\A|\s)ddm(\s|$)", " duty depot manager "),
    (r"(\A|\s)cdu(\s|$)", " cab display unit "),
    (r"(\A|\s)spad(\s|$)", " signal passed at danger "),
    (r"(\A|\s)ndf(\s|$)", " no defect found "),
    (r"(\A|\s)e(|\s)b(\s|$)", " eastbound "),
    (r"(\A|\s)w(|\s)b(\s|$)", " westbound "),
    (r"(\A|\s)t op(\s|$)", " driver "),  # must run before the bare "op" rule
    (r"(\A|\s)op(\s|$)", " driver "),
    (r"(\A|\s)operator(\s|$)", " driver "),
    (r"(\A|\s)con(\s|$)", " control "),
    (r"(\A|\s)cp(\s|$)", " call point "),
    (r"(\A|\s)unt(\s|$)", " unit "),
    (r"(\A|\s)oos(\s|$)", " out of service "),
]

In [0]:
# Pass the 'description' column and the acronym (pattern, expansion) list to
# pp.unpack_acronyms and write the result back over the same column.
df['description'] = pp.unpack_acronyms(df['description'], acronyms)

In [0]:
# Spot-check: show the description at position 5 in its current state.
df['description'].values[5]

## Stemming

Stemming removes the ends of words to try to keep only the root "stem" of them - e.g. "trains" becomes "train", or "walking" and "walked" becomes "walk." There are two options for stemmers: `Porter` and `Snowball`, the latter of which is generally accepted to be more robust.

In [0]:
# Stem the 'description' column with pp.do_stemming, selecting the
# 'Snowball' stemmer, and write the result back over the same column.
df['description'] = pp.do_stemming(df['description'], stemmer='Snowball')

In [0]:
# Spot-check: show the description at position 5 in its current state.
df['description'].values[5]

## Lemmatizing

Lemmatizing is similar to stemming with one extra step - it checks if the resulting word is in the English dictionary, and only stems if this is the case. This is a slower and less aggressive version of stemming.

In [0]:
# Run pp.do_lemmatizing on the 'description' column and overwrite that
# column with the result. This helper takes the whole DataFrame plus the
# column name rather than the Series itself.
# NOTE(review): at this point the column has already been overwritten by the
# stemming cell above, so this lemmatizes stemmed text - confirm that is
# intended rather than choosing one of the two steps.
df['description'] = pp.do_lemmatizing(df, 'description')

In [0]:
# Spot-check: show the description at position 5 in its current state.
df['description'].values[5]