# Predicting Loan Defaults w/ Natural Language Processing (NLP) #
## 4.0 Text Processing ##

Created on Tues Feb 12 13:46:42 2019

@author: Greenwood Group

## STEP 4: Text Processing ##

### 4.1 Import required libraries ###

In [1]:
# System libraries
import os
import sys
import time
import warnings

# Basic Python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NLP libraries
import re # regular expressions library
import string
import spacy
import nltk # natural language toolkit library
import unidecode
import enchant
import gensim
import random

from nltk.corpus import stopwords # stopwords library
from nltk.stem import WordNetLemmatizer # lemmatization library
from textblob import TextBlob, Word
from bs4 import BeautifulSoup
from googletrans import Translator

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#nltk.download('stopwords') # download the stop words library
#nltk.download('all') # to download the complete nltk library
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

# ### Turn off Depreciation and Future warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

if not sys.warnoptions:
    warnings.simplefilter('ignore')
    
% matplotlib inline

pd.set_option('precision', 3) # set the precision of numerical representation
pd.set_option('expand_frame_repr', True)

### 4.2 Prepare for reproducability ###

In [2]:
# Set random seed to maintain reproducability
random_state = np.random.seed(42)

# Set the plotting style
plt.style.use('seaborn-whitegrid')

sns.set(style="whitegrid", palette="PRGn", color_codes=True, font='sans-serif', font_scale=1.5)

### 4.3 Import data ###

In [3]:
# Import the dataset
# Set the path to data
filename = '../data/processed/2.0-gg-preprocessed-data.csv'

# Read the data into memory
%time df = pd.read_csv(filename, keep_default_na=False)

CPU times: user 71.2 ms, sys: 16.6 ms, total: 87.9 ms
Wall time: 86.8 ms


### 4.4 Count number of words per record ###

In [4]:
%time df['word_count'] = df['description'].apply(lambda x: len(str(x).split(" ")))

df[['description', 'word_count']].sort_values(by=['word_count'], ascending=False).head(5)

CPU times: user 81.9 ms, sys: 1.84 ms, total: 83.8 ms
Wall time: 82.6 ms


Unnamed: 0,description,word_count
5486,<p>Wilson es nativo del sector arrocero de Bab...,1182
5490,"<p>Leonidas es un campesino de Baba, un sector...",1181
5488,<p>Emilio ha vivido en Baba por 35 aos cultiva...,1149
5487,<p>Emilio ha vivido en Baba por 35 aos cultiva...,1149
5485,<p>Pedro nacio y ha vivido en el sector arroce...,1139


### 4.5 Count the number of characters per record ###

In [5]:
%time df['char_count'] = df['description'].str.len()

df[['description', 'char_count']].sort_values(by=['char_count'], ascending=False).head(5)

CPU times: user 4.14 ms, sys: 257 µs, total: 4.4 ms
Wall time: 4.24 ms


Unnamed: 0,description,char_count
5490,"<p>Leonidas es un campesino de Baba, un sector...",6694
5486,<p>Wilson es nativo del sector arrocero de Bab...,6611
5488,<p>Emilio ha vivido en Baba por 35 aos cultiva...,6469
5487,<p>Emilio ha vivido en Baba por 35 aos cultiva...,6469
5485,<p>Pedro nacio y ha vivido en el sector arroce...,6453


### 4.6 Average word count ###

In [6]:
def avg_word(sentence):
    words = sentence.split()
    avg_words = 0
    
    if len(words) != 0:
        avg_words = (sum(len(word) for word in words) / len(words))
    else:
        avg_words = 0
    return avg_words

%time df['avg_word'] = df['description'].apply(lambda x: avg_word(x))

df[['description', 'avg_word']].sort_values(by=['avg_word'], ascending=False).head(5)

CPU times: user 192 ms, sys: 1.87 ms, total: 194 ms
Wall time: 193 ms


Unnamed: 0,description,avg_word
3564,"<p class=""MsoNormal"">Carmen Hernandez is a mot...",9.598
3565,"<p class=""MsoNormal"">Carmen Hernandez is a mot...",9.598
3566,"<p class=""MsoNormal"">Carmen Hernandez is a mot...",9.598
3567,"<p class=""MsoNormal"">Carmen Hernandez is a mot...",9.598
3604,"<p class=""MsoNormal""><span>Cesar Pilco, who wa...",9.021


### 4.7 Number of stopwords ###

In [7]:
stop = stopwords.words('english', 'spanish')

%time df['stopwords'] = df['description'].apply(lambda x: len([x for x in x.split() if x in stop]))

df[['description', 'stopwords']].sort_values(by=['stopwords'], ascending=False).head(5)

CPU times: user 1.94 s, sys: 2.63 ms, total: 1.95 s
Wall time: 1.95 s


Unnamed: 0,description,stopwords
5261,<p> Clara has lived around Baba for her entir...,458
5262,<p> Clara has lived around Baba for her entir...,458
5486,<p>Wilson es nativo del sector arrocero de Bab...,453
5296,<p> Guido produces rice and corn on lands giv...,452
1137,<p>Aurelio has a small piece of land that his ...,451


### 4.8 Number of numerics ###

In [8]:
%time df['numerics'] = df['description'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

df[['description', 'numerics']].sort_values(by=['numerics'], ascending=False).head(5)

CPU times: user 156 ms, sys: 1.87 ms, total: 158 ms
Wall time: 157 ms


Unnamed: 0,description,numerics
4681,Maria has 4 years of experience in book sales....,18
4682,Maria has 4 years of experience in book sales....,18
4683,Maria has 4 years of experience in book sales....,18
59,Beatrice is 39 years old single and with four ...,15
118,TERESIAH WANJIRU GITHIGA is 54 years of age wi...,15


### 4.9 Determine average reduction in text comments ###

In [9]:
# Select a random line to evaluate (Use same line throughout all steps below)
# Good example = 3597
#line = random.randint(0, len(df))

line=3597
#line=5490 # contains spanish
df['description'][line]

'<p class="MsoNormal"><span>Diana Farias, who was born in </span><a href="http://en.wikipedia.org/wiki/Quevedo,_Ecuador" target="_blank" title="Quevado"><span>Quevado</span></a><span>, </span><span>Ecuador</span><span>, is a 22 year-old single woman who resides with her parents. She runs a small gift shop in the Municipal de Bastin Popular that is known for its selection of perfumes. She has been in business for 5 years. Every morning from </span><span>6:00 am</span><span> until </span><span>2:00 pm</span><span> Diana and her mother operate their shop. Ms. Farias wants this loan to buy more products and diversify her stock. This hard-working young woman not only runs this business; she is wrapping up her studies at the University in Business Administration. </span></p>'

In [10]:
# Define function calculating average length of sentence
def getAvgLen(sentences):
    length = []
    
    for sentence in sentences:
        length.append(len(sentence))
    avg_len = np.mean(length)
    
    return avg_len

In [11]:
# Get the average length of each description
avg_len_beg = getAvgLen(df['description'])

print('Average description length: {}'.format(round(avg_len_beg, 2)))

Average description length: 1023.65


### 4.10 Remove html statements ###

In [12]:
# Remove boilerplate translation comments & HTML tags
%time df['normalized_en'] = df['description'].apply(lambda x: re.sub('<i>.*?</i>', ' ', x, flags=re.DOTALL))

# Remove HTML with BeautifulSoup
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: BeautifulSoup(x, "lxml").get_text())

# Remove special characters
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub('\r\n', ' ', x, flags=re.DOTALL))

print("\n")
df['normalized_en'][line]

CPU times: user 16.5 ms, sys: 462 µs, total: 16.9 ms
Wall time: 16.6 ms
CPU times: user 1.81 s, sys: 25.6 ms, total: 1.84 s
Wall time: 1.84 s
CPU times: user 17.1 ms, sys: 1.12 ms, total: 18.2 ms
Wall time: 18.1 ms




'Diana Farias, who was born in Quevado, Ecuador, is a 22 year-old single woman who resides with her parents. She runs a small gift shop in the Municipal de Bastin Popular that is known for its selection of perfumes. She has been in business for 5 years. Every morning from 6:00 am until 2:00 pm Diana and her mother operate their shop. Ms. Farias wants this loan to buy more products and diversify her stock. This hard-working young woman not only runs this business; she is wrapping up her studies at the University in Business Administration. '

In [13]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 3.382700691506641%


### 4.11 Remove Non-English text ###

In [14]:
# Remove non-English text
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub('(.*)(Translated by)', ' ', x, flags=re.DOTALL))
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub('(Translation from).*', ' ', x, flags=re.DOTALL))

print("\n")
df['normalized_en'][line]

CPU times: user 1min 6s, sys: 180 ms, total: 1min 6s
Wall time: 1min 7s
CPU times: user 11.1 ms, sys: 176 µs, total: 11.3 ms
Wall time: 11.3 ms




'Diana Farias, who was born in Quevado, Ecuador, is a 22 year-old single woman who resides with her parents. She runs a small gift shop in the Municipal de Bastin Popular that is known for its selection of perfumes. She has been in business for 5 years. Every morning from 6:00 am until 2:00 pm Diana and her mother operate their shop. Ms. Farias wants this loan to buy more products and diversify her stock. This hard-working young woman not only runs this business; she is wrapping up her studies at the University in Business Administration. '

In [16]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 3.9807003314692264%


### 4.11 Translate into English ###

In [17]:
# Translate any foreign content to English
#translator = Translator()
#translations = translator.translate(['The quick brown fox', 'jumps over', 'the lazy dog'], dest='ko')
#for translation in translations:
#    print(translation.origin, ' -> ', translation.text)

In [18]:
#import copy
#from googletrans import Translator

#translatedList = []
#for index, row in df.iterrows():
#    # REINITIALIZE THE API
#    translator = Translator()
#    newrow = copy.deepcopy(row)
#    try:
#        # translate the 'text' column
#        translated = translator.translate(row['normalized_en'], dest='en')
#        newrow['translated'] = translated.text
#    except Exception as e:
#        print(str(e))
#        continue
#    translatedList.append(newrow)

### 4.11 Normalize language to American English ###

In [19]:
# got list from http://www.tysto.com/uk-us-spelling-list.html
normalized_dict = {
    "accessorise" : "accessorize",
    "accessorised" : "accessorized",
    "accessorises" : "accessorizes",
    "accessorising" : "accessorizing",
    "acclimatisation" : "acclimatization",
    "acclimatise" : "acclimatize",
    "acclimatised" : "acclimatized",
    "acclimatises" : "acclimatizes",
    "acclimatising" : "acclimatizing",
    "accoutrements" : "accouterments",
    "aeon" : "eon",
    "aeons" : "eons",
    "aerogramme" : "aerogram",
    "aerogrammes" : "aerograms",
    "aeroplane" : "airplane",
    "aeroplanes" : "airplanes",
    "aesthete" : "esthete",
    "aesthetes" : "esthetes",
    "aesthetic" : "esthetic",
    "aesthetically" : "esthetically",
    "aesthetics" : "esthetics",
    "aetiology" : "etiology",
    "ageing" : "aging",
    "aggrandisement" : "aggrandizement",
    "agonise" : "agonize",
    "agonised" : "agonized",
    "agonises" : "agonizes",
    "agonising" : "agonizing",
    "agonisingly" : "agonizingly",
    "almanack" : "almanac",
    "almanacks" : "almanacs",
    "aluminium" : "aluminum",
    "amortisable" : "amortizable",
    "amortisation" : "amortization",
    "amortisations" : "amortizations",
    "amortise" : "amortize",
    "amortised" : "amortized",
    "amortises" : "amortizes",
    "amortising" : "amortizing",
    "amphitheatre" : "amphitheater",
    "amphitheatres" : "amphitheaters",
    "anaemia" : "anemia",
    "anaemic" : "anemic",
    "anaesthesia" : "anesthesia",
    "anaesthetic" : "anesthetic",
    "anaesthetics" : "anesthetics",
    "anaesthetise" : "anesthetize",
    "anaesthetised" : "anesthetized",
    "anaesthetises" : "anesthetizes",
    "anaesthetising" : "anesthetizing",
    "anaesthetist" : "anesthetist",
    "anaesthetists" : "anesthetists",
    "anaesthetize" : "anesthetize",
    "anaesthetized" : "anesthetized",
    "anaesthetizes" : "anesthetizes",
    "anaesthetizing" : "anesthetizing",
    "analogue" : "analog",
    "analogues" : "analogs",
    "analyse" : "analyze",
    "analysed" : "analyzed",
    "analyses" : "analyzes",
    "analysing" : "analyzing",
    "anglicise" : "anglicize",
    "anglicised" : "anglicized",
    "anglicises" : "anglicizes",
    "anglicising" : "anglicizing",
    "annualised" : "annualized",
    "antagonise" : "antagonize",
    "antagonised" : "antagonized",
    "antagonises" : "antagonizes",
    "antagonising" : "antagonizing",
    "apologise" : "apologize",
    "apologised" : "apologized",
    "apologises" : "apologizes",
    "apologising" : "apologizing",
    "appal" : "appall",
    "appals" : "appalls",
    "appetiser" : "appetizer",
    "appetisers" : "appetizers",
    "appetising" : "appetizing",
    "appetisingly" : "appetizingly",
    "arbour" : "arbor",
    "arbours" : "arbors",
    "archaeological" : "archeological",
    "archaeologically" : "archeologically",
    "archaeologist" : "archeologist",
    "archaeologists" : "archeologists",
    "archaeology" : "archeology",
    "ardour" : "ardor",
    "armour" : "armor",
    "armoured" : "armored",
    "armourer" : "armorer",
    "armourers" : "armorers",
    "armouries" : "armories",
    "armoury" : "armory",
    "artefact" : "artifact",
    "artefacts" : "artifacts",
    "authorise" : "authorize",
    "authorised" : "authorized",
    "authorises" : "authorizes",
    "authorising" : "authorizing",
    "axe" : "ax",
    "backpedalled" : "backpedaled",
    "backpedalling" : "backpedaling",
    "bannister" : "banister",
    "bannisters" : "banisters",
    "baptise" : "baptize",
    "baptised" : "baptized",
    "baptises" : "baptizes",
    "baptising" : "baptizing",
    "bastardise" : "bastardize",
    "bastardised" : "bastardized",
    "bastardises" : "bastardizes",
    "bastardising" : "bastardizing",
    "battleaxe" : "battleax",
    "baulk" : "balk",
    "baulked" : "balked",
    "baulking" : "balking",
    "baulks" : "balks",
    "bedevilled" : "bedeviled",
    "bedevilling" : "bedeviling",
    "behaviour" : "behavior",
    "behavioural" : "behavioral",
    "behaviourism" : "behaviorism",
    "behaviourist" : "behaviorist",
    "behaviourists" : "behaviorists",
    "behaviours" : "behaviors",
    "behove" : "behoove",
    "behoved" : "behooved",
    "behoves" : "behooves",
    "bejewelled" : "bejeweled",
    "belabour" : "belabor",
    "belaboured" : "belabored",
    "belabouring" : "belaboring",
    "belabours" : "belabors",
    "bevelled" : "beveled",
    "bevvies" : "bevies",
    "bevvy" : "bevy",
    "biassed" : "biased",
    "biassing" : "biasing",
    "bingeing" : "binging",
    "bougainvillaea" : "bougainvillea",
    "bougainvillaeas" : "bougainvilleas",
    "bowdlerise" : "bowdlerize",
    "bowdlerised" : "bowdlerized",
    "bowdlerises" : "bowdlerizes",
    "bowdlerising" : "bowdlerizing",
    "breathalyse" : "breathalyze",
    "breathalysed" : "breathalyzed",
    "breathalyser" : "breathalyzer",
    "breathalysers" : "breathalyzers",
    "breathalyses" : "breathalyzes",
    "breathalysing" : "breathalyzing",
    "brutalise" : "brutalize",
    "brutalised" : "brutalized",
    "brutalises" : "brutalizes",
    "brutalising" : "brutalizing",
    "buses" : "busses",
    "busing" : "bussing",
    "caesarean" : "cesarean",
    "caesareans" : "cesareans",
    "calibre" : "caliber",
    "calibres" : "calibers",
    "calliper" : "caliper",
    "callipers" : "calipers",
    "callisthenics" : "calisthenics",
    "canalise" : "canalize",
    "canalised" : "canalized",
    "canalises" : "canalizes",
    "canalising" : "canalizing",
    "cancellation" : "cancelation",
    "cancellations" : "cancelations",
    "cancelled" : "canceled",
    "cancelling" : "canceling",
    "candour" : "candor",
    "cannibalise" : "cannibalize",
    "cannibalised" : "cannibalized",
    "cannibalises" : "cannibalizes",
    "cannibalising" : "cannibalizing",
    "canonise" : "canonize",
    "canonised" : "canonized",
    "canonises" : "canonizes",
    "canonising" : "canonizing",
    "capitalise" : "capitalize",
    "capitalised" : "capitalized",
    "capitalises" : "capitalizes",
    "capitalising" : "capitalizing",
    "caramelise" : "caramelize",
    "caramelised" : "caramelized",
    "caramelises" : "caramelizes",
    "caramelising" : "caramelizing",
    "carbonise" : "carbonize",
    "carbonised" : "carbonized",
    "carbonises" : "carbonizes",
    "carbonising" : "carbonizing",
    "carolled" : "caroled",
    "carolling" : "caroling",
    "catalogue " : "catalog ",
    "catalogue." : "catalog.",
    "catalogued" : "cataloged",
    "catalogues" : "catalogs",
    "cataloguing" : "cataloging",
    "catalyse" : "catalyze",
    "catalysed" : "catalyzed",
    "catalyses" : "catalyzes",
    "catalysing" : "catalyzing",
    "categorise" : "categorize",
    "categorised" : "categorized",
    "categorises" : "categorizes",
    "categorising" : "categorizing",
    "cauterise" : "cauterize",
    "cauterised" : "cauterized",
    "cauterises" : "cauterizes",
    "cauterising" : "cauterizing",
    "cavilled" : "caviled",
    "cavilling" : "caviling",
    "centigramme" : "centigram",
    "centigrammes" : "centigrams",
    "centilitre" : "centiliter",
    "centilitres" : "centiliters",
    "centimetre" : "centimeter",
    "centimetres" : "centimeters",
    "centralise" : "centralize",
    "centralised" : "centralized",
    "centralises" : "centralizes",
    "centralising" : "centralizing",
    "centre" : "center",
    "centred" : "centered",
    "centrefold" : "centerfold",
    "centrefolds" : "centerfolds",
    "centrepiece" : "centerpiece",
    "centrepieces" : "centerpieces",
    "centres" : "centers",
    "channelled" : "channeled",
    "channelling" : "channeling",
    "characterise" : "characterize",
    "characterised" : "characterized",
    "characterises" : "characterizes",
    "characterising" : "characterizing",
    "cheque" : "check",
    "chequebook" : "checkbook",
    "chequebooks" : "checkbooks",
    "chequered" : "checkered",
    "cheques" : "checks",
    "chilli" : "chili",
    "chimaera" : "chimera",
    "chimaeras" : "chimeras",
    "chiselled" : "chiseled",
    "chiselling" : "chiseling",
    "circularise" : "circularize",
    "circularised" : "circularized",
    "circularises" : "circularizes",
    "circularising" : "circularizing",
    "civilise" : "civilize",
    "civilised" : "civilized",
    "civilises" : "civilizes",
    "civilising" : "civilizing",
    "clamour" : "clamor",
    "clamoured" : "clamored",
    "clamouring" : "clamoring",
    "clamours" : "clamors",
    "clangour" : "clangor",
    "clarinettist" : "clarinetist",
    "clarinettists" : "clarinetists",
    "collectivise" : "collectivize",
    "collectivised" : "collectivized",
    "collectivises" : "collectivizes",
    "collectivising" : "collectivizing",
    "colonisation" : "colonization",
    "colonise" : "colonize",
    "colonised" : "colonized",
    "coloniser" : "colonizer",
    "colonisers" : "colonizers",
    "colonises" : "colonizes",
    "colonising" : "colonizing",
    "colour" : "color",
    "colourant" : "colorant",
    "colourants" : "colorants",
    "coloured" : "colored",
    "coloureds" : "coloreds",
    "colourful" : "colorful",
    "colourfully" : "colorfully",
    "colouring" : "coloring",
    "colourize" : "colorize",
    "colourized" : "colorized",
    "colourizes" : "colorizes",
    "colourizing" : "colorizing",
    "colourless" : "colorless",
    "colours" : "colors",
    "commercialise" : "commercialize",
    "commercialised" : "commercialized",
    "commercialises" : "commercializes",
    "commercialising" : "commercializing",
    "compartmentalise" : "compartmentalize",
    "compartmentalised" : "compartmentalized",
    "compartmentalises" : "compartmentalizes",
    "compartmentalising" : "compartmentalizing",
    "computerise" : "computerize",
    "computerised" : "computerized",
    "computerises" : "computerizes",
    "computerising" : "computerizing",
    "conceptualise" : "conceptualize",
    "conceptualised" : "conceptualized",
    "conceptualises" : "conceptualizes",
    "conceptualising" : "conceptualizing",
    "connexion" : "connection",
    "connexions" : "connections",
    "contextualise" : "contextualize",
    "contextualised" : "contextualized",
    "contextualises" : "contextualizes",
    "contextualising" : "contextualizing",
    "cosier" : "cozier",
    "cosies" : "cozies",
    "cosiest" : "coziest",
    "cosily" : "cozily",
    "cosiness" : "coziness",
    "cosy" : "cozy",
    "councillor" : "councilor",
    "councillors" : "councilors",
    "counselled" : "counseled",
    "counselling" : "counseling",
    "counsellor" : "counselor",
    "counsellors" : "counselors",
    "crenellated" : "crenelated",
    "criminalise" : "criminalize",
    "criminalised" : "criminalized",
    "criminalises" : "criminalizes",
    "criminalising" : "criminalizing",
    "criticise" : "criticize",
    "criticised" : "criticized",
    "criticises" : "criticizes",
    "criticising" : "criticizing",
    "crueller" : "crueler",
    "cruellest" : "cruelest",
    "crystallisation" : "crystallization",
    "crystallise" : "crystallize",
    "crystallised" : "crystallized",
    "crystallises" : "crystallizes",
    "crystallising" : "crystallizing",
    "cudgelled" : "cudgeled",
    "cudgelling" : "cudgeling",
    "customise" : "customize",
    "customised" : "customized",
    "customises" : "customizes",
    "customising" : "customizing",
    "cypher" : "cipher",
    "cyphers" : "ciphers",
    "decentralisation" : "decentralization",
    "decentralise" : "decentralize",
    "decentralised" : "decentralized",
    "decentralises" : "decentralizes",
    "decentralising" : "decentralizing",
    "decriminalisation" : "decriminalization",
    "decriminalise" : "decriminalize",
    "decriminalised" : "decriminalized",
    "decriminalises" : "decriminalizes",
    "decriminalising" : "decriminalizing",
    "defence" : "defense",
    "defenceless" : "defenseless",
    "defences" : "defenses",
    "dehumanisation" : "dehumanization",
    "dehumanise" : "dehumanize",
    "dehumanised" : "dehumanized",
    "dehumanises" : "dehumanizes",
    "dehumanising" : "dehumanizing",
    "demeanour" : "demeanor",
    "demilitarisation" : "demilitarization",
    "demilitarise" : "demilitarize",
    "demilitarised" : "demilitarized",
    "demilitarises" : "demilitarizes",
    "demilitarising" : "demilitarizing",
    "demobilisation" : "demobilization",
    "demobilise" : "demobilize",
    "demobilised" : "demobilized",
    "demobilises" : "demobilizes",
    "demobilising" : "demobilizing",
    "democratisation" : "democratization",
    "democratise" : "democratize",
    "democratised" : "democratized",
    "democratises" : "democratizes",
    "democratising" : "democratizing",
    "demonise" : "demonize",
    "demonised" : "demonized",
    "demonises" : "demonizes",
    "demonising" : "demonizing",
    "demoralisation" : "demoralization",
    "demoralise" : "demoralize",
    "demoralised" : "demoralized",
    "demoralises" : "demoralizes",
    "demoralising" : "demoralizing",
    "denationalisation" : "denationalization",
    "denationalise" : "denationalize",
    "denationalised" : "denationalized",
    "denationalises" : "denationalizes",
    "denationalising" : "denationalizing",
    "deodorise" : "deodorize",
    "deodorised" : "deodorized",
    "deodorises" : "deodorizes",
    "deodorising" : "deodorizing",
    "depersonalise" : "depersonalize",
    "depersonalised" : "depersonalized",
    "depersonalises" : "depersonalizes",
    "depersonalising" : "depersonalizing",
    "deputise" : "deputize",
    "deputised" : "deputized",
    "deputises" : "deputizes",
    "deputising" : "deputizing",
    "desensitisation" : "desensitization",
    "desensitise" : "desensitize",
    "desensitised" : "desensitized",
    "desensitises" : "desensitizes",
    "desensitising" : "desensitizing",
    "destabilisation" : "destabilization",
    "destabilise" : "destabilize",
    "destabilised" : "destabilized",
    "destabilises" : "destabilizes",
    "destabilising" : "destabilizing",
    "dialled" : "dialed",
    "dialling" : "dialing",
    "dialogue" : "dialog",
    "dialogues" : "dialogs",
    "diarrhoea" : "diarrhea",
    "digitise" : "digitize",
    "digitised" : "digitized",
    "digitises" : "digitizes",
    "digitising" : "digitizing",
    "disc " : "disk ",
    "disc." : "disk.",
    "disc : " : "disk : ",
    "disc?" : "disk?",
    "disc;" : "disk;",
    "disc-" : "disk-",
    "discolour" : "discolor",
    "discoloured" : "discolored",
    "discolouring" : "discoloring",
    "discolours" : "discolors",
    "discs" : "disks",
    "discy" : "disky",
    "disembowelled" : "disemboweled",
    "disembowelling" : "disemboweling",
    "disfavour" : "disfavor",
    "dishevelled" : "disheveled",
    "dishonour" : "dishonor",
    "dishonourable" : "dishonorable",
    "dishonourably" : "dishonorably",
    "dishonoured" : "dishonored",
    "dishonouring" : "dishonoring",
    "dishonours" : "dishonors",
    "disorganisation" : "disorganization",
    "disorganised" : "disorganized",
    "distil" : "distill",
    "distils" : "distills",
    "dramatisation" : "dramatization",
    "dramatisations" : "dramatizations",
    "dramatise" : "dramatize",
    "dramatised" : "dramatized",
    "dramatises" : "dramatizes",
    "dramatising" : "dramatizing",
    "draught" : "draft",
    "draughtboard" : "draftboard",
    "draughtboards" : "draftboards",
    "draughtier" : "draftier",
    "draughtiest" : "draftiest",
    "draughts" : "drafts",
    "draughtsman" : "draftsman",
    "draughtsmanship" : "draftsmanship",
    "draughtsmen" : "draftsmen",
    "draughtswoman" : "draftswoman",
    "draughtswomen" : "draftswomen",
    "draughty" : "drafty",
    "drivelled" : "driveled",
    "drivelling" : "driveling",
    "duelled" : "dueled",
    "duelling" : "dueling",
    "economise" : "economize",
    "economised" : "economized",
    "economises" : "economizes",
    "economising" : "economizing",
    "edoema" : "edema",
    "editorialise" : "editorialize",
    "editorialised" : "editorialized",
    "editorialises" : "editorializes",
    "editorialising" : "editorializing",
    "empathise" : "empathize",
    "empathised" : "empathized",
    "empathises" : "empathizes",
    "empathising" : "empathizing",
    "emphasise" : "emphasize",
    "emphasised" : "emphasized",
    "emphasises" : "emphasizes",
    "emphasising" : "emphasizing",
    "enamelled" : "enameled",
    "enamelling" : "enameling",
    "enamoured" : "enamored",
    "encyclopaedia" : "encyclopedia",
    "encyclopaedias" : "encyclopedias",
    "encyclopaedic" : "encyclopedic",
    "endeavour" : "endeavor",
    "endeavoured" : "endeavored",
    "endeavouring" : "endeavoring",
    "endeavours" : "endeavors",
    "energise" : "energize",
    "energised" : "energized",
    "energises" : "energizes",
    "energising" : "energizing",
    "enrol" : "enroll",
    "enrols" : "enrolls",
    "enthral" : "enthrall",
    "enthrals" : "enthralls",
    "epaulette" : "epaulet",
    "epaulettes" : "epaulets",
    "epicentre" : "epicenter",
    "epicentres" : "epicenters",
    "epilogue" : "epilog",
    "epilogues" : "epilogs",
    "epitomise" : "epitomize",
    "epitomised" : "epitomized",
    "epitomises" : "epitomizes",
    "epitomising" : "epitomizing",
    "equalisation" : "equalization",
    "equalise" : "equalize",
    "equalised" : "equalized",
    "equaliser" : "equalizer",
    "equalisers" : "equalizers",
    "equalises" : "equalizes",
    "equalising" : "equalizing",
    "eulogise" : "eulogize",
    "eulogised" : "eulogized",
    "eulogises" : "eulogizes",
    "eulogising" : "eulogizing",
    "evangelise" : "evangelize",
    "evangelised" : "evangelized",
    "evangelises" : "evangelizes",
    "evangelising" : "evangelizing",
    "exorcise" : "exorcize",
    "exorcised" : "exorcized",
    "exorcises" : "exorcizes",
    "exorcising" : "exorcizing",
    "extemporisation" : "extemporization",
    "extemporise" : "extemporize",
    "extemporised" : "extemporized",
    "extemporises" : "extemporizes",
    "extemporising" : "extemporizing",
    "externalisation" : "externalization",
    "externalisations" : "externalizations",
    "externalise" : "externalize",
    "externalised" : "externalized",
    "externalises" : "externalizes",
    "externalising" : "externalizing",
    "factorise" : "factorize",
    "factorised" : "factorized",
    "factorises" : "factorizes",
    "factorising" : "factorizing",
    "faecal" : "fecal",
    "faeces" : "feces",
    "familiarisation" : "familiarization",
    "familiarise" : "familiarize",
    "familiarised" : "familiarized",
    "familiarises" : "familiarizes",
    "familiarising" : "familiarizing",
    "fantasise" : "fantasize",
    "fantasised" : "fantasized",
    "fantasises" : "fantasizes",
    "fantasising" : "fantasizing",
    "favour" : "favor",
    "favourable" : "favorable",
    "favourably" : "favorably",
    "favoured" : "favored",
    "favouring" : "favoring",
    "favourite" : "favorite",
    "favourites" : "favorites",
    "favouritism" : "favoritism",
    "favours" : "favors",
    "feminise" : "feminize",
    "feminised" : "feminized",
    "feminises" : "feminizes",
    "feminising" : "feminizing",
    "fertilisation" : "fertilization",
    "fertilise" : "fertilize",
    "fertilised" : "fertilized",
    "fertiliser" : "fertilizer",
    "fertilisers" : "fertilizers",
    "fertilises" : "fertilizes",
    "fertilising" : "fertilizing",
    "fervour" : "fervor",
    "fibre" : "fiber",
    "fibreglass" : "fiberglass",
    "fibres" : "fibers",
    "fictionalisation" : "fictionalization",
    "fictionalisations" : "fictionalizations",
    "fictionalise" : "fictionalize",
    "fictionalised" : "fictionalized",
    "fictionalises" : "fictionalizes",
    "fictionalising" : "fictionalizing",
    "fillet" : "filet",
    "filleted" : "fileted",
    "filleting" : "fileting",
    "fillets" : "filets",
    "finalisation" : "finalization",
    "finalise" : "finalize",
    "finalised" : "finalized",
    "finalises" : "finalizes",
    "finalising" : "finalizing",
    "flautist" : "flutist",
    "flautists" : "flutists",
    "flavour" : "flavor",
    "flavoured" : "flavored",
    "flavouring" : "flavoring",
    "flavourings" : "flavorings",
    "flavourless" : "flavorless",
    "flavours" : "flavors",
    "flavoursome" : "flavorsome",
    "flyer/flier" : "flier/flyer",
    "foetal" : "fetal",
    "foetid" : "fetid",
    "foetus" : "fetus",
    "foetuses" : "fetuses",
    "formalisation" : "formalization",
    "formalise" : "formalize",
    "formalised" : "formalized",
    "formalises" : "formalizes",
    "formalising" : "formalizing",
    "fossilisation" : "fossilization",
    "fossilise" : "fossilize",
    "fossilised" : "fossilized",
    "fossilises" : "fossilizes",
    "fossilising" : "fossilizing",
    "fraternisation" : "fraternization",
    "fraternise" : "fraternize",
    "fraternised" : "fraternized",
    "fraternises" : "fraternizes",
    "fraternising" : "fraternizing",
    "fulfil" : "fulfill",
    "fulfilment" : "fulfillment",
    "fulfils" : "fulfills",
    "funnelled" : "funneled",
    "funnelling" : "funneling",
    "galvanise" : "galvanize",
    "galvanised" : "galvanized",
    "galvanises" : "galvanizes",
    "galvanising" : "galvanizing",
    "gambolled" : "gamboled",
    "gambolling" : "gamboling",
    "gaol" : "jail",
    "gaolbird" : "jailbird",
    "gaolbirds" : "jailbirds",
    "gaolbreak" : "jailbreak",
    "gaolbreaks" : "jailbreaks",
    "gaoled" : "jailed",
    "gaoler" : "jailer",
    "gaolers" : "jailers",
    "gaoling" : "jailing",
    "gaols" : "jails",
    "gases" : "gasses",
    "gauge" : "gage",
    "gauged" : "gaged",
    "gauges" : "gages",
    "gauging" : "gaging",
    "generalisation" : "generalization",
    "generalisations" : "generalizations",
    "generalise" : "generalize",
    "generalised" : "generalized",
    "generalises" : "generalizes",
    "generalising" : "generalizing",
    "ghettoise" : "ghettoize",
    "ghettoised" : "ghettoized",
    "ghettoises" : "ghettoizes",
    "ghettoising" : "ghettoizing",
    "gipsies" : "gypsies",
    "glamorise" : "glamorize",
    "glamorised" : "glamorized",
    "glamorises" : "glamorizes",
    "glamorising" : "glamorizing",
    "glamour" : "glamor",
    "globalisation" : "globalization",
    "globalise" : "globalize",
    "globalised" : "globalized",
    "globalises" : "globalizes",
    "globalising" : "globalizing",
    "glueing" : "gluing",
    "goitre" : "goiter",
    "goitres" : "goiters",
    "gonorrhoea" : "gonorrhea",
    "gramme" : "gram",
    "grammes" : "grams",
    "gravelled" : "graveled",
    "grey" : "gray",
    "greyed" : "grayed",
    "greying" : "graying",
    "greyish" : "grayish",
    "greyness" : "grayness",
    "greys" : "grays",
    "grovelled" : "groveled",
    "grovelling" : "groveling",
    "groyne" : "groin",
    "groynes" : "groins",
    "gruelling" : "grueling",
    "gruellingly" : "gruelingly",
    "gryphon" : "griffin",
    "gryphons" : "griffins",
    "gynaecological" : "gynecological",
    "gynaecologist" : "gynecologist",
    "gynaecologists" : "gynecologists",
    "gynaecology" : "gynecology",
    "haematological" : "hematological",
    "haematologist" : "hematologist",
    "haematologists" : "hematologists",
    "haematology" : "hematology",
    "haemoglobin" : "hemoglobin",
    "haemophilia" : "hemophilia",
    "haemophiliac" : "hemophiliac",
    "haemophiliacs" : "hemophiliacs",
    "haemorrhage" : "hemorrhage",
    "haemorrhaged" : "hemorrhaged",
    "haemorrhages" : "hemorrhages",
    "haemorrhaging" : "hemorrhaging",
    "haemorrhoids" : "hemorrhoids",
    "harbour" : "harbor",
    "harboured" : "harbored",
    "harbouring" : "harboring",
    "harbours" : "harbors",
    "harmonisation" : "harmonization",
    "harmonise" : "harmonize",
    "harmonised" : "harmonized",
    "harmonises" : "harmonizes",
    "harmonising" : "harmonizing",
    "homoeopath" : "homeopath",
    "homoeopathic" : "homeopathic",
    "homoeopaths" : "homeopaths",
    "homoeopathy" : "homeopathy",
    "homogenise" : "homogenize",
    "homogenised" : "homogenized",
    "homogenises" : "homogenizes",
    "homogenising" : "homogenizing",
    "honour" : "honor",
    "honourable" : "honorable",
    "honourably" : "honorably",
    "honoured" : "honored",
    "honouring" : "honoring",
    "honours" : "honors",
    "hospitalisation" : "hospitalization",
    "hospitalise" : "hospitalize",
    "hospitalised" : "hospitalized",
    "hospitalises" : "hospitalizes",
    "hospitalising" : "hospitalizing",
    "humanise" : "humanize",
    "humanised" : "humanized",
    "humanises" : "humanizes",
    "humanising" : "humanizing",
    "humour" : "humor",
    "humoured" : "humored",
    "humouring" : "humoring",
    "humourless" : "humorless",
    "humours" : "humors",
    "hybridise" : "hybridize",
    "hybridised" : "hybridized",
    "hybridises" : "hybridizes",
    "hybridising" : "hybridizing",
    "hypnotise" : "hypnotize",
    "hypnotised" : "hypnotized",
    "hypnotises" : "hypnotizes",
    "hypnotising" : "hypnotizing",
    "hypothesise" : "hypothesize",
    "hypothesised" : "hypothesized",
    "hypothesises" : "hypothesizes",
    "hypothesising" : "hypothesizing",
    "idealisation" : "idealization",
    "idealise" : "idealize",
    "idealised" : "idealized",
    "idealises" : "idealizes",
    "idealising" : "idealizing",
    "idolise" : "idolize",
    "idolised" : "idolized",
    "idolises" : "idolizes",
    "idolising" : "idolizing",
    "immobilisation" : "immobilization",
    "immobilise" : "immobilize",
    "immobilised" : "immobilized",
    "immobiliser" : "immobilizer",
    "immobilisers" : "immobilizers",
    "immobilises" : "immobilizes",
    "immobilising" : "immobilizing",
    "immortalise" : "immortalize",
    "immortalised" : "immortalized",
    "immortalises" : "immortalizes",
    "immortalising" : "immortalizing",
    "immunisation" : "immunization",
    "immunise" : "immunize",
    "immunised" : "immunized",
    "immunises" : "immunizes",
    "immunising" : "immunizing",
    "impanelled" : "impaneled",
    "impanelling" : "impaneling",
    "imperilled" : "imperiled",
    "imperilling" : "imperiling",
    "individualise" : "individualize",
    "individualised" : "individualized",
    "individualises" : "individualizes",
    "individualising" : "individualizing",
    "industrialise" : "industrialize",
    "industrialised" : "industrialized",
    "industrialises" : "industrializes",
    "industrialising" : "industrializing",
    "inflexion" : "inflection",
    "inflexions" : "inflections",
    "initialise" : "initialize",
    "initialised" : "initialized",
    "initialises" : "initializes",
    "initialising" : "initializing",
    "initialled" : "initialed",
    "initialling" : "initialing",
    "instal" : "install",
    "instalment" : "installment",
    "instalments" : "installments",
    "instals" : "installs",
    "instil" : "instill",
    "instils" : "instills",
    "institutionalisation" : "institutionalization",
    "institutionalise" : "institutionalize",
    "institutionalised" : "institutionalized",
    "institutionalises" : "institutionalizes",
    "institutionalising" : "institutionalizing",
    "intellectualise" : "intellectualize",
    "intellectualised" : "intellectualized",
    "intellectualises" : "intellectualizes",
    "intellectualising" : "intellectualizing",
    "internalisation" : "internalization",
    "internalise" : "internalize",
    "internalised" : "internalized",
    "internalises" : "internalizes",
    "internalising" : "internalizing",
    "internationalisation" : "internationalization",
    "internationalise" : "internationalize",
    "internationalised" : "internationalized",
    "internationalises" : "internationalizes",
    "internationalising" : "internationalizing",
    "ionisation" : "ionization",
    "ionise" : "ionize",
    "ionised" : "ionized",
    "ioniser" : "ionizer",
    "ionisers" : "ionizers",
    "ionises" : "ionizes",
    "ionising" : "ionizing",
    "italicise" : "italicize",
    "italicised" : "italicized",
    "italicises" : "italicizes",
    "italicising" : "italicizing",
    "itemise" : "itemize",
    "itemised" : "itemized",
    "itemises" : "itemizes",
    "itemising" : "itemizing",
    "jeopardise" : "jeopardize",
    "jeopardised" : "jeopardized",
    "jeopardises" : "jeopardizes",
    "jeopardising" : "jeopardizing",
    "jewelled" : "jeweled",
    "jeweller" : "jeweler",
    "jewellers" : "jewelers",
    "jewellery" : "jewelry",
    "judgement" : "judgment",
    "kilogramme" : "kilogram",
    "kilogrammes" : "kilograms",
    "kilometre" : "kilometer",
    "kilometres" : "kilometers",
    "labelled" : "labeled",
    "labelling" : "labeling",
    "labour" : "labor",
    "laboured" : "labored",
    "labourer" : "laborer",
    "labourers" : "laborers",
    "labouring" : "laboring",
    "labours" : "labors",
    "lacklustre" : "lackluster",
    "legalisation" : "legalization",
    "legalise" : "legalize",
    "legalised" : "legalized",
    "legalises" : "legalizes",
    "legalising" : "legalizing",
    "legitimise" : "legitimize",
    "legitimised" : "legitimized",
    "legitimises" : "legitimizes",
    "legitimising" : "legitimizing",
    "leukaemia" : "leukemia",
    "levelled" : "leveled",
    "leveller" : "leveler",
    "levellers" : "levelers",
    "levelling" : "leveling",
    "libelled" : "libeled",
    "libelling" : "libeling",
    "libellous" : "libelous",
    "liberalisation" : "liberalization",
    "liberalise" : "liberalize",
    "liberalised" : "liberalized",
    "liberalises" : "liberalizes",
    "liberalising" : "liberalizing",
    "licence" : "license",
    "licenced" : "licensed",
    "licences" : "licenses",
    "licencing" : "licensing",
    "likeable" : "likable",
    "lionisation" : "lionization",
    "lionise" : "lionize",
    "lionised" : "lionized",
    "lionises" : "lionizes",
    "lionising" : "lionizing",
    "liquidise" : "liquidize",
    "liquidised" : "liquidized",
    "liquidiser" : "liquidizer",
    "liquidisers" : "liquidizers",
    "liquidises" : "liquidizes",
    "liquidising" : "liquidizing",
    " litre " : " liter ",
    " litres " : " liters ",
    "localise" : "localize",
    "localised" : "localized",
    "localises" : "localizes",
    "localising" : "localizing",
    "louvre" : "louver",
    "louvred" : "louvered",
    "louvres" : "louvers",
    "lustre" : "luster",
    "magnetise" : "magnetize",
    "magnetised" : "magnetized",
    "magnetises" : "magnetizes",
    "magnetising" : "magnetizing",
    "manoeuvrability" : "maneuverability",
    "manoeuvrable" : "maneuverable",
    "manoeuvre" : "maneuver",
    "manoeuvred" : "maneuvered",
    "manoeuvres" : "maneuvers",
    "manoeuvring" : "maneuvering",
    "manoeuvrings" : "maneuverings",
    "marginalisation" : "marginalization",
    "marginalise" : "marginalize",
    "marginalised" : "marginalized",
    "marginalises" : "marginalizes",
    "marginalising" : "marginalizing",
    "marshalled" : "marshaled",
    "marshalling" : "marshaling",
    "marvelled" : "marveled",
    "marvelling" : "marveling",
    "marvellous" : "marvelous",
    "marvellously" : "marvelously",
    "materialisation" : "materialization",
    "materialise" : "materialize",
    "materialised" : "materialized",
    "materialises" : "materializes",
    "materialising" : "materializing",
    "maximisation" : "maximization",
    "maximise" : "maximize",
    "maximised" : "maximized",
    "maximises" : "maximizes",
    "maximising" : "maximizing",
    "meagre" : "meager",
    "mechanisation" : "mechanization",
    "mechanise" : "mechanize",
    "mechanised" : "mechanized",
    "mechanises" : "mechanizes",
    "mechanising" : "mechanizing",
    "mediaeval" : "medieval",
    "memorialise" : "memorialize",
    "memorialised" : "memorialized",
    "memorialises" : "memorializes",
    "memorialising" : "memorializing",
    "memorise" : "memorize",
    "memorised" : "memorized",
    "memorises" : "memorizes",
    "memorising" : "memorizing",
    "mesmerise" : "mesmerize",
    "mesmerised" : "mesmerized",
    "mesmerises" : "mesmerizes",
    "mesmerising" : "mesmerizing",
    "metabolise" : "metabolize",
    "metabolised" : "metabolized",
    "metabolises" : "metabolizes",
    "metabolising" : "metabolizing",
    " metre" : " meter",
    " metres" : " meters",
    "micrometre" : "micrometer",
    "micrometres" : "micrometers",
    "militarise" : "militarize",
    "militarised" : "militarized",
    "militarises" : "militarizes",
    "militarising" : "militarizing",
    "milligramme" : "milligram",
    "milligrammes" : "milligrams",
    "millilitre" : "milliliter",
    "millilitres" : "milliliters",
    "millimetre" : "millimeter",
    "millimetres" : "millimeters",
    "miniaturisation" : "miniaturization",
    "miniaturise" : "miniaturize",
    "miniaturised" : "miniaturized",
    "miniaturises" : "miniaturizes",
    "miniaturising" : "miniaturizing",
    "minibuses" : "minibusses",
    "minimise" : "minimize",
    "minimised" : "minimized",
    "minimises" : "minimizes",
    "minimising" : "minimizing",
    "misbehaviour" : "misbehavior",
    "misdemeanour" : "misdemeanor",
    "misdemeanours" : "misdemeanors",
    "misspelt" : "misspelled",
    "mitre" : "miter",
    "mitres" : "miters",
    "mobilisation" : "mobilization",
    "mobilise" : "mobilize",
    "mobilised" : "mobilized",
    "mobilises" : "mobilizes",
    "mobilising" : "mobilizing",
    "modelled" : "modeled",
    "modeller" : "modeler",
    "modellers" : "modelers",
    "modelling" : "modeling",
    "modernise" : "modernize",
    "modernised" : "modernized",
    "modernises" : "modernizes",
    "modernising" : "modernizing",
    "moisturise" : "moisturize",
    "moisturised" : "moisturized",
    "moisturiser" : "moisturizer",
    "moisturisers" : "moisturizers",
    "moisturises" : "moisturizes",
    "moisturising" : "moisturizing",
    "monologue" : "monolog",
    "monologues" : "monologs",
    "monopolisation" : "monopolization",
    "monopolise" : "monopolize",
    "monopolised" : "monopolized",
    "monopolises" : "monopolizes",
    "monopolising" : "monopolizing",
    "moralise" : "moralize",
    "moralised" : "moralized",
    "moralises" : "moralizes",
    "moralising" : "moralizing",
    "motorised" : "motorized",
    "mould" : "mold",
    "moulded" : "molded",
    "moulder" : "molder",
    "mouldered" : "moldered",
    "mouldering" : "moldering",
    "moulders" : "molders",
    "mouldier" : "moldier",
    "mouldiest" : "moldiest",
    "moulding" : "molding",
    "mouldings" : "moldings",
    "moulds" : "molds",
    "mouldy" : "moldy",
    "moult" : "molt",
    "moulted" : "molted",
    "moulting" : "molting",
    "moults" : "molts",
    "moustache" : "mustache",
    "moustached" : "mustached",
    "moustaches" : "mustaches",
    "moustachioed" : "mustachioed",
    "multicoloured" : "multicolored",
    "nationalisation" : "nationalization",
    "nationalisations" : "nationalizations",
    "nationalise" : "nationalize",
    "nationalised" : "nationalized",
    "nationalises" : "nationalizes",
    "nationalising" : "nationalizing",
    "naturalisation" : "naturalization",
    "naturalise" : "naturalize",
    "naturalised" : "naturalized",
    "naturalises" : "naturalizes",
    "naturalising" : "naturalizing",
    "neighbour" : "neighbor",
    "neighbourhood" : "neighborhood",
    "neighbourhoods" : "neighborhoods",
    "neighbouring" : "neighboring",
    "neighbourliness" : "neighborliness",
    "neighbourly" : "neighborly",
    "neighbours" : "neighbors",
    "neutralisation" : "neutralization",
    "neutralise" : "neutralize",
    "neutralised" : "neutralized",
    "neutralises" : "neutralizes",
    "neutralising" : "neutralizing",
    "normalisation" : "normalization",
    "normalise" : "normalize",
    "normalised" : "normalized",
    "normalises" : "normalizes",
    "normalising" : "normalizing",
    "odour" : "odor",
    "odourless" : "odorless",
    "odours" : "odors",
    "oesophagus" : "esophagus",
    "oesophaguses" : "esophaguses",
    "oestrogen" : "estrogen",
    "offence" : "offense",
    "offences" : "offenses",
    "omelette" : "omelet",
    "omelettes" : "omelets",
    "optimise" : "optimize",
    "optimised" : "optimized",
    "optimises" : "optimizes",
    "optimising" : "optimizing",
    "organisation" : "organization",
    "organisational" : "organizational",
    "organisations" : "organizations",
    "organise" : "organize",
    "organised" : "organized",
    "organiser" : "organizer",
    "organisers" : "organizers",
    "organises" : "organizes",
    "organising" : "organizing",
    "orthopaedic" : "orthopedic",
    "orthopaedics" : "orthopedics",
    "ostracise" : "ostracize",
    "ostracised" : "ostracized",
    "ostracises" : "ostracizes",
    "ostracising" : "ostracizing",
    "outmanoeuvre" : "outmaneuver",
    "outmanoeuvred" : "outmaneuvered",
    "outmanoeuvres" : "outmaneuvers",
    "outmanoeuvring" : "outmaneuvering",
    "overemphasise" : "overemphasize",
    "overemphasised" : "overemphasized",
    "overemphasises" : "overemphasizes",
    "overemphasising" : "overemphasizing",
    "oxidisation" : "oxidization",
    "oxidise" : "oxidize",
    "oxidised" : "oxidized",
    "oxidises" : "oxidizes",
    "oxidising" : "oxidizing",
    "paederast" : "pederast",
    "paederasts" : "pederasts",
    "paediatric" : "pediatric",
    "paediatrician" : "pediatrician",
    "paediatricians" : "pediatricians",
    "paediatrics" : "pediatrics",
    "paedophile" : "pedophile",
    "paedophiles" : "pedophiles",
    "paedophilia" : "pedophilia",
    "palaeolithic" : "paleolithic",
    "palaeontologist" : "paleontologist",
    "palaeontologists" : "paleontologists",
    "palaeontology" : "paleontology",
    "panelled" : "paneled",
    "panelling" : "paneling",
    "panellist" : "panelist",
    "panellists" : "panelists",
    "paralyse" : "paralyze",
    "paralysed" : "paralyzed",
    "paralyses" : "paralyzes",
    "paralysing" : "paralyzing",
    "parcelled" : "parceled",
    "parcelling" : "parceling",
    "parlour" : "parlor",
    "parlours" : "parlors",
    "particularise" : "particularize",
    "particularised" : "particularized",
    "particularises" : "particularizes",
    "particularising" : "particularizing",
    "passivisation" : "passivization",
    "passivise" : "passivize",
    "passivised" : "passivized",
    "passivises" : "passivizes",
    "passivising" : "passivizing",
    "pasteurisation" : "pasteurization",
    "pasteurise" : "pasteurize",
    "pasteurised" : "pasteurized",
    "pasteurises" : "pasteurizes",
    "pasteurising" : "pasteurizing",
    "patronise" : "patronize",
    "patronised" : "patronized",
    "patronises" : "patronizes",
    "patronising" : "patronizing",
    "patronisingly" : "patronizingly",
    "pedalled" : "pedaled",
    "pedalling" : "pedaling",
    "pedestrianisation" : "pedestrianization",
    "pedestrianise" : "pedestrianize",
    "pedestrianised" : "pedestrianized",
    "pedestrianises" : "pedestrianizes",
    "pedestrianising" : "pedestrianizing",
    "penalise" : "penalize",
    "penalised" : "penalized",
    "penalises" : "penalizes",
    "penalising" : "penalizing",
    "pencilled" : "penciled",
    "pencilling" : "penciling",
    "personalise" : "personalize",
    "personalised" : "personalized",
    "personalises" : "personalizes",
    "personalising" : "personalizing",
    "pharmacopoeia" : "pharmacopeia",
    "pharmacopoeias" : "pharmacopeias",
    "philosophise" : "philosophize",
    "philosophised" : "philosophized",
    "philosophises" : "philosophizes",
    "philosophising" : "philosophizing",
    "philtre" : "filter",
    "philtres" : "filters",
    "phoney" : "phony",
    "plagiarise" : "plagiarize",
    "plagiarised" : "plagiarized",
    "plagiarises" : "plagiarizes",
    "plagiarising" : "plagiarizing",
    "plough" : "plow",
    "ploughed" : "plowed",
    "ploughing" : "plowing",
    "ploughman" : "plowman",
    "ploughmen" : "plowmen",
    "ploughs" : "plows",
    "ploughshare" : "plowshare",
    "ploughshares" : "plowshares",
    "polarisation" : "polarization",
    "polarise" : "polarize",
    "polarised" : "polarized",
    "polarises" : "polarizes",
    "polarising" : "polarizing",
    "politicisation" : "politicization",
    "politicise" : "politicize",
    "politicised" : "politicized",
    "politicises" : "politicizes",
    "politicising" : "politicizing",
    "popularisation" : "popularization",
    "popularise" : "popularize",
    "popularised" : "popularized",
    "popularises" : "popularizes",
    "popularising" : "popularizing",
    "pouffe" : "pouf",
    "pouffes" : "poufs",
    "practise" : "practice",
    "practised" : "practiced",
    "practises" : "practices",
    "practising" : "practicing",
    "praesidium" : "presidium",
    "praesidiums" : "presidiums",
    "pressurisation" : "pressurization",
    "pressurise" : "pressurize",
    "pressurised" : "pressurized",
    "pressurises" : "pressurizes",
    "pressurising" : "pressurizing",
    "pretence" : "pretense",
    "pretences" : "pretenses",
    "primaeval" : "primeval",
    "prioritisation" : "prioritization",
    "prioritise" : "prioritize",
    "prioritised" : "prioritized",
    "prioritises" : "prioritizes",
    "prioritising" : "prioritizing",
    "privatisation" : "privatization",
    "privatisations" : "privatizations",
    "privatise" : "privatize",
    "privatised" : "privatized",
    "privatises" : "privatizes",
    "privatising" : "privatizing",
    "professionalisation" : "professionalization",
    "professionalise" : "professionalize",
    "professionalised" : "professionalized",
    "professionalises" : "professionalizes",
    "professionalising" : "professionalizing",
    "programme" : "program",
    "programmes" : "programs",
    "prologue" : "prolog",
    "prologues" : "prologs",
    "propagandise" : "propagandize",
    "propagandised" : "propagandized",
    "propagandises" : "propagandizes",
    "propagandising" : "propagandizing",
    "proselytise" : "proselytize",
    "proselytised" : "proselytized",
    "proselytiser" : "proselytizer",
    "proselytisers" : "proselytizers",
    "proselytises" : "proselytizes",
    "proselytising" : "proselytizing",
    "psychoanalyse" : "psychoanalyze",
    "psychoanalysed" : "psychoanalyzed",
    "psychoanalyses" : "psychoanalyzes",
    "psychoanalysing" : "psychoanalyzing",
    "publicise" : "publicize",
    "publicised" : "publicized",
    "publicises" : "publicizes",
    "publicising" : "publicizing",
    "pulverisation" : "pulverization",
    "pulverise" : "pulverize",
    "pulverised" : "pulverized",
    "pulverises" : "pulverizes",
    "pulverising" : "pulverizing",
    "pummelled" : "pummel",
    "pummelling" : "pummeled",
    "pyjama" : "pajama",
    "pyjamas" : "pajamas",
    "pzazz" : "pizzazz",
    "quarrelled" : "quarreled",
    "quarrelling" : "quarreling",
    "radicalise" : "radicalize",
    "radicalised" : "radicalized",
    "radicalises" : "radicalizes",
    "radicalising" : "radicalizing",
    "rancour" : "rancor",
    "randomise" : "randomize",
    "randomised" : "randomized",
    "randomises" : "randomizes",
    "randomising" : "randomizing",
    "rationalisation" : "rationalization",
    "rationalisations" : "rationalizations",
    "rationalise" : "rationalize",
    "rationalised" : "rationalized",
    "rationalises" : "rationalizes",
    "rationalising" : "rationalizing",
    "ravelled" : "raveled",
    "ravelling" : "raveling",
    "realisable" : "realizable",
    "realisation" : "realization",
    "realisations" : "realizations",
    "realise" : "realize",
    "realised" : "realized",
    "realises" : "realizes",
    "realising" : "realizing",
    "recognisable" : "recognizable",
    "recognisably" : "recognizably",
    "recognisance" : "recognizance",
    "recognise" : "recognize",
    "recognised" : "recognized",
    "recognises" : "recognizes",
    "recognising" : "recognizing",
    "reconnoitre" : "reconnoiter",
    "reconnoitred" : "reconnoitered",
    "reconnoitres" : "reconnoiters",
    "reconnoitring" : "reconnoitering",
    "refuelled" : "refueled",
    "refuelling" : "refueling",
    "regularisation" : "regularization",
    "regularise" : "regularize",
    "regularised" : "regularized",
    "regularises" : "regularizes",
    "regularising" : "regularizing",
    "remodelled" : "remodeled",
    "remodelling" : "remodeling",
    "remould" : "remold",
    "remoulded" : "remolded",
    "remoulding" : "remolding",
    "remoulds" : "remolds",
    "reorganisation" : "reorganization",
    "reorganisations" : "reorganizations",
    "reorganise" : "reorganize",
    "reorganised" : "reorganized",
    "reorganises" : "reorganizes",
    "reorganising" : "reorganizing",
    "revelled" : "reveled",
    "reveller" : "reveler",
    "revellers" : "revelers",
    "revelling" : "reveling",
    "revitalise" : "revitalize",
    "revitalised" : "revitalized",
    "revitalises" : "revitalizes",
    "revitalising" : "revitalizing",
    "revolutionise" : "revolutionize",
    "revolutionised" : "revolutionized",
    "revolutionises" : "revolutionizes",
    "revolutionising" : "revolutionizing",
    "rhapsodise" : "rhapsodize",
    "rhapsodised" : "rhapsodized",
    "rhapsodises" : "rhapsodizes",
    "rhapsodising" : "rhapsodizing",
    "rigour" : "rigor",
    "rigours" : "rigors",
    "ritualised" : "ritualized",
    "rivalled" : "rivaled",
    "rivalling" : "rivaling",
    "romanticise" : "romanticize",
    "romanticised" : "romanticized",
    "romanticises" : "romanticizes",
    "romanticising" : "romanticizing",
    "rumour" : "rumor",
    "rumoured" : "rumored",
    "rumours" : "rumors",
    "sabre" : "saber",
    "sabres" : "sabers",
    "saltpetre" : "saltpeter",
    "sanitise" : "sanitize",
    "sanitised" : "sanitized",
    "sanitises" : "sanitizes",
    "sanitising" : "sanitizing",
    "satirise" : "satirize",
    "satirised" : "satirized",
    "satirises" : "satirizes",
    "satirising" : "satirizing",
    "saviour" : "savior",
    "saviours" : "saviors",
    "savour" : "savor",
    "savoured" : "savored",
    "savouries" : "savories",
    "savouring" : "savoring",
    "savours" : "savors",
    "savoury" : "savory",
    "scandalise" : "scandalize",
    "scandalised" : "scandalized",
    "scandalises" : "scandalizes",
    "scandalising" : "scandalizing",
    "sceptic" : "skeptic",
    "sceptical" : "skeptical",
    "sceptically" : "skeptically",
    "scepticism" : "skepticism",
    "sceptics" : "skeptics",
    "sceptre" : "scepter",
    "sceptres" : "scepters",
    "scrutinise" : "scrutinize",
    "scrutinised" : "scrutinized",
    "scrutinises" : "scrutinizes",
    "scrutinising" : "scrutinizing",
    "secularisation" : "secularization",
    "secularise" : "secularize",
    "secularised" : "secularized",
    "secularises" : "secularizes",
    "secularising" : "secularizing",
    "sensationalise" : "sensationalize",
    "sensationalised" : "sensationalized",
    "sensationalises" : "sensationalizes",
    "sensationalising" : "sensationalizing",
    "sensitise" : "sensitize",
    "sensitised" : "sensitized",
    "sensitises" : "sensitizes",
    "sensitising" : "sensitizing",
    "sentimentalise" : "sentimentalize",
    "sentimentalised" : "sentimentalized",
    "sentimentalises" : "sentimentalizes",
    "sentimentalising" : "sentimentalizing",
    "sepulchre" : "sepulcher",
    "sepulchres" : "sepulchers",
    "serialisation" : "serialization",
    "serialisations" : "serializations",
    "serialise" : "serialize",
    "serialised" : "serialized",
    "serialises" : "serializes",
    "serialising" : "serializing",
    "sermonise" : "sermonize",
    "sermonised" : "sermonized",
    "sermonises" : "sermonizes",
    "sermonising" : "sermonizing",
    "sheikh" : "sheik",
    "shovelled" : "shoveled",
    "shovelling" : "shoveling",
    "shrivelled" : "shriveled",
    "shrivelling" : "shriveling",
    "signalise" : "signalize",
    "signalised" : "signalized",
    "signalises" : "signalizes",
    "signalising" : "signalizing",
    "signalled" : "signaled",
    "signalling" : "signaling",
    "smoulder" : "smolder",
    "smouldered" : "smoldered",
    "smouldering" : "smoldering",
    "smoulders" : "smolders",
    "snivelled" : "sniveled",
    "snivelling" : "sniveling",
    "snorkelled" : "snorkeled",
    "snorkelling" : "snorkeling",
    "snowplough" : "snowplow",
    "snowploughs" : "snowplow",
    "socialisation" : "socialization",
    "socialise" : "socialize",
    "socialised" : "socialized",
    "socialises" : "socializes",
    "socialising" : "socializing",
    "sodomise" : "sodomize",
    "sodomised" : "sodomized",
    "sodomises" : "sodomizes",
    "sodomising" : "sodomizing",
    "solemnise" : "solemnize",
    "solemnised" : "solemnized",
    "solemnises" : "solemnizes",
    "solemnising" : "solemnizing",
    "sombre" : "somber",
    "specialisation" : "specialization",
    "specialisations" : "specializations",
    "specialise" : "specialize",
    "specialised" : "specialized",
    "specialises" : "specializes",
    "specialising" : "specializing",
    "spectre" : "specter",
    "spectres" : "specters",
    "spiralled" : "spiraled",
    "spiralling" : "spiraling",
    "splendour" : "splendor",
    "splendours" : "splendors",
    "squirrelled" : "squirreled",
    "squirrelling" : "squirreling",
    "stabilisation" : "stabilization",
    "stabilise" : "stabilize",
    "stabilised" : "stabilized",
    "stabiliser" : "stabilizer",
    "stabilisers" : "stabilizers",
    "stabilises" : "stabilizes",
    "stabilising" : "stabilizing",
    "standardisation" : "standardization",
    "standardise" : "standardize",
    "standardised" : "standardized",
    "standardises" : "standardizes",
    "standardising" : "standardizing",
    "stencilled" : "stenciled",
    "stencilling" : "stenciling",
    "sterilisation" : "sterilization",
    "sterilisations" : "sterilizations",
    "sterilise" : "sterilize",
    "sterilised" : "sterilized",
    "steriliser" : "sterilizer",
    "sterilisers" : "sterilizers",
    "sterilises" : "sterilizes",
    "sterilising" : "sterilizing",
    "stigmatisation" : "stigmatization",
    "stigmatise" : "stigmatize",
    "stigmatised" : "stigmatized",
    "stigmatises" : "stigmatizes",
    "stigmatising" : "stigmatizing",
    " storey" : " story",
    " storeys" : " stories",
    "subsidisation" : "subsidization",
    "subsidise" : "subsidize",
    "subsidised" : "subsidized",
    "subsidiser" : "subsidizer",
    "subsidisers" : "subsidizers",
    "subsidises" : "subsidizes",
    "subsidising" : "subsidizing",
    "succour" : "succor",
    "succoured" : "succored",
    "succouring" : "succoring",
    "succours" : "succors",
    "sulphate" : "sulfate",
    "sulphates" : "sulfates",
    "sulphide" : "sulfide",
    "sulphides" : "sulfides",
    "sulphur" : "sulfur",
    "sulphurous" : "sulfurous",
    "summarise" : "summarize",
    "summarised" : "summarized",
    "summarises" : "summarizes",
    "summarising" : "summarizing",
    "swivelled" : "swiveled",
    "swivelling" : "swiveling",
    "symbolise" : "symbolize",
    "symbolised" : "symbolized",
    "symbolises" : "symbolizes",
    "symbolising" : "symbolizing",
    "sympathise" : "sympathize",
    "sympathised" : "sympathized",
    "sympathiser" : "sympathizer",
    "sympathisers" : "sympathizers",
    "sympathises" : "sympathizes",
    "sympathising" : "sympathizing",
    "synchronisation" : "synchronization",
    "synchronise" : "synchronize",
    "synchronised" : "synchronized",
    "synchronises" : "synchronizes",
    "synchronising" : "synchronizing",
    "synthesise" : "synthesize",
    "synthesised" : "synthesized",
    "synthesiser" : "synthesizer",
    "synthesisers" : "synthesizers",
    "synthesises" : "synthesizes",
    "synthesising" : "synthesizing",
    "syphon" : "siphon",
    "syphoned" : "siphoned",
    "syphoning" : "siphoning",
    "syphons" : "siphons",
    "systematisation" : "systematization",
    "systematise" : "systematize",
    "systematised" : "systematized",
    "systematises" : "systematizes",
    "systematising" : "systematizing",
    "tantalise" : "tantalize",
    "tantalised" : "tantalized",
    "tantalises" : "tantalizes",
    "tantalising" : "tantalizing",
    "tantalisingly" : "tantalizingly",
    "tasselled" : "tasseled",
    "technicolour" : "technicolor",
    "temporise" : "temporize",
    "temporised" : "temporized",
    "temporises" : "temporizes",
    "temporising" : "temporizing",
    "tenderise" : "tenderize",
    "tenderised" : "tenderized",
    "tenderises" : "tenderizes",
    "tenderising" : "tenderizing",
    "terrorise" : "terrorize",
    "terrorised" : "terrorized",
    "terrorises" : "terrorizes",
    "terrorising" : "terrorizing",
    "theatre" : "theater",
    "theatregoer" : "theatergoer",
    "theatregoers" : "theatergoers",
    "theatres" : "theaters",
    "theorise" : "theorize",
    "theorised" : "theorized",
    "theorises" : "theorizes",
    "theorising" : "theorizing",
    "tonne" : "ton",
    "tonnes" : "tons",
    "towelled" : "toweled",
    "towelling" : "toweling",
    "toxaemia" : "toxemia",
    "tranquillise" : "tranquilize",
    "tranquillised" : "tranquilized",
    "tranquilliser" : "tranquilizer",
    "tranquillisers" : "tranquilizers",
    "tranquillises" : "tranquilizes",
    "tranquillising" : "tranquilizing",
    "tranquillity" : "tranquility",
    "tranquillize" : "tranquilize",
    "tranquillized" : "tranquilized",
    "tranquillizer" : "tranquilizer",
    "tranquillizers" : "tranquilizers",
    "tranquillizes" : "tranquilizes",
    "tranquillizing" : "tranquilizing",
    "tranquilly" : "tranquility",
    "transistorised" : "transistorized",
    "traumatise" : "traumatize",
    "traumatised" : "traumatized",
    "traumatises" : "traumatizes",
    "traumatising" : "traumatizing",
    "travelled" : "traveled",
    "traveller" : "traveler",
    "travellers" : "travelers",
    "travelling" : "traveling",
    "travelogue" : "travelog",
    "travelogues" : "travelogs",
    "trialled" : "trialed",
    "trialling" : "trialing",
    "tricolour" : "tricolor",
    "tricolours" : "tricolors",
    "trivialise" : "trivialize",
    "trivialised" : "trivialized",
    "trivialises" : "trivializes",
    "trivialising" : "trivializing",
    "tumour" : "tumor",
    "tumours" : "tumors",
    "tunnelled" : "tunneled",
    "tunnelling" : "tunneling",
    "tyrannise" : "tyrannize",
    "tyrannised" : "tyrannized",
    "tyrannises" : "tyrannizes",
    "tyrannising" : "tyrannizing",
    " tyre" : " tire",
    " tyres" : " tires",
    "unauthorised" : "unauthorized",
    "uncivilised" : "uncivilized",
    "underutilised" : "underutilized",
    "unequalled" : "unequaled",
    "unfavourable" : "unfavorable",
    "unfavourably" : "unfavorably",
    "unionisation" : "unionization",
    "unionise" : "unionize",
    "unionised" : "unionized",
    "unionises" : "unionizes",
    "unionising" : "unionizing",
    "unorganised" : "unorganized",
    "unravelled" : "unraveled",
    "unravelling" : "unraveling",
    "unrecognisable" : "unrecognizable",
    "unrecognised" : "unrecognized",
    "unrivalled" : "unrivaled",
    "unsavoury" : "unsavory",
    "untrammelled" : "untrammeled",
    "urbanisation" : "urbanization",
    "urbanise" : "urbanize",
    "urbanised" : "urbanized",
    "urbanises" : "urbanizes",
    "urbanising" : "urbanizing",
    "utilisable" : "utilizable",
    "utilisation" : "utilization",
    "utilise" : "utilize",
    "utilised" : "utilized",
    "utilises" : "utilizes",
    "utilising" : "utilizing",
    "valour" : "valor",
    "vandalise" : "vandalize",
    "vandalised" : "vandalized",
    "vandalises" : "vandalizes",
    "vandalising" : "vandalizing",
    "vaporisation" : "vaporization",
    "vaporise" : "vaporize",
    "vaporised" : "vaporized",
    "vaporises" : "vaporizes",
    "vaporising" : "vaporizing",
    "vapour" : "vapor",
    "vapours" : "vapors",
    "verbalise" : "verbalize",
    "verbalised" : "verbalized",
    "verbalises" : "verbalizes",
    "verbalising" : "verbalizing",
    "victimisation" : "victimization",
    "victimise" : "victimize",
    "victimised" : "victimized",
    "victimises" : "victimizes",
    "victimising" : "victimizing",
    "videodisc" : "videodisk",
    "videodiscs" : "videodisks",
    "vigour" : "vigor",
    "visualisation" : "visualization",
    "visualisations" : "visualizations",
    "visualise" : "visualize",
    "visualised" : "visualized",
    "visualises" : "visualizes",
    "visualising" : "visualizing",
    "vocalisation" : "vocalization",
    "vocalisations" : "vocalizations",
    "vocalise" : "vocalize",
    "vocalised" : "vocalized",
    "vocalises" : "vocalizes",
    "vocalising" : "vocalizing",
    "vulcanised" : "vulcanized",
    "vulgarisation" : "vulgarization",
    "vulgarise" : "vulgarize",
    "vulgarised" : "vulgarized",
    "vulgarises" : "vulgarizes",
    "vulgarising" : "vulgarizing",
    "waggon" : "wagon",
    "waggons" : "wagons",
    "watercolour" : "watercolor",
    "watercolours" : "watercolors",
    "weaselled" : "weaseled",
    "weaselling" : "weaseling",
    "westernisation" : "westernization",
    "westernise" : "westernize",
    "westernised" : "westernized",
    "westernises" : "westernizes",
    "westernising" : "westernizing",
    "womanise" : "womanize",
    "womanised" : "womanized",
    "womaniser" : "womanizer",
    "womanisers" : "womanizers",
    "womanises" : "womanizes",
    "womanising" : "womanizing",
    "woollen" : "woolen",
    "woollens" : "woolens",
    "woollies" : "woolies",
    "woolly" : "wooly",
    "worshipped" : "worshiped",
    "worshipping" : "worshiping",
    "worshipper" : "worshiper",
    "yodelled" : "yodeled",
    "yodelling" : "yodeling",
    "yoghourt" : "yogurt",
    "yoghourts" : "yogurts",
    "yoghurt" : "yogurt",
    "yoghurts" : "yogurts"}

In [20]:
def fromBritishToAmerican(sentence):
    for gb, us in normalized_dict.items():
        sentence = sentence.replace(gb, us)
    return sentence

def fromAmericanToBritish(sentence):
    for us, gb in normalized_dict.items():
        sentence = sentence.replace(us, gb)
    return sentence

# Normalize all text to american
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: fromBritishToAmerican(x))

CPU times: user 7.47 s, sys: 13.7 ms, total: 7.49 s
Wall time: 7.51 s


In [22]:
# check reduction in length
curr_len = getAvgLen(df['normalized_en'])
print("Reduction in average length: ", ((avg_len_beg - curr_len) / avg_len_beg) * 100, "%")

Reduction in average length:  3.980712561000906 %


### 4.12 Check spelling - NOT WORKING CURRENTLY ###

In [23]:
#def spellchecker(sentence):
#    try:
#        d = enchant.Dict("en_US")
#    except ImportError:
#        print ("Enchant Library Not Found. Spell Checking Failed.")
#        return sentence
#    options = []
#    newt = ""
#    ccount = 0
#    fail = "no"
#    if sentence:
#        for word in sentence.split(" "):
#            if d.check(word) is True:
#                newt = newt + word + " "
#            else:
#                clist = d.suggest(word)
#                word = clist[ccount]
#                newt = newt + word + " "
#                fail = "yes"
#        return newt 

#%time df['normalized_en'] = df['normalized_en'].apply(lambda x: spellchecker(x))

#print("\n")
#print(df['normalized_en'][line])

In [24]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 3.980712561000906%


In [25]:
# Fix spelling - DOES NOT WORK PROPERLY
#dictionary = enchant.Dict("en_US")
#for index in range(len(df['normalized_en'])):
#    sentence = df['normalized_en'][index]
    
#    words = nltk.word_tokenize(sentence)
    
#    df['normalized_en'] = " ".join([word for word in words if dictionary.check(word)])
    
    #df['normalized_en'][index] = sen
    
#    if (index % 1000 == 0):
#        print("done", index)
        
#print("\n")
#print(df['normalized_en'][line])

In [26]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 3.980712561000906%


### 4.13 Remove proper nouns ###

In [27]:
# Define function to remove proper nouns (has to be done before converting to Lower Case)
def remove_prop_nouns(sentence):
    tagged_sentence = nltk.tag.pos_tag(sentence.split())
    edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    
    return " ".join(edited_sentence)

# Remove proper nouns from all sentences
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: remove_prop_nouns(x))

print("\n")
print(df['normalized_en'][line])

CPU times: user 45.1 s, sys: 293 ms, total: 45.4 s
Wall time: 45.7 s


who was born in is a 22 year-old single woman who resides with her parents. She runs a small gift shop in the de that is known for its selection of perfumes. She has been in business for 5 years. morning from 6:00 am until 2:00 pm and her mother operate their shop. wants this loan to buy more products and diversify her stock. This hard-working young woman not only runs this business; she is wrapping up her studies at the in


In [29]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 12.243033080394007%


### 4.14 Lemmatization to get to root words ###

In [30]:
# Define function to lemmatize each word with its POS tag
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', 
                "N": 'n', 
                "V": 'v', 
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

# Lemmatize all sentences
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: lemmatize_with_postag(x))

print("\n")
print(df['normalized_en'][line])

CPU times: user 1min 9s, sys: 1.08 s, total: 1min 10s
Wall time: 1min 10s


who be bear in be a 22 year-old single woman who reside with her parent She run a small gift shop in the de that be know for it selection of perfume She have be in business for 5 year morning from 6:00 be until 2:00 pm and her mother operate their shop want this loan to buy more product and diversify her stock This hard-working young woman not only run this business she be wrap up her study at the in


In [32]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 17.209030090518105%


### 4.15 Convert to lowercase ###

In [33]:
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: " ".join(x.lower() for x in x.split()))

print("\n")
print(df['normalized_en'][line])

CPU times: user 283 ms, sys: 4.34 ms, total: 287 ms
Wall time: 287 ms


who be bear in be a 22 year-old single woman who reside with her parent she run a small gift shop in the de that be know for it selection of perfume she have be in business for 5 year morning from 6:00 be until 2:00 pm and her mother operate their shop want this loan to buy more product and diversify her stock this hard-working young woman not only run this business she be wrap up her study at the in


In [34]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 17.209030090518105%


### 4.16 Replace common contractions ###

In [35]:
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"i'm", "i am", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"he's", "he is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"she's", "she is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"that's", "that is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"what's", "what is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"here's", "here is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"where's", "where is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"there's", "there is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"who's", "who is", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"\'ll", " will", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"\'ve", " have", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"\'re", " are", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"\'d", " would", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"won't", "will not", x, flags=re.DOTALL))
df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"can't", "cannot", x, flags=re.DOTALL))

print(df['normalized_en'][line])

who be bear in be a 22 year-old single woman who reside with her parent she run a small gift shop in the de that be know for it selection of perfume she have be in business for 5 year morning from 6:00 be until 2:00 pm and her mother operate their shop want this loan to buy more product and diversify her stock this hard-working young woman not only run this business she be wrap up her study at the in


In [36]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 17.20501880412791%


### 4.17 Remove punctuation ###

In [37]:
%time df['normalized_en'] = df['normalized_en'].str.replace('[^\w\s]', ' ')

print("\n")
print(df['normalized_en'][line])

CPU times: user 111 ms, sys: 2.33 ms, total: 114 ms
Wall time: 113 ms


who be bear in be a 22 year old single woman who reside with her parent she run a small gift shop in the de that be know for it selection of perfume she have be in business for 5 year morning from 6 00 be until 2 00 pm and her mother operate their shop want this loan to buy more product and diversify her stock this hard working young woman not only run this business she be wrap up her study at the in


In [38]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 17.20501880412791%


### 4.18 Remove numerics ###

In [39]:
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub("(^|\W)\d+($|\W)", " ", x, flags=re.DOTALL))
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub("[0-9]+", " ", x, flags=re.DOTALL))

print("\n")
print(df['normalized_en'][line])

CPU times: user 278 ms, sys: 2.83 ms, total: 281 ms
Wall time: 280 ms
CPU times: user 127 ms, sys: 262 µs, total: 127 ms
Wall time: 127 ms


who be bear in be a year old single woman who reside with her parent she run a small gift shop in the de that be know for it selection of perfume she have be in business for year morning from   be until   pm and her mother operate their shop want this loan to buy more product and diversify her stock this hard working young woman not only run this business she be wrap up her study at the in


In [40]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 18.033532886678227%


### 4.19 Remove unicode characters ###

In [41]:
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: unidecode.unidecode(x))

print("\n")
print(df['normalized_en'][line])

CPU times: user 8.72 ms, sys: 175 µs, total: 8.9 ms
Wall time: 8.81 ms


who be bear in be a year old single woman who reside with her parent she run a small gift shop in the de that be know for it selection of perfume she have be in business for year morning from   be until   pm and her mother operate their shop want this loan to buy more product and diversify her stock this hard working young woman not only run this business she be wrap up her study at the in


In [42]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 18.033532886678227%


### 4.20 Remove stop words ###

In [43]:
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

print("\n")
print(df['normalized_en'][line])

CPU times: user 1.71 s, sys: 5.98 ms, total: 1.71 s
Wall time: 1.72 s


bear year old single woman reside parent run small gift shop de know selection perfume business year morning pm mother operate shop want loan buy product diversify stock hard working young woman run business wrap study


In [44]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 47.89955347533939%


### 4.21 Remove rare words

In [45]:
# count the number of words and store it using a dict
word_dict = {}
for index in range(len(df['normalized_en'])):
    sentence = df['normalized_en'][index]
    tokens = nltk.tokenize.word_tokenize(sentence)
    for token in tokens:
        if token in word_dict.keys():
            word_dict[token] = word_dict[token] + 1
        else:
            word_dict[token] = 1
word_dict

{'married': 1470,
 'child': 9485,
 'addition': 253,
 'family': 4540,
 'take': 2112,
 'care': 1099,
 'mother': 2234,
 'brother': 361,
 'start': 4143,
 'plant': 250,
 'vegetable': 924,
 'sell': 8878,
 'local': 1472,
 'market': 1801,
 'diversify': 328,
 'tea': 137,
 'nursery': 45,
 'profitable': 186,
 'loan': 14841,
 'able': 4517,
 'improve': 1667,
 'activity': 746,
 'buy': 6265,
 'fertilizer': 352,
 'pesticide': 51,
 'pump': 53,
 'seedling': 40,
 'remove': 133,
 'uncertainty': 1,
 'weather': 42,
 'current': 276,
 'capital': 1834,
 'venture': 234,
 'allow': 994,
 'maximize': 46,
 'potential': 292,
 'complete': 390,
 'high': 1513,
 'school': 5272,
 'never': 223,
 'get': 2104,
 'employment': 196,
 'apprentice': 14,
 'train': 350,
 'extension': 17,
 'officer': 110,
 'go': 1724,
 'getter': 4,
 'main': 730,
 'hobby': 14,
 'teach': 326,
 'music': 34,
 'desert': 5,
 'husband': 3470,
 'responsible': 397,
 'upbringing': 3,
 'two': 3747,
 'without': 247,
 'support': 2260,
 'parent': 764,
 'die': 16

In [46]:
# Get all the words which appears under a set threshold
rareWords = []
threshold = 10
for word in word_dict:
    if word_dict[word] < threshold:
        rareWords.append(word)
rareWords

['uncertainty',
 'getter',
 'desert',
 'upbringing',
 'samaritan',
 'admirable',
 'banking',
 'profitablility',
 'bet',
 'creation',
 'super',
 'sponsored',
 'aha',
 'stocking',
 'fondly',
 'sponsorship',
 'ably',
 'interpret',
 'fancy',
 'compliant',
 'stretch',
 'dramatic',
 'somebody',
 'wheelbarrow',
 'donkey',
 'hail',
 'vegetables',
 'custormers',
 'capability',
 'mobilizes',
 'sphere',
 'feeding',
 'triple',
 'collar',
 'basically',
 'fetching',
 'bonus',
 'annum',
 'bran',
 'molluses',
 'deworming',
 'maintaining',
 'hectic',
 'wet',
 'pilau',
 'masala',
 'ginger',
 'payphone',
 'undergo',
 'topic',
 'taker',
 'circle',
 'organizational',
 'lilian',
 'skills',
 'temporaryily',
 'empowers',
 'upright',
 'tedious',
 'prolong',
 'uptousd',
 'spraying',
 'transporting',
 'religiuou',
 'wither',
 'saami',
 'cultural',
 'limitations',
 'w',
 'danger',
 'arrange',
 'inthe',
 'commiity',
 'mary',
 'suppliment',
 'pasture',
 'setting',
 'foendr',
 'grioup',
 'youngones',
 'convinced',
 

In [47]:
#description = description.apply(lambda text: " ".join(word for word in text.split() if word not in rareWords))
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: " ".join(x for x in x.split() if x not in rareWords))

print("\n")
print(df['normalized_en'][line])

CPU times: user 45.9 s, sys: 75.7 ms, total: 46 s
Wall time: 46.1 s


bear year old single woman reside parent run small gift shop de know selection perfume business year morning pm mother operate shop want loan buy product diversify stock hard working young woman run business wrap study


In [48]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 49.80333690109538%


### 4.22 Remove frequent words ###

In [49]:
# Get all the words which appears under a set threshold
freqWords = []
threshold = 1000
for word in word_dict:
    if word_dict[word] > threshold:
        freqWords.append(word)
freqWords

['married',
 'child',
 'family',
 'take',
 'care',
 'mother',
 'start',
 'sell',
 'local',
 'market',
 'loan',
 'able',
 'improve',
 'buy',
 'capital',
 'high',
 'school',
 'get',
 'go',
 'husband',
 'two',
 'support',
 'ago',
 'good',
 'would',
 'land',
 'lack',
 'fee',
 'marry',
 'business',
 'customer',
 'woman',
 'group',
 'increase',
 'stock',
 'new',
 'product',
 'time',
 'give',
 'wife',
 'grow',
 'help',
 'make',
 'money',
 'keep',
 'purchase',
 'profit',
 'also',
 'hard',
 'work',
 'year',
 'member',
 'use',
 'clothes',
 'receive',
 'first',
 'pay',
 'operate',
 'like',
 'three',
 'income',
 'milk',
 'cow',
 'plan',
 'well',
 'repay',
 'water',
 'shop',
 'want',
 'age',
 'demand',
 'community',
 'supply',
 'one',
 'farm',
 'sale',
 'old',
 'day',
 'store',
 'service',
 'month',
 'area',
 'single',
 'young',
 'life',
 'client',
 'people',
 'offer',
 'enable',
 'large',
 'need',
 'since',
 'hop',
 'small',
 'living',
 'four',
 'many',
 'training',
 'live',
 'run',
 'home',
 'pro

In [50]:
#description = description.apply(lambda text: " ".join(word for word in text.split() if word not in freqWords))
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: " ".join(x for x in x.split() if x not in freqWords))

print("\n")
print(df['normalized_en'][line])

CPU times: user 653 ms, sys: 11.5 ms, total: 664 ms
Wall time: 693 ms


bear reside parent gift know selection perfume morning pm diversify working wrap study


In [51]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 73.31881850983646%


### 4.23 Remove whitespace characters ###

In [52]:
%time df['normalized_en'] = df['normalized_en'].apply(lambda x: re.sub(r"\s+", " ", x))

print("\n")
print(df['normalized_en'][line])

CPU times: user 118 ms, sys: 3.31 ms, total: 121 ms
Wall time: 125 ms


bear reside parent gift know selection perfume morning pm diversify working wrap study


In [53]:
# Get the average length of each description
curr_len = getAvgLen(df['normalized_en'])

print("Reduction in average description length: {}%".format(((avg_len_beg - curr_len) / avg_len_beg) * 100))

Reduction in average description length: 73.31881850983646%


### 4.24 Final text cleanup statistics ###

In [54]:
# Get the average length of each normalized text field
length = []

for sentence in df['normalized_en']:
    length.append(len(sentence))
avg_len_end = np.mean(length)

print("Average description length after pre-processing: {}".format(round(avg_len_end), 2))

Average description length after pre-processing: 273.0


In [55]:
# Difference in average length before and after pre-processing
print("Average description length difference: {}".format(round(avg_len_beg - avg_len_end),2))

Average description length difference: 751.0


In [56]:
# Difference in average length before and after pre-processing
print("Reduced average description length: {}%".format(((avg_len_beg - avg_len_end) / avg_len_beg) * 100))

Reduced average description length: 73.31881850983646%


### 4.25 Save processed text file ###

In [57]:
# save the updated dataframe back
df.to_csv("../data/processed/4.0-gg-processed-text-data.csv", index=False)