In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import re
import nltk
import string
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' and 'wordnet' packages used below
nltk.download('stopwords')
nltk.download('wordnet')

# English stop-word list; read by preproc_query when stripping stop words
stopWords = stopwords.words("english")
    
# WordNet lemmatizer instance; read by preproc_query when lemmatizing tokens
wn = nltk.WordNetLemmatizer()

from sklearn.decomposition import PCA

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
"""
Desc: Standard preprocessing of query
Input: query (string) - text to be cleaned
Output: query (list of string) - cleaned text
"""
def preproc_query(query):
    
    # Lower case
    query = query.lower()

    # Remove unicode characters (emojis, etc.)
    query = query.encode('ascii', 'ignore').decode('utf-8')

    # Reduce repeated letters
    query= re.sub(re.compile(r"(.)\1{2,}"), r"\1\1", query)

    # Remove stop words
    pat = r'\b(?:{})\b'.format('|'.join(stopWords))
    query = re.sub(pat, '', query)

    # Remove punctuation
    query = re.sub('[%s]' % re.escape(string.punctuation), ' ', query)

    # Remove stop words again (in case stop word was next to punctuation)
    query = re.sub(pat, '', query)

    # Remove extra blank spaces
    query = re.sub(r'\s{2,}', ' ', query)

    # Lemmatize and tokenize
    query = [wn.lemmatize(word) for word in query.split()]
    
    return query

In [3]:
# Read only the URL and cleaned-text columns from the CSV
df = pd.read_csv("kingston_clean.csv", usecols=["url", "clean_text"])

# Preview the first rows, then report the number of rows loaded
display(df.head())
print(df.shape[0])

Unnamed: 0,url,clean_text
0,https://www.cityofkingston.ca/,home city kingston navigation skip content sk...
1,https://www.cityofkingston.ca/residents/commun...,poverty reduction city kingston navigation sk...
2,https://www.cityofkingston.ca/residents/commun...,ontario work city kingston navigation skip co...
3,https://www.cityofkingston.ca/residents/commun...,municipal fee assistance program city kingsto...
4,https://www.cityofkingston.ca/residents/commun...,housing city kingston navigation skip content...


10689


In [4]:
# Wrap each cleaned document's tokens in a TaggedDocument tagged with its row position
text_docs = [
    TaggedDocument(words=text.split(' '), tags=[doc_id])
    for doc_id, text in enumerate(df["clean_text"])
]

In [5]:
%%time

# Instantiate model: 64-dimensional vectors, context window of 2,
# keep every word (min_count=1), 8 worker threads, 40 epochs
model = Doc2Vec(vector_size=64, window=2, min_count=1, workers=8, epochs=40)

# Build vocab from the tagged documents
model.build_vocab(text_docs)

# Train model
model.train(text_docs, total_examples=model.corpus_count, epochs=model.epochs)

# Infer one vector per document. Iterate over the column values directly:
# df['clean_text'][i] is a label lookup, which only works when the index is 0..n-1.
text2vec = [model.infer_vector(doc.split(' ')) for doc in df['clean_text']]

# Store the vectors as plain Python lists in a new column
df['text_vec'] = np.array(text2vec).tolist()

# Display
display(df.head())

Unnamed: 0,url,clean_text,text_vec
0,https://www.cityofkingston.ca/,home city kingston navigation skip content sk...,"[1.3450957536697388, -0.1454906463623047, -0.0..."
1,https://www.cityofkingston.ca/residents/commun...,poverty reduction city kingston navigation sk...,"[1.7279950380325317, 0.807035505771637, -1.771..."
2,https://www.cityofkingston.ca/residents/commun...,ontario work city kingston navigation skip co...,"[2.4424219131469727, 3.1754353046417236, 1.267..."
3,https://www.cityofkingston.ca/residents/commun...,municipal fee assistance program city kingsto...,"[0.7618685960769653, 1.3048862218856812, 1.570..."
4,https://www.cityofkingston.ca/residents/commun...,housing city kingston navigation skip content...,"[1.2927143573760986, 0.23918530344963074, 0.44..."


Wall time: 5min 42s


In [6]:
# Persist the model to the file "doc2vec_kingston" in the working directory
model.save("doc2vec_kingston")

In [7]:
# Raw search text to embed
query_text = "Queen's university"

# Clean and tokenize it with preproc_query
query_tokens = preproc_query(query_text)

# Infer the query's vector with the model and hold it as a numpy array
query = np.array(model.infer_vector(query_tokens))

print(query)

[ 0.31971943 -0.13755491 -0.17359641 -0.42708182 -0.05357218 -0.1644662
  0.25183767  0.3060779   0.23542143  0.38607714 -0.21343933 -0.24547648
 -0.2937509   0.00665796  0.03709129 -0.07275939  0.26697975  0.38631687
 -0.08295317 -0.15205279 -0.19144835 -0.00707507  0.09481531 -0.09329452
  0.31234917 -0.52970743  0.10058472 -0.25680634 -0.20257765  0.09781083
 -0.2603708  -0.23708706 -0.39939818 -0.12630107  0.32891896 -0.1431428
  0.20999627 -0.22820976  0.06280139 -0.19122955  0.03298479 -0.15545866
 -0.1962802   0.14430775 -0.18865132 -0.22804849 -0.11102099  0.02391251
  0.02678386 -0.31591177  0.01975598 -0.39699438 -0.45853597 -0.20703813
 -0.15655391 -0.36821085  0.31201756 -0.13861892 -0.08253559 -0.5097666
 -0.17030211 -0.1944427  -0.1106058  -0.14506228]


In [8]:
# Compute cosine similarity between the query and every text vector in one
# vectorized pass: the query norm is computed once instead of once per row.
doc_matrix = np.array(df["text_vec"].tolist())   # one row per document
query_norm = np.linalg.norm(query)
doc_norms = np.linalg.norm(doc_matrix, axis=1)

# NOTE: the column is named "cos_dist" (kept because later cells read it),
# but the value is cosine *similarity*: larger means closer to the query.
df["cos_dist"] = (doc_matrix @ query) / (doc_norms * query_norm)

# Display
display(df.head())

Unnamed: 0,url,clean_text,text_vec,cos_dist
0,https://www.cityofkingston.ca/,home city kingston navigation skip content sk...,"[1.3450957536697388, -0.1454906463623047, -0.0...",0.626713
1,https://www.cityofkingston.ca/residents/commun...,poverty reduction city kingston navigation sk...,"[1.7279950380325317, 0.807035505771637, -1.771...",0.388114
2,https://www.cityofkingston.ca/residents/commun...,ontario work city kingston navigation skip co...,"[2.4424219131469727, 3.1754353046417236, 1.267...",0.295472
3,https://www.cityofkingston.ca/residents/commun...,municipal fee assistance program city kingsto...,"[0.7618685960769653, 1.3048862218856812, 1.570...",0.368819
4,https://www.cityofkingston.ca/residents/commun...,housing city kingston navigation skip content...,"[1.2927143573760986, 0.23918530344963074, 0.44...",0.450808


In [9]:
# Rank rows by "cos_dist" (highest first), renumber them from 0,
# and keep only the first 100
df = (
    df.sort_values(by=["cos_dist"], ascending=False, ignore_index=True)
      .head(100)
)

# Display
display(df)

Unnamed: 0,url,clean_text,text_vec,cos_dist
0,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.1326957494020462, 0.0041216714307665825, -0...",0.950235
1,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.1448322981595993, -0.026491964235901833, -0...",0.946684
2,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.15238989889621735, -0.011413750238716602, -...",0.943909
3,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.22128915786743164, -0.028457531705498695, -...",0.943192
4,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.18223093450069427, -0.03193990886211395, -0...",0.941597
...,...,...,...,...
95,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.13631120324134827, -0.004840785637497902, -...",0.929740
96,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.1413699984550476, -0.012072962708771229, -0...",0.929726
97,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.15048570930957794, -0.01315146591514349, -0...",0.929601
98,https://www.cityofkingston.ca/documents/10180/...,city kingston file available file available f...,"[0.16932696104049683, 0.010494407266378403, -0...",0.929357


In [10]:
# Print the full URL of every row, one per line
for url in df["url"]:
    print(url)

https://www.cityofkingston.ca/documents/10180/7104160/COU_A3114-14317.pdf/36e1fb90-0695-4088-b3c6-60581399ea58
https://www.cityofkingston.ca/documents/10180/745955/COU_A1012-12103.pdf/705819eb-2414-40c0-8a75-8098d4f89d7d
https://www.cityofkingston.ca/documents/10180/108884/COU_A0413-13060.pdf/00ef3cf0-e56e-4e9f-ba8c-02d62f8fbed7
https://www.cityofkingston.ca/documents/10180/12823829/COU_A0716-ARCP31.pdf/9bbfe8ad-d51f-47f7-a242-c8d2c441e9aa
https://www.cityofkingston.ca/documents/10180/7589760/COU_A0315-15014.pdf/7857d22c-b8d6-45e2-b0ab-a370c0e0ad85
https://www.cityofkingston.ca/documents/10180/2574273/EIT_A0913-SchedA.pdf/07e98bea-fe5a-45bc-84ac-60eefe28a666
https://www.cityofkingston.ca/documents/10180/10419038/COU_A1915-15339.pdf/ce22c1e4-01ff-484c-8cdb-6f06c416a662
https://www.cityofkingston.ca/documents/10180/13695566/PLN_A0916-16031.pdf/10e5e93a-23cd-43a4-b9ee-015e9db8fb40
https://www.cityofkingston.ca/documents/10180/561782/COU_A11-13151.pdf/db41ea6e-a663-49b6-9d3e-8f9f63b49609
h

In [11]:
# Perform PCA to reduce the text vectors to 2 dimensions
pca = PCA(n_components=2)

# Expand the list-valued "text_vec" column into one numeric column per dimension
vec_matrix = pd.DataFrame(df["text_vec"].to_list())

# Fit and project. Reuse df's index so the axis=1 concat below aligns
# row-for-row even if df's index is not 0..n-1.
df_vec = pd.DataFrame(
    pca.fit_transform(vec_matrix),
    columns=['pca_x', 'pca_y'],
    index=df.index,
)

# Min-max normalize both pca dimensions to [0, 1] in one column-wise operation
col_min = df_vec.min()
df_vec = (df_vec - col_min) / (df_vec.max() - col_min)

# Concatenate df_vec to df
df = pd.concat([df, df_vec], axis=1)

# Select columns
df = df[["url", "pca_x", "pca_y", "cos_dist"]]

# Display
display(df)

Unnamed: 0,url,pca_x,pca_y,cos_dist
0,https://www.cityofkingston.ca/documents/10180/...,0.396034,0.279686,0.950235
1,https://www.cityofkingston.ca/documents/10180/...,0.259847,0.470561,0.946684
2,https://www.cityofkingston.ca/documents/10180/...,0.411407,0.383228,0.943909
3,https://www.cityofkingston.ca/documents/10180/...,0.921991,0.406916,0.943192
4,https://www.cityofkingston.ca/documents/10180/...,0.690915,0.559389,0.941597
...,...,...,...,...
95,https://www.cityofkingston.ca/documents/10180/...,0.413022,0.573016,0.929740
96,https://www.cityofkingston.ca/documents/10180/...,0.465175,0.692867,0.929726
97,https://www.cityofkingston.ca/documents/10180/...,0.155057,0.356693,0.929601
98,https://www.cityofkingston.ca/documents/10180/...,0.246449,0.302546,0.929357


In [12]:
# Write df to "query_results.csv"; index=False leaves the row index out of the file
df.to_csv("query_results.csv", index=False)