# Imports

In [2]:
# Third-Party Imports
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
import torch
from sentence_transformers import SentenceTransformer, util

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()

# Standard Library Imports
import os
import sys
from math import inf

# Local Imports
from queries import get_text_cli
from get_documents import search
from get_documents import article_id

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bhekimaenetja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Utility Functions

In [3]:
def get_text(term=None):
    if not term:
        term = get_text_cli('Enter a search term')
    return search(term)

def load_docs(dirname='corpus'):
    corpus = dict()
    main_path = os.path.join(os.path.dirname('__file__'), dirname)

    for file in os.listdir(main_path):
        with open(os.path.join(main_path, file), 'r') as f:
            id_and_name = tuple(file.split('.')[0].split('-'))
            corpus[id_and_name] = f.read()
    
    return corpus

def chunk_text(text, chunk_len):
    chunks = []
    current_chunk = ""
    sents = nltk.sent_tokenize(text)
    
    for sent in sents:
        if len(nltk.word_tokenize(current_chunk + f" {sent}")) >= chunk_len:
            chunks.append(current_chunk)
            current_chunk = ""
        else:
            current_chunk += f" {sent}"
    
    chunks.append(current_chunk)
    
    return chunks

def cosine_similarity(text_1, text_2, model=None):
    if not model:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    
    embedding_1 = model.encode(text_1, convert_to_tensor=True)
    embedding_2 = model.encode(text_2, convert_to_tensor=True)
    
    return float(util.pytorch_cos_sim(embedding_1, embedding_2))

In [4]:
# Plotting functions
def plot_data(x=None, y=None, z=None, title="", x_label="", y_label="", name="", mode="markers", text="", **traces):
    fig = go.Figure(layout={
        "title": title,
        "xaxis": {"title": x_label},
        "yaxis": {"title": y_label}
    })
    
    if z is None:
        data = go.Scatter(
            x=x,
            y=y,
            mode=mode,
            name=name,
            text=text
        )
    else:
        data = go.Scatter3d(
            x=x,
            y=y,
            z=z,
            mode=mode,
            name=name,
            text=text
        )

    if x is not None and y is not None:
        fig.add_trace(data)
    
    for t in traces:
        fig.add_trace(traces[t])
    
    return fig

def create_trace(x=None, y=None, z=None, name="", mode="lines", text="", marker_size=None):
    if z is None:
        trace = go.Scatter(
            x=x,
            y=y,
            mode=mode,
            name=name,
            text=text,
            marker=dict(size=marker_size)
        )
    else:
        trace = go.Scatter3d(
            x=x,
            y=y,
            z=z,
            mode=mode,
            name=name,
            text=text,
            marker=dict(size=marker_size)
        )
    
    return trace

# WikiBot Query Parsing

In [5]:
import wikipediaapi
from string import punctuation

## Getting Searchable Entites

In [6]:
def get_named_entities(query):
    # Intialise nlp model
    nlp = spacy.load("en_core_web_sm")
    
    # Get entities from queries
    doc = nlp(query)
    entities = { ent.text for ent in doc.ents }
    return entities

def word_tokenize(text, lower_case=False):
    banned = list(punctuation) + nltk.corpus.stopwords.words("english")
    
    if lower_case:
        return [
        w.lower() for w in nltk.word_tokenize(text) 
        if w.lower() not in banned
    ]
    
    return [
        w for w in nltk.word_tokenize(text) 
        if w.lower() not in banned
    ]

def get_improper_nouns(query):
    lemma = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(word_tokenize(query))
    return {
        lemma.lemmatize(tag[0]).lower() 
        for tag in pos_tags 
        if tag[-1] in ("NN", "NNS")
    }

def searchable_entities(query):
    improper_nouns = get_improper_nouns(query)
    named_entities = get_named_entities(query)
    return improper_nouns.union(named_entities)

In [7]:
query = "How many movies has Tom Cruise been in?"

In [8]:
searchable_entities(query)

{'Tom Cruise', 'movie'}

## Building Wikipedia Corpus

In [9]:
def build_wiki_corpus(search_ents, corpus=None):
    wiki = wikipediaapi.Wikipedia('en')
    
    if corpus:
        ids = [k[0] for k in corpus.keys()]
    else:
        corpus = dict()
        ids = []
    
    for ent in search_ents:
        page = wiki.page(ent)
        if page.exists():
            doc_id = article_id(ent)
            if doc_id not in ids:
                corpus[(doc_id, page.title)] = page.text
                ids.append(doc_id)
            else:
                print(f"Ha! {ent} is already in there!")
    
    return corpus    

In [10]:
ents = searchable_entities("Whats's the difference between a lion, tiger, leopard, cheetah, and meerkat?")
ents

{'cheetah', 'difference', 'leopard', 'lion', 'meerkat', 'tiger'}

In [11]:
old_corps = load_docs()
old_corps

{('Q34706',
  'Leopard'): 'See text\nThe leopard (Panthera pardus) is one of the five extant species in the genus Panthera, a member of the cat family, Felidae. It occurs in a wide range in sub-Saharan Africa, in some parts of Western and Central Asia, Southern Russia, and on the Indian subcontinent to Southeast and East Asia. It is listed as Vulnerable on the IUCN Red List because leopard populations are threatened by habitat loss and fragmentation, and are declining in large parts of the global range. The leopard is considered locally extinct in Hong Kong, Singapore, South Korea, Jordan, Morocco, Togo, the United Arab Emirates, Uzbekistan, Lebanon, Mauritania, Kuwait, Syria, Libya, Tunisia and most likely in North Korea, Gambia, Laos, Lesotho, Tajikistan, Vietnam and Israel.\nContemporary records suggest that the leopard occurs in only 25% of its historical global range.\nCompared to other wild cats, the leopard has relatively short legs and a long body with a large skull. Its fur is

In [12]:
new_corps = build_wiki_corpus(ents, old_corps)

for k, v in new_corps.items():
    print(k, v)
    print(f"\n\n{100*'='}\n")

Ha! tiger is already in there!
Ha! leopard is already in there!
Ha! cheetah is already in there!
Ha! lion is already in there!
('Q34706', 'Leopard') See text
The leopard (Panthera pardus) is one of the five extant species in the genus Panthera, a member of the cat family, Felidae. It occurs in a wide range in sub-Saharan Africa, in some parts of Western and Central Asia, Southern Russia, and on the Indian subcontinent to Southeast and East Asia. It is listed as Vulnerable on the IUCN Red List because leopard populations are threatened by habitat loss and fragmentation, and are declining in large parts of the global range. The leopard is considered locally extinct in Hong Kong, Singapore, South Korea, Jordan, Morocco, Togo, the United Arab Emirates, Uzbekistan, Lebanon, Mauritania, Kuwait, Syria, Libya, Tunisia and most likely in North Korea, Gambia, Laos, Lesotho, Tajikistan, Vietnam and Israel.
Contemporary records suggest that the leopard occurs in only 25% of its historical global r

# Document Clustering

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [89]:
def doc_clustering(corpus, num_clusters=2):
    # Get original documents
    docs = list(corpus.items())
    
    # Preprocess documents
    processed_docs = [
        ' '.join(word_tokenize(doc[1])) 
        for doc in docs
    ]
    
    # Extract features from documents
    vectoriser = TfidfVectorizer()
    X = vectoriser.fit_transform(processed_docs)
    
    # Cluster documents
    kmeans = KMeans(n_clusters=num_clusters, init="random", max_iter=2000)
    kmeans.fit(X)
    
    # Principal components
#     min_x = min(X.shape)
#     num_comps = min(min_x, 3)
#     print(X.toarray(), X.shape, min_x, num_comps)
    num_comps = 3 if num_clusters > 2 else 2
    pca = PCA(n_components=num_comps)
    prin_comps = pca.fit_transform(X.toarray())
    
    # Identify Themes
    cluster_data = {
        "name": [],
        "themes": []
    }
    
    for i in range(num_clusters):
        centroid = kmeans.cluster_centers_[i]
        top_words_idx = centroid.argsort()[::-1][:5]
        top_words = [vectoriser.get_feature_names_out()[idx] for idx in top_words_idx]
        cluster_data["name"].append(i)
        cluster_data["themes"].append(" ".join(top_words))
    
    centroid_comps = pca.fit_transform(kmeans.cluster_centers_)
    
    cluster_data["x_coord"] = centroid_comps[:, 0]
    cluster_data["y_coord"] = centroid_comps[:, 1]
    if num_comps > 2:
        cluster_data["z_coord"] = centroid_comps[:, 2]
    
    doc_data = {
        "doc_name": [doc[0][1] for doc in docs],
        "x_coord": prin_comps[:, 0],
        "y_coord": prin_comps[:, 1],
        "cluster": kmeans.labels_
    }
    if num_comps > 2:
        doc_data.update({"z_coord": prin_comps[:, 2]})
    
    return pd.DataFrame.from_dict(cluster_data), pd.DataFrame.from_dict(doc_data)

def ideal_cluster_number(X, cluster_range=10):
    # Initialise square distance sums and cluster range
    square_distance_sums = []
    n_range = np.arange(2, cluster_range + 1)
    
    for i in n_range:
        kmeans = KMeans(n_clusters=i, init="random", max_iter=2000)
        kmeans.fit(X)
        square_distance_sums.append(kmeans.inertia_)
    
    return n_range, square_distance_sums

def test_ideal_cluster_number(corpus, cluster_range=10):
    # Get original documents
    docs = list(corpus.items())
    
    # Preprocess documents
    processed_docs = [
        ' '.join(word_tokenize(doc[1])) 
        for doc in docs
    ]
    
    # Extract features from documents
    vectoriser = TfidfVectorizer()
    X = vectoriser.fit_transform(processed_docs)
    
    return ideal_cluster_number(X, cluster_range)

In [90]:
# Loading corpora for clustering
def load_corpora_for_clustering(corpora):
    new_corpus = {}
    
    for c in corpora:
        new_corpus.update(load_docs(f"corpus{c}"))
    
    return new_corpus

In [91]:
new_corpus = load_corpora_for_clustering(["", 2, 3, 4, 5, 6])
new_corpus.keys()

dict_keys([('Q34706', 'Leopard'), ('Q677014', 'Lynx'), ('Q19939', 'Tiger'), ('Q140', 'Lion'), ('Q35694', 'Jaguar'), ('Q146', 'Cat'), ('Q23907', 'Cheetah'), ('Q35255', 'Cougar'), ('Q175535', 'Matt Damon'), ('Q37079', 'Tom Cruise'), ('Q41142', 'Jane Fonda'), ('Q234959', 'Hayley Atwell'), ('Q10738', 'Dwayne Johnson'), ('Q40096', 'Will Smith'), ('Q54314', 'Chris Hemsworth'), ('Q873', 'Meryl Streep'), ('Q83488', 'Mesut Özil'), ('Q615', 'Lionel Messi'), ('Q214204', 'Eden Hazard'), ('Q11571', 'Cristiano Ronaldo'), ('Q266613', 'Wayne Rooney'), ('Q142794', 'Neymar'), ('Q207', 'George W'), ('Q6279', 'Joe Biden'), ('Q22686', 'Donald Trump'), ('Q1124', 'Bill Clinton'), ('Q9960', 'Ronald Reagan'), ('Q76', 'Barack Obama'), ('Q23505', 'George H'), ('Q2', 'Earth'), ('Q332', 'Neptune'), ('Q193', 'Saturn'), ('Q525', 'Sun'), ('Q12176', 'Alpha Centauri'), ('Q313', 'Venus'), ('Q324', 'Uranus'), ('Q319', 'Jupiter'), ('Q405', 'Moon'), ('Q308', 'Mercury (planet)'), ('Q339', 'Pluto'), ('Q111', 'Mars'), ('Q2667

In [92]:
cluster_data, doc_data = doc_clustering(new_corpus, 6)





In [93]:
cluster_data

Unnamed: 0,name,themes,x_coord,y_coord,z_coord
0,0,prey cats lynx leopard jaguar,-0.220724,-0.027327,-0.295639
1,1,renault audi bmw tesla toyota,-0.140843,-0.003011,-0.071285
2,2,film scored fonda streep goal,-0.114455,-0.001639,-0.059775
3,3,bush reagan clinton president administration,0.423278,-0.367657,0.024679
4,4,biden obama trump president senate,0.325828,0.430968,0.028723
5,5,earth planet sun solar moon,-0.273084,-0.031334,0.373296


In [94]:
doc_data

Unnamed: 0,doc_name,x_coord,y_coord,cluster,z_coord
0,Leopard,-0.055976,0.038094,0,-0.428154
1,Lynx,-0.065335,0.017081,0,-0.21971
2,Tiger,-0.05903,0.038424,0,-0.363409
3,Lion,-0.061176,0.042922,0,-0.355639
4,Jaguar,-0.057527,0.034957,0,-0.39754
5,Cat,-0.053636,0.031461,0,-0.286167
6,Cheetah,-0.054258,0.045516,0,-0.368491
7,Cougar,-0.060764,0.042417,0,-0.400472
8,Matt Damon,-0.116541,0.020814,2,-0.031874
9,Tom Cruise,-0.113527,0.018331,2,-0.03275


In [95]:
# n_range, sd_sums = test_ideal_cluster_number(new_corpus, 14)

In [96]:
# plot_data(
#     n_range,
#     sd_sums,
#     "Number of Clusters vs Square Distance Sum",
#     "Number of Clusters",
#     "Square Distance Sum",
#     "# Clusters vs SD Sum",
#     mode="lines+markers"
# ).show()

In [97]:
plot_data(
    x=doc_data['x_coord'],
    y=doc_data['y_coord'],
    z=doc_data['z_coord'],
    title="Docs and their Clusters",
    name="Documents",
    mode="markers",
    text=doc_data['doc_name']
).show()

In [98]:
def plot_clusters(doc_data, cluster_data, is_3d=True):
    traces = dict()

    for i, theme in enumerate(cluster_data['themes']):
        cluster_name = f"Cluster {i+1}: {theme}"
        cluster_points = doc_data[doc_data['cluster'] == i]
        
        z = cluster_points['z_coord'] if is_3d else None
        
        new_trace = create_trace(
            x=cluster_points['x_coord'],
            y=cluster_points['y_coord'],
            z=z,
            name=cluster_name,
            mode="markers",
            text=cluster_points['doc_name'],
            marker_size=5
        )
        traces[cluster_name] = new_trace
    
    return plot_data(**traces)

In [99]:
plot_clusters(doc_data, cluster_data)

In [25]:
for c in cluster_data['themes']:
    print(c)

earth planet sun solar moon
prey cats lynx leopard jaguar
film scored fonda streep goal
renault audi bmw tesla toyota
reagan soviet administration president tax
bush obama biden clinton trump


In [32]:
new_corpus

{('Q34706',
  'Leopard'): 'See text\nThe leopard (Panthera pardus) is one of the five extant species in the genus Panthera, a member of the cat family, Felidae. It occurs in a wide range in sub-Saharan Africa, in some parts of Western and Central Asia, Southern Russia, and on the Indian subcontinent to Southeast and East Asia. It is listed as Vulnerable on the IUCN Red List because leopard populations are threatened by habitat loss and fragmentation, and are declining in large parts of the global range. The leopard is considered locally extinct in Hong Kong, Singapore, South Korea, Jordan, Morocco, Togo, the United Arab Emirates, Uzbekistan, Lebanon, Mauritania, Kuwait, Syria, Libya, Tunisia and most likely in North Korea, Gambia, Laos, Lesotho, Tajikistan, Vietnam and Israel.\nContemporary records suggest that the leopard occurs in only 25% of its historical global range.\nCompared to other wild cats, the leopard has relatively short legs and a long body with a large skull. Its fur is

In [105]:
sents = {
    ("a", "b"): "This is a sentence", 
    ("c", "d"): "This is another sentence", 
    ("e", "f"): "This is yet another sentence",
    ("g", "h"): "Getting fed up now!",
}
cd, dd = doc_clustering(sents, 3)





In [106]:
cd

Unnamed: 0,name,themes,x_coord,y_coord,z_coord
0,0,getting fed yet sentence another,0.877283,-0.027428,1.3084100000000001e-17
1,1,another sentence yet getting fed,-0.397563,0.45303,1.3084100000000001e-17
2,2,sentence yet getting fed another,-0.47972,-0.425602,1.3084100000000001e-17


In [107]:
dd

Unnamed: 0,doc_name,x_coord,y_coord,cluster,z_coord
0,b,-0.28408,0.575185,2,-0.153235
1,d,-0.376502,-0.084431,1,0.392577
2,f,-0.325733,-0.469227,1,-0.259397
3,h,0.986315,-0.021527,0,0.020055


In [108]:
plot_clusters(dd, cd, "z_coord" in cd)

In [61]:
"y_coord" in cd

True

In [67]:
len(cd)

2