<a href="https://colab.research.google.com/github/ashwinjosep/swm-patent-dashboard/blob/master/LDA_Document_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSE 573 - Semantic Web Mining | Spring 2020 | Group 19
# Document clustering and visualization

**Setting up of environment and imports**

In [1]:
import os
import re
import time
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim import corpora
from gensim.models import LdaModel
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from scipy.stats import entropy

  import pandas.util.testing as tm


In [2]:
# NLTK resources needed by the preprocessing functions below.
# 'stopwords' backs stop_words and 'punkt' backs nltk.word_tokenize.
# NOTE(review): 'wordnet' is not referenced by any visible cell - confirm it is still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Constants
NUMBER_OF_SIMILAR_DOCS = 15  # how many nearest documents to keep per query patent
NUMBER_OF_TOPICS = 10        # number of LDA topics
OUTPUT_PATH_DIR = '/content/gdrive/results/'

# Variables
# Stored as a frozenset so that the per-token membership test in
# remove_stop_words is O(1) instead of a linear scan of the stop word list.
stop_words = frozenset(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Use Google drive to load and store data files. 
Mounting drive as storage.**

In [0]:
from google.colab import drive

# Every data path used in this notebook lives under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

**Function definitions**

In [0]:
def initial_clean(text):
    """
    Strip URLs, e-mail addresses and non-letter characters, lower-case the
    text and split it into tokens.

    :param text: raw document text (str)
    :return: list of lower-cased tokens produced by nltk.word_tokenize
    """
    # Raw string: \S and \@ are regex escapes, not Python string escapes.
    # Any whitespace-delimited chunk containing "http", "www" or "@" is dropped.
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    # Keep letters and ALL whitespace. Deleting newlines/tabs (as a plain
    # "[^a-zA-Z ]" pattern does) would glue the last word of one line to the
    # first word of the next.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()  # lower case the text
    return nltk.word_tokenize(text)

In [0]:
def remove_stop_words(text):
    """
    Drop every token that appears in the module-level stop_words collection.

    :param text: iterable of tokens
    :return: list of the tokens that are not stop words, in their original order
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def stem_words(text):
    """
    Stem every token so that plural and singular forms are treated the same,
    then discard tokens that are a single character long.

    :param text: iterable of tokens
    :return: list of stemmed tokens with length > 1
    """
    stemmed = []
    for word in text:
        try:
            word = stemmer.stem(word)
        except IndexError:
            # The word "oed" is known to break the stemmer. Keep the raw token
            # instead of abandoning stemming for the whole document.
            pass
        if len(word) > 1:  # make sure we have no 1 letter words
            stemmed.append(word)
    return stemmed


def apply_all(text):
    """
    Run the full preprocessing pipeline on one document:
    clean and tokenize, remove stop words, then stem.
    """
    tokens = initial_clean(text)
    tokens = remove_stop_words(tokens)
    return stem_words(tokens)


In [0]:
def train_lda(data, num_topics=None, chunksize=300, passes=3, random_state=None):
    """
    Train an LDA model on the 'tokenized' column of a dataframe.

    :param data: dataframe with a 'tokenized' column holding a token list per document
    :param num_topics: number of topics; defaults to NUMBER_OF_TOPICS when None
    :param chunksize: number of documents per training chunk
    :param passes: number of passes over the corpus (3 by default, so the
                   distributions can stabilize)
    :param random_state: seed forwarded to LdaModel; pass an int for a
                         reproducible model (None keeps gensim's default behaviour)
    :return: (dictionary, corpus, lda) - the gensim Dictionary, the bag-of-words
             corpus built from it, and the trained LdaModel
    """
    if num_topics is None:
        # Resolved at call time so the notebook-level constant is honoured.
        num_topics = NUMBER_OF_TOPICS
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0,
                   passes=passes, random_state=random_state, dtype=np.float64)
    t2 = time.time()
    print("Time to train LDA model on ", len(data), "articles: ", (t2 - t1) / 60, "min")
    return dictionary, corpus, lda



In [0]:
def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between one topic distribution and every row of a
    matrix of topic distributions.

    :param query: 1-D array, the topic distribution of the query document
    :param matrix: 2-D array with one topic distribution per row
    :return: 1-D array of length M (the number of rows in matrix); smaller
             values mean more similar documents
    """
    # Put topics on axis 0 so scipy's entropy reduces over them.
    p = query[None, :].T  # column vector, broadcast against every document
    q = matrix.T
    m = 0.5 * (p + q)
    divergence = 0.5 * (entropy(p, m) + entropy(q, m))
    # Rounding can push a zero divergence slightly below 0, which would turn
    # into NaN under the square root.
    return np.sqrt(np.maximum(divergence, 0.0))

In [0]:
def get_most_similar_documents(query, matrix, k=NUMBER_OF_SIMILAR_DOCS):
    """
    Find the k documents in matrix that are closest to query under the
    Jensen-Shannon distance.

    :param query: 1-D topic distribution of the query document
    :param matrix: 2-D array with one topic distribution per row
    :param k: maximum number of neighbours to return
    :return: (distances, indices) - two arrays of equal length (at most k),
             ordered from nearest to farthest, where distances[i] is the
             distance of row indices[i] of matrix
    """
    sims = jensen_shannon(query, matrix)  # one distance per row of matrix
    # Take the indices of the k smallest distances, then read the distances at
    # those same indices so the two arrays describe the same documents.
    # Slicing also copes with fewer than k rows.
    nearest = sims.argsort()[:k]
    return sims[nearest], nearest

**Read the data from full_text.csv and split it into parts for faster processing**

In [0]:
df = pd.read_csv('/content/gdrive/My Drive/full_text.csv', usecols=['text', 'patent'])
# Cut the frame into consecutive 10,000-row parts (the last part takes whatever
# remains). Positional slices are used instead of np.split, which goes through
# the deprecated DataFrame.swapaxes.
split_points = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, None]
split_df = [df.iloc[start:stop] for start, stop in zip(split_points[:-1], split_points[1:])]

Call the *clean_and_tokenize()* function on each of the dataframe splits and store the results in separate pickle files, so that the splits can be processed in parallel batches.

In [0]:
def clean_and_tokenize(df=None, output_path='/content/gdrive/My Drive/tokenized_df.pickle',
                       random_state=None):
    """
    Clean, shuffle and tokenize the 'text' column of a dataframe and pickle the result.

    :param df: dataframe with a 'text' column; when None the notebook-level
               variable `df` is used, so a zero-argument call keeps working
    :param output_path: where the tokenized dataframe is pickled; pass a
                        different path per split to get separate files
    :param random_state: optional seed for the shuffle
    :return: the tokenized dataframe (input frame is left untouched)
    """
    if df is None:
        # Assigning to `df` inside the function makes it local, so the global
        # has to be fetched explicitly.
        df = globals()['df']

    # Keep only rows whose text is a real string (drops NaN and other types).
    cleaned = df[df['text'].map(type) == str]
    cleaned = cleaned.dropna(axis=0, subset=['text'])
    # shuffle the data
    cleaned = cleaned.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    print("==> Cleaning and tokenizing")
    t1 = time.time()
    cleaned['tokenized'] = cleaned['text'].apply(apply_all)
    t2 = time.time()
    print("Time to clean and tokenize", len(cleaned), "articles:", (t2 - t1) / 60, "min")
    cleaned.to_pickle(output_path)
    return cleaned

Read the tokenized parts back from their pickle files and concatenate them into a single dataframe for further processing

In [0]:
def _load_tokenized_part(part_number):
    """Read one tokenized split from Drive and drop its raw 'text' column."""
    part = pd.read_pickle('/content/gdrive/My Drive/tokenized_df' + str(part_number) + '.pickle')
    part.drop(['text'], axis=1, inplace=True)
    return part


# The seven parts keep their individual names because a later cell deletes them.
df1, df2, df3, df4, df5, df6, df7 = (_load_tokenized_part(n) for n in range(1, 8))
tokenized_df = pd.concat([df1, df2, df3, df4, df5, df6, df7])
print(tokenized_df.shape)

Delete the smaller dataframes to reduce usage of RAM space in colab

In [0]:
# The parts now live on inside tokenized_df; drop the separate references.
del df1, df2, df3, df4, df5, df6, df7

Build corpus of all the words from the tokenized text

In [0]:
    df = tokenized_df
    print("==> Building corpus")
    # Count word frequencies by streaming tokens straight into FreqDist rather
    # than first building one list with every token of every document.
    fdist = FreqDist(word for item in df['tokenized'] for word in item)
    # print(len(fdist))  # number of unique words

    # choose k and visually inspect the bottom 10 words of the top k
    k = 15000

    # Vocabulary = the k most frequent words. A set comprehension (instead of
    # zip(*...)) also works when the frequency table is empty.
    top_k_words = {word for word, _ in fdist.most_common(k)}

    def keep_top_k_words(text):
        """Keep only the tokens that belong to the top-k vocabulary."""
        return [word for word in text if word in top_k_words]

    df['tokenized'] = df['tokenized'].apply(keep_top_k_words)
    del top_k_words

**Train the LDA model**

In [0]:
    train_df = df
    train_df.reset_index(drop=True, inplace=True)

    print("==> Performing LDA")
    t3 = time.time()
    dictionary, corpus, lda = train_lda(train_df)
    t4 = time.time()
    # Elapsed time is end minus start; the reverse prints a negative duration.
    print("Time to perfrom LDA:", (t4 - t3) / 60, "min")


Save the LDA model, dictionary and corpus so that they can be reused

In [0]:
import pickle

# Trained model. The context manager closes (and flushes) the handle.
filename = '/content/gdrive/My Drive/SWM_Final/finalized_lda_model.sav'
with open(filename, 'wb') as filehandle:
    pickle.dump(lda, filehandle)

# Dictionary, in gensim's binary format and as human-readable text.
dictionary.save('/content/gdrive/My Drive/SWM_Final/lda_dict.pickle')
dictionary.save_as_text('/content/gdrive/My Drive/SWM_Final/lda_dict.txt')

with open('/content/gdrive/My Drive/SWM_Final/lda_corpus.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(corpus, filehandle)

# Saved next to the other artifacts, under the name the loading cell reads back.
train_df.to_pickle('/content/gdrive/My Drive/SWM_Final/train_df.pickle')

Load the saved model, corpus and training data

In [0]:
import pickle

# NOTE(review): pickle.load executes code embedded in the file - only load
# artifacts that this notebook wrote itself.
with open('/content/gdrive/My Drive/SWM_Final/lda_corpus.data', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

with open('/content/gdrive/My Drive/SWM_Final/finalized_lda_model.sav', 'rb') as model_file:
    lda = pickle.load(model_file)

# NOTE(review): apply(pd.to_numeric) converts every column; presumably the
# pickled frame holds only numeric-like columns - confirm.
train_df = pd.read_pickle('/content/gdrive/My Drive/SWM_Final/train_df.pickle').apply(pd.to_numeric)

Get the topic distribution matrix of all the training documents

In [0]:
# Dense (documents x topics) matrix of topic probabilities.
# Each entry yielded by lda[corpus] is a list of (topic_id, probability) pairs.
# Filling by topic_id keeps the matrix rectangular even if gensim leaves a
# near-zero topic out of a document's list; such topics simply stay at 0.
doc_topic_dist = np.zeros((len(corpus), lda.num_topics))
for doc_position, topic_pairs in enumerate(lda[corpus]):
    for topic_id, probability in topic_pairs:
        doc_topic_dist[doc_position, topic_id] = probability
print(doc_topic_dist.shape)

Get the 1000 most-cited patents

In [0]:
    from ast import literal_eval

    # The 'citations' column is stored in the CSV as the text of a Python
    # literal; it is parsed back into a Python object for each row.
    patent_citations_df = pd.read_csv(
        '/content/gdrive/My Drive/SWM_Final/patent_dict_1000.csv',
        usecols=['patent', 'number', 'citations'],
        nrows=1000,
    )
    parsed_citations = patent_citations_df.loc[:, 'citations'].apply(literal_eval)
    patent_citations_df.loc[:, 'citations'] = parsed_citations

For each of the top 1000 patents, compute its similarity to the patents that cite it, keep the 15 most similar documents, and write them to a CSV file called 'lda_similarities.csv' so that they can be used for visualisation.

In [0]:
 with open('/content/gdrive/My Drive/SWM_Final/lda_similarities.csv', 'w') as fo:
    # NOTE(review): get_similarity_format is not defined in any visible cell -
    # it must be defined before this cell is run.
    # Rows of doc_topic_dist are assumed to follow the row order of train_df,
    # so row POSITIONS (not index labels) are used to address it - TODO confirm.
    patent_column = train_df['patent'].to_numpy()
    for index, row in patent_citations_df.iterrows():
        row_values = row.values.tolist()
        # presumably column 0 is the patent id and column 2 its citation list;
        # verify against the CSV column order
        patent = row_values[0]
        patent_positions = np.flatnonzero(train_df['patent'].isin([patent]).to_numpy())
        if patent_positions.size == 0:
            # the patent itself is not part of the training data
            continue
        patent_topic_dist = doc_topic_dist[patent_positions[0]]

        citations = list(map(int, row_values[2]))
        cited_positions = np.flatnonzero(train_df['patent'].isin(citations).to_numpy())
        if cited_positions.size == 0:
            # none of the citing patents is in the training data: nothing to rank
            continue
        cited_patent_ids = patent_column[cited_positions]
        cited_topic_dist = doc_topic_dist[cited_positions]

        similarity_values, most_sim_ids = get_most_similar_documents(patent_topic_dist, cited_topic_dist)
        # Read the patent ids in the order of most_sim_ids so that the i-th id
        # sits next to the i-th similarity value. Selecting with isin() would
        # return them in index order instead.
        most_similar_patents = cited_patent_ids[most_sim_ids].astype(str)
        sim_values = np.vstack((most_similar_patents, similarity_values)).T.tolist()
        similarity = get_similarity_format(sim_values)
        fo.write(str(patent) + "," + str(similarity).strip('[]') + '\n')

# Evaluation

In [0]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

Load the saved LDA model, corpus and dictionary

In [0]:
# NOTE(review): pickle.load executes code embedded in the file - only load
# artifacts that this notebook wrote itself.
with open('/content/gdrive/My Drive/SWM_Final/finalized_lda_model.sav', 'rb') as model_file:
    ldamodel = pickle.load(model_file)

dictionary = Dictionary.load('/content/gdrive/My Drive/SWM_Final/lda_dict.pickle')

with open('/content/gdrive/My Drive/SWM_Final/lda_corpus.data', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

Calculate the u_mass score for getting the topic coherence of LDA model. 

In [0]:
# Topic coherence of the loaded model under the u_mass measure.
goodcm = CoherenceModel(
    model=ldamodel,
    corpus=corpus,
    dictionary=dictionary,
    coherence='u_mass',
)
print(goodcm.get_coherence())