# Upload corpus 
upload the corpus file (title_author.tab.txt.gz)

In [53]:
#upload files here
# !rm -f small_test.txt || rm -f title_author.small.txt || true
from google.colab import files
files.upload()
!gunzip title_author.tab.txt.gz || true

Saving title_author.tab.txt to title_author.tab (4).txt
gzip: title_author.tab.txt.gz: No such file or directory


# Retrieval Class using sklearn
Execute the cell before running test

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
import numpy as np
import fileinput, collections
from collections import OrderedDict

class TfidfRetriever(object):
  """TfidfRetriever class

  Usage:
  tfidfRetriever = TfidfRetriever(corpus_file_path="corpus.txt")
  tfidfRetriever.find_similar_docs('test text')

  a list of relevant documents in corpus.txt will be returned.
  Author: Uzair Ahmad
  """
  def __init__(self, corpus_file_path):
    """Initialize TfidfRetriever object and computes the tfidf-matrix.

    Parameters
    ----------
    ste : corpus_file_path, The file name and location of the corpus

    Returns
    -------
    object : an object of TfidfRetriever
    """
    self.corpus_file_path = corpus_file_path
    self.__corpus = []
    self.tf_counter = CountVectorizer()
    self.tfidf_transformer = TfidfTransformer(smooth_idf=False, norm=None)
    self.__compute_solution()

  def find_similar_docs(self, text, top_n=10):
    """Vectorize input text and calculates dot-product similarity to the pre-calculated tfidf-matrix.
    This function is visible to user.

    Parameters
    ----------
    text : str
        text to be searched in the corpus for similar texts
    top_n : int
        top n similar documents, default is 10.
    Returns
    -------
    list
        a list of similar texts in the corpus
    """
    # vectorize text (1, voc_size)
    tf_vector = self.tf_counter.transform([text])
    # tfidf of vectorized_text
    tfidf_vector = self.tfidf_transformer.transform(tf_vector).toarray().T
    # dot-product similarity
    dot_product_sim = np.squeeze( self.tf_idf_matrix.dot(tfidf_vector))
    # index of top_n (inverted tail of argsort) in descending order
    dot_product_sim_desc_order = np.argsort(dot_product_sim)[-top_n:][::-1] 
    # prepare list of docs corresponding top_n ordered indexes
    return ['score:{0}, id:{1}, title: {2}, author: {3}'.format(
        np.round(dot_product_sim[i],6), 
        self.__corpus[i].doc_id, 
        self.__corpus[i].title, 
        self.__corpus[i].author) for i in dot_product_sim_desc_order] 
  # 
  def to_df(self):
    """Prepares a pandas DataFrame from the pre-calculated tfidf-matrix. 
    Good for visual verification for small corpus.
    This function is visible to user.

    Parameters
    ----------
    
    Returns
    -------
    Pandas DataFrame
        a dataframe with unique words (vocabulary) as index. 
        First column is idf of each word. 
        Rest of the columns correspond to individual documents in the corpus.
    """
    # tf_matrix, tf_idf_matrix = __compute_solution(self)
    df = pd.DataFrame(pd.Series(self.tfidf_transformer.idf_, 
                                index=self.tf_counter.get_feature_names(), 
                                name="idf"))
    
    for i in range(self.tf_idf_matrix.shape[0]):
      tf = pd.Series(self.tf_matrix.toarray()[i], 
                    index=self.tf_counter.get_feature_names())
      df["tf-doc"+str(i+1)] = tf
      tfidf = pd.Series(self.tf_idf_matrix.toarray()[i], 
                    index=self.tf_counter.get_feature_names())
      df["tfidf-doc"+str(i+1)] = tfidf
      

    df = df.sort_index()
    return df
  
  def __load_corpus(self):
    """Loads text data from the disk into corpus list.
    This function is not visible to user.    
    """
    Document = collections.namedtuple('doc', 'doc_id title author')
    try:
      for line in fileinput.input(self.corpus_file_path):
          doc_id, title, author = line.split('\t') #line.lower().split('\t')
          doc = Document(doc_id=doc_id.replace('\ufeff',""), 
                         title=title, 
                         author=author.strip())
          self.__corpus.append(doc)
    finally:
      fileinput.close()

  def __compute_solution(self):
    """Calculates tfidf-matrix. 
    This function is to be called by __init__ at object creation time.
    Not visible to user.
    Computes two important components of the solution:
      tf_matrix: a numpy sparse matrix of unique words (voc) and their frequencies (tf) in respective documents. shape (doc_count, voc_count)
      tf_idf_matrix:   a numpy matrix of unique words (voc) and their tf*idf in respective documents. shape (doc_count, voc_count)
    """
    self.__load_corpus()
    # transform corpus into tf_matrix (doc_count, voc_count)
    self.tf_matrix = self.tf_counter.fit_transform([doc.title + ' ' + doc.author for doc in self.__corpus])
    # transform tf_matrix into tfidf matrix
    self.tfidf_transformer.fit(self.tf_matrix)
    self.tf_idf_matrix = self.tfidf_transformer.transform(self.tf_matrix)

# change the corpus_file_path to actual file name/path if its different
tfidfRetriever = TfidfRetriever(corpus_file_path="small_test.txt")

## Test
input query to the box and ctl+enter

In [None]:
query = 'horse show how to eats cucumber' #@param {type:"string"}
tfidfRetriever.find_similar_docs(query)

['score:5.733495, id:1, title: horse and cattle show, author: Ace',
 'score:2.866747, id:2, title: cucumber gows in winter, author: Ben']

In [None]:
tfidfRetriever.to_df()

Unnamed: 0,idf,tf-doc1,tfidf-doc1,tf-doc2,tfidf-doc2
ace,1.693147,1,1.693147,0,0.0
and,1.693147,1,1.693147,0,0.0
ben,1.693147,0,0.0,1,1.693147
cattle,1.693147,1,1.693147,0,0.0
cucumber,1.693147,0,0.0,1,1.693147
gows,1.693147,0,0.0,1,1.693147
horse,1.693147,1,1.693147,0,0.0
in,1.693147,0,0.0,1,1.693147
show,1.693147,1,1.693147,0,0.0
winter,1.693147,0,0.0,1,1.693147


# Retrieval Class 2 (using numpy)

In [54]:
import pandas as pd
import numpy as np
import fileinput, collections
from collections import OrderedDict
import re

class TfidfRetrieverUzair(object):
  """TfidfRetriever class

  Usage:
  tfidfRetriever = TfidfRetriever(corpus_file_path="corpus.txt")
  tfidfRetriever.find_similar_docs('test text')

  a list of relevant documents in corpus.txt will be returned.
  Aurhor: Uzair Ahmad
  """
  def __init__(self, corpus_file_path):
    """Initialize TfidfRetriever object and computes the tfidf-matrix.

    Parameters
    ----------
    corpus_file_path : str
        The file location of the corpus

    Returns
    -------
    list
        an object of TfidfRetriever
    """
    self.corpus_file_path = corpus_file_path
    self.__corpus = []
    self.__compute_solution()

  def find_similar_docs(self, text, top_n=10):
    """Vectorize input text and calculates dot-product similarity to the pre-calculated tfidf-matrix.
    This function is visible to user.

    Parameters
    ----------
    text : str
        text to be searched in the corpus for similar texts
    top_n : int
        top n similar documents, default is 10.
    Returns
    -------
    list
        a list of similar texts in the corpus
    """
    tfidf_vector = np.zeros(shape=(len(self.unique_words), 1))
    for t in text.split(): # text.lower().split()
      if t in self.unique_words.keys():
        tfidf_vector[list(self.unique_words).index(t)] = self.unique_words[t]['tidf']
    # dot-product similarity
    dot_product_sim = np.squeeze(self.tfidf_matrix.T.dot(tfidf_vector))
    # index of top_n (inverted tail of argsort) in descending order
    dot_product_sim_desc_order = np.argsort(dot_product_sim)[-top_n:][::-1] 
    # prepare list of docs corresponding top_n ordered indexes
    return ['score:{0}, id:{1}, title: {2}, author: {3}'.format(
        np.round(dot_product_sim[i],6), 
        self.__corpus[i].doc_id, 
        self.__corpus[i].title, 
        self.__corpus[i].author) for i in dot_product_sim_desc_order] 
  # 
  def to_df(self):
    """Prepares a pandas DataFrame from the pre-calculated tfidf-matrix. 
    Good for visual verification for small corpus.
    This function is visible to user.

    Parameters
    ----------
    
    Returns
    -------
    Pandas DataFrame
        a dataframe with unique words (vocabulary) as index. 
        First column is idf of each word. 
        Rest of the columns correspond to individual documents in the corpus.
    """
    # tf_matrix, tf_idf_matrix = __compute_solution(self)
    df = pd.DataFrame(data=self.tfidf_matrix, 
                      index=self.unique_words.keys(), 
                      columns=['tfidf-doc'+str(i+1) for i in range(len(self.__corpus))]).sort_index()
    return df

  def __load_corpus(self):
    """Loads text data from the disk into corpus list.
    This function is not visible to user.    
    """
    Document = collections.namedtuple('doc', 'doc_id title author')
    try:
      for line in fileinput.input(self.corpus_file_path):
          doc_id, title, author = line.split('\t') #line.lower().split('\t')
          doc = Document(doc_id=doc_id.replace('\ufeff',""), 
                         title=re.sub('[^0-9a-zA-Z\s]+', ' ', title), 
                         author=re.sub('[^0-9a-zA-Z\s]+', ' ', author).strip())
          self.__corpus.append(doc)
    finally:
      fileinput.close()

  def __compute_solution(self):
    self.__load_corpus()
    # word freq in each doc
    # unique_words and doc_ids where they appear
    tf_in_docs = {}   # word freq in each doc
    self.unique_words = {} # unique_words and doc_ids where they appear
    for doc in self.__corpus:
      tf = {}
      for t in '{0} {1}'.format(doc.title, doc.author).split():
        if len(t) < 1:
          continue
        if t not in self.unique_words.keys():
          self.unique_words[t] = {"doc_ids":1}#{"doc_ids":[doc.doc_id]}
        else:
          self.unique_words[t]["doc_ids"] += 1 #["doc_ids"].append(doc.doc_id)
        if t not in tf.keys():
          tf[t] = 1
        else:
          tf[t] += 1
      tf_in_docs[doc.doc_id] = tf

    for t_id, t in enumerate(self.unique_words.keys()):
      # calc tidf
      #self.unique_words[t]['tidf'] = np.log(len(self.__corpus) / (len(self.unique_words[t]['doc_ids'])))+1
      self.unique_words[t]['tidf'] = np.log(len(self.__corpus) / self.unique_words[t]['doc_ids'])+1

    
    self.tfidf_matrix = np.zeros(shape=(len(self.unique_words.keys()), len(self.__corpus)))
    for doc_id, doc in enumerate(self.__corpus):
      for t in '{0} {1}'.format(doc.title, doc.author).split():
        tfidf = self.unique_words[t]['tidf'] * tf_in_docs[doc.doc_id][t]
        self.tfidf_matrix[list(self.unique_words).index(t), doc_id] = tfidf
# change the corpus_file_path to actual file name/path if its different
tfidfRetrieverUzi = TfidfRetrieverUzair(corpus_file_path="title_author.tab (4).txt") #"title_author.tab (3).txt"

## Test
input query to the box and ctl+enter

In [59]:
test_text = 'Network Engineer' #@param {type:"string"}
tfidfRetrieverUzi.find_similar_docs(test_text)

['score:40.439409, id:26, title:  Cisco  DNS  HTTP  Networking  Network Engineer  Security  Video  VPN  Wireless , author: Network Engineer',
 'score:20.219705, id:30, title:  Firewalls  load balancing  routing  switching  cable modems  wireless , author: Network Engineer',
 'score:7.723086, id:43, title:  Altera  Assembly  CCA  Circuit  Defense Systems  Development  Electrical Engineer  Flash  Graphics  Hardware  HTTP  Materials  Matlab  Release  Security  Simulation  Testing , author: Engineer Electronics 4',
 'score:7.723086, id:60, title: GSC ITS   Lead Data Engineer, author: GSC ITS   Lead Data Engineer',
 'score:7.723086, id:58, title: QA Performance Engineer Beanshell C a must, author: QA Performance Engineer Beanshell C a must',
 'score:7.723086, id:17, title: Desktop Support Workstation Engineer, author: Desktop Support Workstation Engineer',
 'score:7.723086, id:53, title: Sr  Lab Systems Engineer, author: Sr  Lab Systems Engineer',
 'score:7.723086, id:48, title: Sr  Softwar