

# Vector Space Model

This notebook implements a Vector Space Model (VSM), using cosine similarity, for retrieving documents relevant to a search query. Every document is scored against each query in a collection of queries and ranked by similarity.

## Imports and setup

In [None]:
import math
import numpy as np
import pandas as pd
import csv
import os
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.text import log

# Download the NLTK data packages. In the cells visible in this notebook only
# the 'stopwords' corpus is actually read (just below).
nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')

# English stop words held as a set; used to filter tokens out of both the
# documents and the queries before vectorisation.
stop_words = set(stopwords.words("english"))

In [None]:
# Mount Google Drive at /content/drive; the os.chdir calls later in this
# notebook point at a folder inside this mount.
from google.colab import drive
drive.mount('/content/drive')

## Part 1 - Ranking by document titles
In this section we score each search query for document title and create a shortlist of the top 100 relevant documents (by title).

### Setup

In [None]:
# Create base dataframe for recording results.
# The query-processing cell below adds one row per (query, document) pair.
df_Results = pd.DataFrame(columns=['Query_ID','Doc_ID', 'Cosine_Score','Query_Desc', 'Doc_Desc'])

In [None]:
# Remove every row in place, keeping only the column headers
# (on a fresh run the frame created above is already empty).
df_Results.drop(df_Results.index,inplace=True)

### Bring in the data

Indexed queries and documents prepared in the previous notebook

In [None]:
# Work inside the folder holding the indexed CSV files; the relative file
# names used by the read/export cells below resolve against this directory.
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed")

Document titles file

In [None]:
# Import from prepared CSV file - read doc IDs and titles into two parallel lists.
# Column 1 of each row is taken as the document ID and column 2 as the title text.
documents = []
documentIDs = []
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks embedded inside quoted fields are parsed correctly.
with open('Indexed_Titles.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        documentIDs.append(row[1])
        documents.append(row[2])

Search queries file

In [None]:
# Import from prepared CSV file - read query IDs and search strings into two parallel lists.
# Column 1 of each row is taken as the query ID and column 2 as the query text.
queries = []
queryIDs = []
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks embedded inside quoted fields are parsed correctly.
with open('Indexed_Queries.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        queries.append(row[2])
        queryIDs.append(row[1])

### Vectorisation

Preprocessing and stopwords removal

In [None]:
# Tokenise each document and filter out stop words
def preprocess(documents):
    """Lower-case and whitespace-split every document, dropping stop words.

    Returns a list holding one token list per input document, in input order.
    Stop words are looked up in the notebook-level ``stop_words`` collection.
    """
    return [
        [token for token in text.lower().split() if token not in stop_words]
        for text in documents
    ]

In [None]:
# Tokenise the document texts loaded above (one token list per document)
preprocessed_docs = preprocess(documents)

In [None]:
# Vocabulary: every distinct token across all documents, in sorted order
distinct_tokens = {token for tokens in preprocessed_docs for token in tokens}
vocab = sorted(distinct_tokens)

Vectorisation

In [None]:
# Convert a tokenised document into a term-count vector over the vocabulary
def vectorize(doc, vocab):
    """Return the term-frequency vector of ``doc`` over ``vocab``.

    Args:
        doc: iterable of tokens.
        vocab: sequence of vocabulary terms; element i of the result counts
            how often ``vocab[i]`` occurs in ``doc``.

    Returns:
        1-D float numpy array of length ``len(vocab)``. Tokens that are not
        in the vocabulary are ignored.
    """
    # Build a term -> position lookup once instead of scanning the vocabulary
    # list for every token. setdefault keeps the first position of a repeated
    # term, which is what list.index would have returned.
    positions = {}
    for position, term in enumerate(vocab):
        positions.setdefault(term, position)

    vector = np.zeros(len(vocab))
    for word in doc:
        position = positions.get(word)
        if position is not None:
            vector[position] += 1
    return vector

In [None]:
# Vectorize preprocessed documents: one term-count vector per document,
# in the same order as preprocessed_docs
vectors = [vectorize(doc, vocab) for doc in preprocessed_docs]

### Similarity

Compute cosine similarity between two vectors

In [None]:
def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    A zero-length vector (for example a query none of whose terms are in the
    vocabulary) has no direction, so the similarity is reported as 0 in that
    case instead of dividing by zero.
    """
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        # Avoids the 0/0 division that would otherwise yield NaN plus a RuntimeWarning
        return 0.0
    score = np.dot(u, v) / denominator
    if np.isnan(score):
        # Inputs that themselves contain NaN still score 0
        score = 0
    return score

### Process queries

For each query, a similarity score is computed for every document

In [None]:
# Score every document against every query, producing one result row per pair.
# Rows are collected in a plain list and turned into a dataframe once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
result_rows = []
for queryID, rawquery in zip(queryIDs, queries):

  # Same treatment as the documents: whitespace split, lower-case, drop stop words
  query = [term.lower() for term in rawquery.split()]
  query = [term for term in query if term not in stop_words]

  query_vec = vectorize(query, vocab)

  # Compute cosine similarity for all documents (previously vectorised above)
  for documentID, document, vector in zip(documentIDs, documents, vectors):
    score = cosine_similarity(query_vec, vector)
    result_rows.append([int(queryID), int(documentID), score, rawquery, document])

# Rebuild the results frame from scratch so re-running this cell does not
# duplicate rows; the column names come from the frame created in the setup cell.
df_Results = pd.DataFrame(result_rows, columns=df_Results.columns)

Sort the results: order by query ID, then by score in descending order within each query (highest similarity first). Finally, optionally, retain only the top results for each query search, e.g. 10, 50, 100...

In [None]:
# Order by query ID (ascending), then by cosine score (descending) within each query
df_SortedResults = df_Results.sort_values(by=['Query_ID', 'Cosine_Score'], ascending=[True, False])

In [None]:
# Restrict to top 100 results: head(100) keeps the first 100 rows of each
# query group, which are the highest-scoring ones given the sort above.
df_TopResults = df_SortedResults.groupby('Query_ID').head(100).reset_index(drop=True)

In [None]:
# Add a placeholder 'Rank' column at position 4 (its values are assigned in the next cell).
# DataFrame.insert raises ValueError when the column already exists, so guard
# against this cell being run a second time.
if 'Rank' not in df_TopResults.columns:
    df_TopResults.insert(4, 'Rank', 0)

In [None]:
# cumcount numbers the rows of each query group 0, 1, 2, ... in their current
# (sorted) order, so adding 1 gives a 1-based rank per query.
df_TopResults['Rank'] = df_TopResults.groupby('Query_ID').cumcount() + 1

In [None]:
# Export final results to CSV for final analysis (outside of this notebook).
# The file name is relative, so it is written to the current working directory.
df_TopResults.to_csv("Export_VSM_Top100_by_Title.csv")

## Part 2 - Ranking by document contents
In this section we score each search query for document contents (main body of the document) and create a shortlist of the top 100 relevant documents (by contents).

### Setup

In [None]:
# Create base dataframe for recording results (replaces the Part 1 frame).
# The query-processing cell below adds one row per (query, document) pair.
df_Results = pd.DataFrame(columns=['Query_ID','Doc_ID', 'Cosine_Score'])

In [None]:
# Remove every row in place, keeping only the column headers
# (on a fresh run the frame created above is already empty).
df_Results.drop(df_Results.index,inplace=True)

### Bring in the data

Indexed queries and documents prepared in the previous notebook

In [None]:
# Work inside the folder holding the indexed CSV files; the relative file
# names used by the read/export cells below resolve against this directory.
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed")

Document contents file

In [None]:
# Import from prepared CSV file - read doc IDs and document contents into two parallel lists.
# Column 1 of each row is taken as the document ID and column 2 as the content text.
documents = []
documentIDs = []
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks embedded inside quoted fields are parsed correctly.
with open('Indexed_Contents.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        documentIDs.append(row[1])
        documents.append(row[2])

Search queries file

In [None]:
# Import from prepared CSV file - read query IDs and search strings into two parallel lists.
# Column 1 of each row is taken as the query ID and column 2 as the query text.
queries = []
queryIDs = []
# The file name carries the '.csv' extension, matching the queries file read in
# Part 1 from this same folder. newline='' is what the csv module asks for when
# it is handed a file object, so that line breaks inside quoted fields are
# parsed correctly.
with open('Indexed_Queries.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        queries.append(row[2])
        queryIDs.append(row[1])

### Vectorisation

Preprocessing and stopwords removal

In [None]:
# Split each document into lower-cased words and remove stop words
def preprocess(documents):
    """Tokenise documents on whitespace after lower-casing, minus stop words.

    Produces one list of kept tokens per document, preserving document order.
    Membership is tested against the notebook-level ``stop_words`` collection.
    """
    def keep_tokens(text):
        return [token for token in text.lower().split() if token not in stop_words]

    return [keep_tokens(text) for text in documents]

In [None]:
# Tokenise the document texts loaded above (one token list per document)
preprocessed_docs = preprocess(documents)

In [None]:
# Vocabulary: every distinct token across all documents, in sorted order
distinct_tokens = {token for tokens in preprocessed_docs for token in tokens}
vocab = sorted(distinct_tokens)

Vectorisation

In [None]:
# Convert a tokenised document into a term-count vector over the vocabulary
def vectorize(doc, vocab):
    """Return the term-frequency vector of ``doc`` over ``vocab``.

    Args:
        doc: iterable of tokens.
        vocab: sequence of vocabulary terms; element i of the result counts
            how often ``vocab[i]`` occurs in ``doc``.

    Returns:
        1-D float numpy array of length ``len(vocab)``. Tokens that are not
        in the vocabulary are ignored.
    """
    # Build a term -> position lookup once instead of scanning the vocabulary
    # list for every token. setdefault keeps the first position of a repeated
    # term, which is what list.index would have returned.
    positions = {}
    for position, term in enumerate(vocab):
        positions.setdefault(term, position)

    vector = np.zeros(len(vocab))
    for word in doc:
        position = positions.get(word)
        if position is not None:
            vector[position] += 1
    return vector

In [None]:
# Vectorize preprocessed documents: one term-count vector per document,
# in the same order as preprocessed_docs
vectors = [vectorize(doc, vocab) for doc in preprocessed_docs]

### Similarity

Compute cosine similarity between two vectors

In [None]:
def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    A zero-length vector (for example a query none of whose terms are in the
    vocabulary) has no direction, so the similarity is reported as 0 in that
    case instead of dividing by zero.
    """
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        # Avoids the 0/0 division that would otherwise yield NaN plus a RuntimeWarning
        return 0.0
    score = np.dot(u, v) / denominator
    if np.isnan(score):
        # Inputs that themselves contain NaN still score 0
        score = 0
    return score

### Process queries

For each query, a similarity score is computed for every document

In [None]:
# Score every document against every query, producing one result row per pair.
# Rows are collected in a plain list and turned into a dataframe once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
result_rows = []
for queryID, rawquery in zip(queryIDs, queries):

  # Same treatment as the documents: whitespace split, lower-case, drop stop words
  query = [term.lower() for term in rawquery.split()]
  query = [term for term in query if term not in stop_words]

  query_vec = vectorize(query, vocab)

  # Compute cosine similarity for all documents (previously vectorised above)
  for documentID, vector in zip(documentIDs, vectors):
    score = cosine_similarity(query_vec, vector)
    result_rows.append([int(queryID), int(documentID), score])

# Rebuild the results frame from scratch so re-running this cell does not
# duplicate rows; the column names come from the frame created in the setup cell.
df_Results = pd.DataFrame(result_rows, columns=df_Results.columns)

Sort the results: order by query ID, then by score in descending order within each query (highest similarity first). Finally, optionally, retain only the top results for each query search, e.g. 10, 50, 100...

In [None]:
# Order by query ID (ascending), then by cosine score (descending) within each query
df_SortedResults = df_Results.sort_values(by=['Query_ID', 'Cosine_Score'], ascending=[True, False])

In [None]:
# Restrict to top 100 results: head(100) keeps the first 100 rows of each
# query group, which are the highest-scoring ones given the sort above.
df_TopResults = df_SortedResults.groupby('Query_ID').head(100).reset_index(drop=True)

In [None]:
# cumcount numbers the rows of each query group 0, 1, 2, ... in their current
# (sorted) order, so adding 1 gives a 1-based rank per query.
df_TopResults['Rank'] = df_TopResults.groupby('Query_ID').cumcount() + 1

In [None]:
# Export the content-based top-100 results to CSV.
# The file name is relative, so it is written to the current working directory.
df_TopResults.to_csv("Export_VSM_Top100_by_Content.csv")