# Multinomial Language Model

This notebook implements a multinomial language model for information retrieval of documents based on a query search. The documents are scored and ranked for similarity, i.e. probability of matching relevance, against a collection of queries.

## Imports and setup

In [None]:
import math
import numpy as np
import pandas as pd
from collections import Counter
import csv
import os
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.text import log

# Download the NLTK resources by name (Reuters corpus, Punkt tokenizer models, stop-word lists)
for resource in ('reuters', 'punkt', 'stopwords'):
    nltk.download(resource)

# English stop words, held in a set so membership tests during tokenisation are fast
stop_words = set(stopwords.words("english"))

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written
# from this Colab runtime (the paths used below live under /content/drive/MyDrive).
from google.colab import drive
drive.mount('/content/drive')

## Part 1 - Ranking by document titles
In this section we score each search query for document title and create a shortlist of the top 100 relevant documents (by title).

### Setup

In [None]:
# Empty results table: the scoring cell adds one row per (query, document) pair
df_Results = pd.DataFrame(
    columns=[
        'Query_ID',
        'Doc_ID',
        'Multinomial_Score',
        'Query_Desc',
        'Doc_Desc',
    ]
)

In [None]:
# Remove every existing row in place, keeping the column layout
df_Results.drop(index=df_Results.index, inplace=True)

### Bring in the data

Indexed queries and documents prepared in the previous notebook

In [None]:
# Work from the folder holding the indexed CSV files so they can be opened by bare file name
indexed_dir = "/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed"
os.chdir(indexed_dir)

Document titles file

In [None]:
# Import from prepared CSV file - read doc IDs and titles to array
# Column 1 of each row is the document ID and column 2 the title text; column 0 is not used.
documents = []
documentIDs = []
# newline='' lets the csv module do its own line-ending handling, as its documentation
# requires; without it, quoted fields containing line breaks can be parsed incorrectly.
with open('Indexed_Titles.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        documentIDs.append(row[1])
        documents.append(row[2])

Search queries file

In [None]:
# Import from prepared CSV file - read query IDs and search strings to array
# Column 1 of each row is the query ID and column 2 the query text; column 0 is not used.
queries = []
queryIDs = []
# newline='' lets the csv module do its own line-ending handling, as its documentation
# requires; without it, quoted fields containing line breaks can be parsed incorrectly.
with open('Indexed_Queries.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        queries.append(row[2])
        queryIDs.append(row[1])

### Preprocessing

In [None]:
# Tokenize the documents into words: lowercase, split on whitespace, drop stop words
tokenized_docs = []
for doc in documents:
    words = [word for word in doc.lower().split() if word not in stop_words]
    tokenized_docs.append(words)

# Compute the vocabulary: every distinct token that survives stop-word removal
vocab = set(word for doc in tokenized_docs for word in doc)

# Map each vocabulary word to its column. Columns follow the iteration order of `vocab`,
# which is the order the scoring cell sees when it enumerates the same (unmodified) set.
word_to_column = {word: j for j, word in enumerate(vocab)}

# Compute the document-term matrix of raw term counts (rows = documents, columns = words).
# Counting each document once with Counter avoids calling doc.count() for every
# vocabulary word in every document, while producing exactly the same matrix.
doc_term_matrix = np.zeros((len(documents), len(vocab)))
for i, doc in enumerate(tokenized_docs):
    for word, count in Counter(doc).items():
        doc_term_matrix[i, word_to_column[word]] = count

In [None]:
# Lowercased, whitespace-split tokens of every document with stop words removed
preprocessed_docs = [
    [token for token in text.lower().split() if token not in stop_words]
    for text in documents
]

### Process queries and scores

For each query, a similarity score is computed for every document

In [None]:
# Score every document against every query and record one result row per pair.
# The score is the dot product of the query's term counts with each document's term counts.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
# NOTE(review): int() below raises ValueError on a non-numeric ID (e.g. a CSV header row) -
# confirm the indexed files contain data rows only.
records = []

# For each query
for queryID, rawquery in zip(queryIDs, queries):

    # Same preprocessing as the documents: lowercase, whitespace split, drop stop words
    tokenized_query = [term for term in rawquery.lower().split() if term not in stop_words]

    # Compute the query-term vector, in the same word order used for the matrix columns
    query_counts = Counter(tokenized_query)
    query_term_vector = np.array([query_counts[word] for word in vocab], dtype=float)

    # Compute the document scores (one per row of the document-term matrix)
    doc_scores = np.dot(doc_term_matrix, query_term_vector)

    # For each computed similarity score, record a result row
    for documentID, document, score in zip(documentIDs, documents, doc_scores):
        records.append([int(queryID), int(documentID), score, rawquery, document])

# Build the results table in one go, reusing the column layout from the setup cell.
# Re-running this cell replaces the results instead of appending duplicates.
df_Results = pd.DataFrame(records, columns=df_Results.columns)

Sort the results: group by query ID, then sort by score in descending order within each query. Finally, optionally, retain only the top results for each query search, e.g. 10, 50, 100...

In [None]:
# Order rows by query ID, and within each query from the highest score to the lowest
sort_columns = ['Query_ID', 'Multinomial_Score']
sort_ascending = [True, False]
df_SortedResults = df_Results.sort_values(by=sort_columns, ascending=sort_ascending)

In [None]:
# Keep the first 100 rows of each query group (the best-scoring ones, given the sort above)
top_per_query = df_SortedResults.groupby('Query_ID').head(100)
# Renumber the rows 0..n-1, discarding the old index
df_TopResults = top_per_query.reset_index(drop=True)

In [None]:
# Reserve a 'Rank' column at position 4, initialised to 0; the next cell fills it in.
# DataFrame.insert raises ValueError when the column already exists, so the guard keeps
# this cell safe to run more than once.
if 'Rank' not in df_TopResults.columns:
    df_TopResults.insert(4, 'Rank', 0)

In [None]:
# cumcount() numbers the rows of each query group from 0 in their current order;
# adding 1 turns that into a 1-based rank
position_in_query = df_TopResults.groupby('Query_ID').cumcount()
df_TopResults['Rank'] = position_in_query + 1

In [None]:
# Export final results to CSV for final analysis (outside of this notebook)
# The file is written to the current working directory, with the DataFrame index included.
export_file = "Export_Multinomial_Top100_by_Title.csv"
df_TopResults.to_csv(export_file)

## Part 2 - Ranking by document contents
In this section we score each search query for document contents (main body of the document) and create a shortlist of the top 100 relevant documents (by contents).

### Setup

In [None]:
# Empty results table for Part 2: the scoring cell adds one row per (query, document) pair
df_Results = pd.DataFrame(
    columns=[
        'Query_ID',
        'Doc_ID',
        'Multinomial_Score',
    ]
)

In [None]:
# Remove every existing row in place, keeping the column layout
df_Results.drop(index=df_Results.index, inplace=True)

### Bring in the data

In [None]:
# Work from the folder holding the indexed CSV files so they can be opened by bare file name
indexed_dir = "/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed"
os.chdir(indexed_dir)

Indexed queries and documents prepared in the previous notebook

Document contents file

In [None]:
# Import from prepared CSV file - read doc IDs and document contents to array
# Column 1 of each row is the document ID and column 2 the content text; column 0 is not used.
documents = []
documentIDs = []
# newline='' lets the csv module do its own line-ending handling, as its documentation
# requires; without it, quoted fields containing line breaks can be parsed incorrectly.
with open('Indexed_Contents.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        documentIDs.append(row[1])
        documents.append(row[2])

Search queries file

In [None]:
# Import from prepared CSV file - read query IDs and search strings to array
# Column 1 of each row is the query ID and column 2 the query text; column 0 is not used.
queries = []
queryIDs = []
# newline='' lets the csv module do its own line-ending handling, as its documentation
# requires; without it, quoted fields containing line breaks can be parsed incorrectly.
with open('Indexed_Queries.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        queries.append(row[2])
        queryIDs.append(row[1])

### Preprocessing

In [None]:
# Tokenize the documents into words: lowercase, split on whitespace, drop stop words
tokenized_docs = []
for doc in documents:
    words = [word for word in doc.lower().split() if word not in stop_words]
    tokenized_docs.append(words)

# Compute the vocabulary: every distinct token that survives stop-word removal
vocab = set(word for doc in tokenized_docs for word in doc)

# Map each vocabulary word to its column. Columns follow the iteration order of `vocab`,
# which is the order the scoring cell sees when it enumerates the same (unmodified) set.
word_to_column = {word: j for j, word in enumerate(vocab)}

# Compute the document-term matrix of raw term counts (rows = documents, columns = words).
# Counting each document once with Counter avoids calling doc.count() for every
# vocabulary word in every document, while producing exactly the same matrix.
doc_term_matrix = np.zeros((len(documents), len(vocab)))
for i, doc in enumerate(tokenized_docs):
    for word, count in Counter(doc).items():
        doc_term_matrix[i, word_to_column[word]] = count

In [None]:
# Lowercased, whitespace-split tokens of every document with stop words removed
preprocessed_docs = [
    [token for token in text.lower().split() if token not in stop_words]
    for text in documents
]

### Similarity scoring

For each query, a similarity score is computed for every document

In [None]:
# Score every document against every query and record one result row per pair.
# The score is the dot product of the query's term counts with each document's term counts.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
# NOTE(review): int() below raises ValueError on a non-numeric ID (e.g. a CSV header row) -
# confirm the indexed files contain data rows only.
records = []

# For each query
for queryID, query in zip(queryIDs, queries):

    # Same preprocessing as the documents: lowercase, whitespace split, drop stop words
    tokenized_query = [term for term in query.lower().split() if term not in stop_words]

    # Compute the query-term vector, in the same word order used for the matrix columns
    query_counts = Counter(tokenized_query)
    query_term_vector = np.array([query_counts[word] for word in vocab], dtype=float)

    # Compute the document scores (one per row of the document-term matrix)
    doc_scores = np.dot(doc_term_matrix, query_term_vector)

    # For each computed similarity score, record a result row
    for documentID, score in zip(documentIDs, doc_scores):
        records.append([int(queryID), int(documentID), score])

# Build the results table in one go, reusing the column layout from the setup cell.
# Re-running this cell replaces the results instead of appending duplicates.
df_Results = pd.DataFrame(records, columns=df_Results.columns)

Sort the results: group by query ID, then sort by score in descending order within each query. Finally, optionally, retain only the top results for each query search, e.g. 10, 50, 100...

In [None]:
# Preview the scores just computed for Part 2. At this point df_TopResults has not been
# rebuilt yet in this section, so displaying it would show results from an earlier run.
df_Results.head()

In [None]:
# Order rows by query ID, and within each query from the highest score to the lowest
sort_columns = ['Query_ID', 'Multinomial_Score']
sort_ascending = [True, False]
df_SortedResults = df_Results.sort_values(by=sort_columns, ascending=sort_ascending)

In [None]:
# Keep the first 100 rows of each query group (the best-scoring ones, given the sort above)
top_per_query = df_SortedResults.groupby('Query_ID').head(100)
# Renumber the rows 0..n-1, discarding the old index
df_TopResults = top_per_query.reset_index(drop=True)

In [None]:
# cumcount() numbers the rows of each query group from 0 in their current order;
# adding 1 turns that into a 1-based rank
position_in_query = df_TopResults.groupby('Query_ID').cumcount()
df_TopResults['Rank'] = position_in_query + 1

In [None]:
# Export the ranked top results to CSV in the current working directory (index included)
export_file = "Export_Multinomial_Top100_Queries_by_Content.csv"
df_TopResults.to_csv(export_file)