Name: Aditya Saripalli  
Roll No: 20173071  

In [1]:
import os
import re
import numpy as np
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from numpy import linalg as lg

In [2]:
# Creating complete feature space from training data.
def create_word_vectors(training_data_path):
    """Build the vocabulary and per-document word vectors of a training corpus.

    The corpus is read from ``training_data_path/<class_label>/<file>``.
    A token is an alphabetic word of 3 to 27 characters whose letters after
    the first are all lower case; tokens are lower-cased before counting.
    A word is counted at most once per line, so each count is the number of
    lines of the document that contain the word.

    Entries directly under ``training_data_path`` that are not directories,
    and entries of a class directory that are not regular files, are skipped
    instead of raising. Directory listings and the vocabulary are sorted so
    that repeated runs produce the same ordering.

    Parameters
    ----------
    training_data_path : str
        Root directory holding one sub-directory per class label.

    Returns
    -------
    bag_of_words : list of str
        Sorted list of every distinct word found in the corpus.
    d_word_vectors : dict
        Maps ``"class_label/filename"`` to the Counter of that document.
    d_class_files : collections.defaultdict(list)
        Maps each class label to its list of ``"class_label/filename"`` keys.
    """
    word_pattern = re.compile(r"\b[A-Za-z][a-z]{2,26}\b")
    # feature space - all the distinct words from all the documents
    bag_of_words = set()
    d_word_vectors = {}
    d_class_files = defaultdict(list)

    for class_label in sorted(os.listdir(training_data_path)):
        source_dir = os.path.join(training_data_path, class_label)
        # stray files next to the class directories are not classes
        if not os.path.isdir(source_dir):
            continue

        for file_name in sorted(os.listdir(source_dir)):
            file_path = os.path.join(source_dir, file_name)
            if not os.path.isfile(file_path):
                continue

            word_vector = Counter()
            # undecodable bytes are dropped rather than aborting the read;
            # the context manager closes the file even if reading fails
            with open(file_path, "r", encoding='utf-8', errors='ignore') as data_file:
                for line in data_file:
                    # a set, so a word repeated on one line is counted once
                    line_words = {token.lower() for token in word_pattern.findall(line)}
                    word_vector.update(line_words)

            bag_of_words.update(word_vector)
            # "/" is used explicitly: the class label is later recovered by
            # splitting this key on "/"
            file_index = class_label + "/" + file_name
            d_word_vectors[file_index] = word_vector
            d_class_files[class_label].append(file_index)

    return sorted(bag_of_words), d_word_vectors, d_class_files

In [3]:
def compute_word_freq_vectors(bag_of_words, d_word_vectors):
    """Project every document's word counts onto the shared vocabulary.

    Parameters
    ----------
    bag_of_words : iterable of str
        Vocabulary; its order fixes the order of each output vector.
    d_word_vectors : dict
        Maps a document key to a mapping of word -> count.

    Returns
    -------
    dict
        Maps each document key to a list holding, for every vocabulary word
        in order, the value looked up for that word in the document's mapping.
    """
    return {
        doc_key: [doc_counts[term] for term in bag_of_words]
        for doc_key, doc_counts in d_word_vectors.items()
    }

In [4]:
#Creating word frequency Matrix
def create_freq_matrix(bag_of_words, d_word_freq_vectors):
    """Stack the per-document frequency vectors into one matrix.

    Rows follow the iteration order of ``d_word_freq_vectors`` (no sorting
    is applied). Both results are always two-dimensional, including when
    the dictionary holds a single document or none at all.

    Parameters
    ----------
    bag_of_words : list of str
        Vocabulary; only its length is used, to shape the empty result.
    d_word_freq_vectors : dict
        Maps a document key to its equal-length list of word counts.

    Returns
    -------
    freq_term_matrix : numpy.ndarray, shape (n_documents, n_words)
        One row of counts per document.
    file_index_array : numpy.ndarray, shape (n_documents, 1)
        The document key belonging to each row of ``freq_term_matrix``.
    """
    file_indices = list(d_word_freq_vectors.keys())
    rows = list(d_word_freq_vectors.values())

    if not rows:
        # keep the 2-D contract even when there is nothing to stack
        return (np.zeros((0, len(bag_of_words)), dtype=int),
                np.empty((0, 1), dtype=str))

    # build each array in one call instead of re-stacking inside a loop
    freq_term_matrix = np.array(rows)
    file_index_array = np.array(file_indices).reshape(-1, 1)

    return freq_term_matrix, file_index_array

In [5]:
#Adding class labels to the word frequency matrix for ease of computation
def add_class_data(tf_matrix, file_index_array):
    """Return a copy of the frequency matrix with a trailing class-label column.

    Parameters
    ----------
    tf_matrix : numpy.ndarray
        Two-dimensional word frequency matrix, one row per document.
    file_index_array : sequence
        One entry per row; the first element of each entry is a
        ``"class_label/filename"`` string.

    Returns
    -------
    numpy.ndarray
        ``tf_matrix`` with one extra last column. Each cell of that column is
        assigned the text before the first "/" of the matching file index, so
        the label must be convertible to the dtype of the matrix.
    """
    # the new column goes right after the last feature column
    _, label_col = tf_matrix.shape
    labelled = np.insert(tf_matrix, label_col, '0', axis=1)

    for row_no, index_entry in enumerate(file_index_array):
        labelled[row_no, label_col] = index_entry[0].partition("/")[0]

    return labelled


In [6]:
# Creating word vector for the input file.
def create_word_vector(filename):
    """Count, for one text file, how many lines contain each word.

    A token is an alphabetic word of 3 to 27 characters whose letters after
    the first are all lower case; tokens are lower-cased before counting.
    A word repeated on the same line is counted once for that line.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    collections.Counter
        Maps each lower-cased word to the number of lines it appears on.
    """
    word_vector = Counter()
    # undecodable bytes are dropped rather than aborting the read;
    # the context manager closes the file even if reading fails
    with open(filename, "r", encoding='utf-8', errors='ignore') as data_file:
        for line in data_file:
            line_tokens = re.findall(r"\b[A-Za-z][a-z]{2,26}\b", line)
            # a set, so a word repeated on one line is counted once
            word_vector.update({token.lower() for token in line_tokens})

    return word_vector

In [7]:
def compute_word_freq_vector(bag_of_words, word_vector):
    """Project one document's word counts onto the shared vocabulary.

    Parameters
    ----------
    bag_of_words : iterable of str
        Vocabulary; its order fixes the order of the output.
    word_vector : mapping
        Word -> count lookup for the document.

    Returns
    -------
    list
        The value looked up in ``word_vector`` for every vocabulary word,
        in vocabulary order.
    """
    return [word_vector[term] for term in bag_of_words]

In [8]:
def fetch_relevant_docs(file_path, bag_of_words, c_tf_matrix):
    """Score every training document against a query file by cosine similarity.

    Parameters
    ----------
    file_path : str
        Path of the query document.
    bag_of_words : list of str
        Vocabulary defining the feature order of the matrix rows.
    c_tf_matrix : iterable of rows
        Each row holds the word frequencies of one document followed by its
        class label in the last position.

    Returns
    -------
    dict
        Maps a cosine similarity to the class label of a document with that
        similarity. Because the similarity is the key, documents sharing
        exactly the same score collapse into one entry (the last one seen
        wins). A pair whose query or document vector has zero length is
        scored 0.0 instead of producing a NaN key.
    """
    # create the word frequency vector for the given document
    word_vector = create_word_vector(file_path)
    word_freq_vector = np.array(compute_word_freq_vector(bag_of_words, word_vector))
    # the query norm is the same for every row, so compute it once
    query_norm = lg.norm(word_freq_vector)

    d_cosine_values = {}
    for vector in c_tf_matrix:
        w_vector = np.array(vector[:-1])
        c_label = vector[-1]
        denominator = query_norm * lg.norm(w_vector)
        if denominator == 0:
            # cosine is undefined for a zero vector; treat it as "no similarity"
            cosine_value = 0.0
        else:
            cosine_value = np.dot(word_freq_vector, w_vector) / denominator
        d_cosine_values[cosine_value] = c_label

    return d_cosine_values

In [9]:
##################################################################################
#              Latent Semantic Analysis - Fetch Relevant Documents               #
##################################################################################

# Pipeline: vocabulary + per-file word counts -> frequency matrix with a label
# column -> cosine similarity of the query file against every training row ->
# majority vote among the highest similarities. The vectors are compared on raw
# counts; no dimensionality reduction is applied in this cell.

# Feature space (bag of all corpus words), per-file word vectors, files per class
bag_of_words, d_word_vectors, d_class_files = create_word_vectors("./q2data/train")
class_labels = [float(label) for label in d_class_files]

# Word frequency vectors -> frequency matrix of the complete data set
d_word_freq_vectors = compute_word_freq_vectors(bag_of_words, d_word_vectors)
tf_matrix, file_index_array = create_freq_matrix(bag_of_words, d_word_freq_vectors)

# Append the class label of every document as the last column
c_tf_matrix = add_class_data(tf_matrix, file_index_array)

# Cosine similarity of the query document against every training document
d_cosine_values = fetch_relevant_docs("./q2data/train/2/278.txt", bag_of_words, c_tf_matrix)

# Keep the 10 highest similarity values
l_cosine_values = sorted(d_cosine_values, reverse=True)
l_relevant_cosine_values = l_cosine_values[:10]

# Majority vote over the class labels attached to those similarities
l_relevant_class_labels = [int(d_cosine_values[score]) for score in l_relevant_cosine_values]
file_class_label, n_occ = Counter(l_relevant_class_labels).most_common(1)[0]

print("Class label of the given document is: {}".format(file_class_label))


Class label of the given document is: 2
