In [1]:
import math
import os
import random
import re
from collections import Counter

import pandas as pd
#from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from numpy import dot
from numpy.linalg import norm

In [2]:
def read_file(base_path="C:/Users/Areeka Aijaz/Desktop/IR Assignment 3/bbcsport"):
    """Load and tokenize every document of the corpus, one sub-folder per class.

    Parameters
    ----------
    base_path : str, optional
        Directory whose sub-folders each hold the documents of one class.
        Defaults to the location used by the original notebook.

    Returns
    -------
    tuple
        ``(N, collection, classes)`` where ``N`` is the number of documents,
        ``collection`` maps a 1-based document id to its list of
        pre-processed tokens, and ``classes`` is a list of
        ``(folder_name, document_count)`` pairs in reading order.
    """
    collection = {}
    classes = []
    docId = 1

    # os.listdir returns entries in arbitrary order; sorting keeps document
    # ids and the class order reproducible across machines and runs.
    for folder in sorted(os.listdir(base_path)):
        folderPath = os.path.join(base_path, folder)

        # Skip entries ending in ".TXT" (as the original did) and anything
        # else that is not a directory, which would otherwise crash the
        # inner listdir call.
        if folder.endswith(".TXT") or not os.path.isdir(folderPath):
            continue

        documents = sorted(os.listdir(folderPath))

        for document in documents:
            collection[docId] = tokenize(os.path.join(folderPath, document))
            docId += 1

        classes.append((folder, len(documents)))

    return len(collection), collection, classes

In [3]:
def tokenize(docPath):
    """Read one document and return its pre-processed tokens.

    Each whitespace-separated word has the unprotected entries of
    ``symbols`` removed, loses all of its dots when it ends with one, and is
    split on ``-`` / ``.`` when it still contains a hyphen. The resulting
    raw tokens are passed through ``pre_processing``.

    Parameters
    ----------
    docPath : str
        Path of the text file to read.

    Returns
    -------
    list
        Whatever ``pre_processing`` returns for the raw tokens of the file.
    """
    symbols = ['$','[',']','.','?',';',':','-','!','--',',','...','0','1','2','3','4','5','6','7','8','9','\\','"','(',')','{','}','%','&','/','',"'re","n't","'s","'ve","'d","'m"]

    # Symbols listed here are NOT stripped by the generic removal loop.
    nt_symbols = ['-','.',']',"n't",'--','...']

    # The filter does not depend on the word, so compute it once; the
    # relative order of `symbols` is kept because replacements are sequential.
    removable = [sym for sym in symbols if sym not in nt_symbols]

    tokens = []

    # `with` guarantees the file is closed even if processing raises.
    with open(docPath,'r') as doc:
        for line in doc:
            for word in line.split():

                for sym in removable:
                    if sym in word:
                        word = word.replace(sym,'')

                # A trailing dot causes every dot in the word to be dropped.
                if word != '' and word[-1] == '.':
                    word = word.replace('.','')

                # Hyphenated words are split into their parts (also on dots).
                if '-' in word:
                    tokens.extend(re.split(r'[-.]',word))
                else:
                    tokens.append(word)

    return pre_processing(tokens)

In [4]:
def pre_processing(tokens):
    """Case-fold, stop-word filter and Porter-stem a list of raw tokens.

    Parameters
    ----------
    tokens : list of str
        Raw tokens of one document.

    Returns
    -------
    list of str
        The stem of every token whose case-folded form is not a stop word,
        in the original order.
    """
    stopWordsList = ['a','is','the','of','all','and','to','can','be','as','once','for','at','am','are','have','had','up','his','her','in','on','we','do','']

    stemmer = PorterStemmer()
    stems = []

    for raw in tokens:
        folded = raw.casefold()

        # The empty string is in the stop list, so blank tokens vanish here.
        if folded in stopWordsList:
            continue

        stems.append(stemmer.stem(folded))

    return stems

In [5]:
def make_dictionary(collection):
    """Build the sorted vocabulary of the collection.

    Parameters
    ----------
    collection : dict
        Maps a document id to that document's list of tokens.

    Returns
    -------
    list of str
        Every distinct purely alphabetic token longer than two characters,
        in ascending order.
    """
    # A set makes the uniqueness check constant-time instead of scanning a
    # growing list for every token.
    vocabulary = set()

    for doc_id in collection:
        for term in collection[doc_id]:
            if len(term) > 2 and term.isalpha():
                vocabulary.add(term)

    return sorted(vocabulary)

In [6]:
def calculate_tf(collection, dictionary):
    """Count how often each dictionary term occurs in each document.

    Parameters
    ----------
    collection : dict
        Maps a document id to that document's list of tokens.
    dictionary : list of str
        Terms to count.

    Returns
    -------
    dict
        Maps each term to a list of raw counts, one per document, in the
        iteration order of ``collection``.
    """
    # Count every document once instead of re-scanning it for each term.
    per_document = [Counter(collection[doc_id]) for doc_id in collection]

    # Counter returns 0 for a term the document does not contain.
    return {
        token: [counts[token] for counts in per_document]
        for token in dictionary
    }

In [7]:
def calculate_idf(termFrequency, N):
    """Compute the inverse document frequency ``log10(N / df)`` of each term.

    Parameters
    ----------
    termFrequency : dict
        Maps each term to its list of per-document counts.
    N : int
        Total number of documents.

    Returns
    -------
    dict
        Maps each term to its idf. A term that occurs in no document gets
        0.0 instead of triggering a division by zero.
    """
    idf = {}

    for term, counts in termFrequency.items():
        # Document frequency: number of documents with a positive count.
        documentFrequency = sum(1 for count in counts if count > 0)

        if documentFrequency:
            idf[term] = math.log10(N/documentFrequency)
        else:
            idf[term] = 0.0

    return idf

In [8]:
def tf_idf_scoring(termFrequency, idf):
    """Weight every raw term count by the idf of its term.

    Parameters
    ----------
    termFrequency : dict
        Maps each term to its list of per-document counts.
    idf : dict
        Maps each term to its inverse document frequency.

    Returns
    -------
    dict
        Maps each term to the list of ``count * idf[term]`` values, in the
        same document order as ``termFrequency``.
    """
    return {
        term: [count*idf[term] for count in counts]
        for term, counts in termFrequency.items()
    }

In [1]:
"""
documentVector = [
    doc1 = ([score[term1],score[term2],...,score[term n]], class a)
    doc2 = ([score[term1],score[term2],...,score[term n]], class b)
    .....
    docN = ([score[term1],score[term2],...,score[term n]], class x)
]
"""

def get_document_vector(N,docScore,classes):
    """Turn the term-major score table into labelled per-document vectors.

    Parameters
    ----------
    N : int
        Number of documents to emit.
    docScore : dict
        Maps each term to its list of per-document scores.
    classes : list of tuple
        ``(class_name, document_count)`` pairs, in the same order in which
        the documents were read.

    Returns
    -------
    list of tuple
        One ``(scores, class_name)`` pair per document, where ``scores``
        lists the document's score for every term in ``docScore`` order.

    Raises
    ------
    IndexError
        If ``N`` exceeds the total number of documents announced by
        ``classes``.
    """
    # Expand the class sizes into one label per document. Unlike a running
    # boundary counter, this stays correct when a class has no documents.
    labels = []
    for entry in classes:
        labels.extend([entry[0]] * entry[1])

    documentVector = []

    for i in range(N):
        document = [scores[i] for scores in docScore.values()]
        documentVector.append((document, labels[i]))

    return documentVector

In [293]:
def split_dataset(documentVector, N, classes):
    """Partition the labelled document vectors into training and test sets.

    The test set is a fixed, hard-coded selection so that predictions stored
    in KNN.txt remain valid between runs.

    Parameters
    ----------
    documentVector : list
        Labelled document vectors.
    N, classes
        Not used by the fixed split; only the random alternative described
        in the comments below would need ``classes``.

    Returns
    -------
    tuple
        ``(trainSet, testSet, testDoc)`` where ``testDoc`` is the hard-coded
        list of selected numbers.

    NOTE(review): membership is tested against 0-based positions in
    ``documentVector``, whereas read_file numbers documents from 1 — confirm
    which convention ``testDoc`` is meant to follow.
    """
    # Fixed test selection; keeps the cached results in KNN.txt usable.
    testDoc = [2, 4, 8, 9, 11, 16, 22, 26, 29, 33, 36, 37, 43, 46, 48, 51, 52, 57, 62, 65, 68, 72, 76, 79, 81, 82, 85, 90, 91, 94, 105, 108, 109, 111, 112, 114, 117, 119, 120, 122, 125, 132, 138, 144, 147, 151, 152, 153, 155, 156, 158, 173, 174, 175, 176, 184, 188, 192, 193, 198, 204, 209, 210, 213, 221, 222, 224, 228, 229, 230, 233, 239, 241, 246, 250, 252, 257, 259, 261, 275, 277, 279, 280, 283, 286, 287, 290, 292, 293, 295, 296, 298, 303, 306, 308, 309, 310, 313, 315, 323, 326, 327, 329, 330, 334, 340, 341, 347, 350, 352, 354, 361, 363, 369, 371, 375, 378, 379, 382, 391, 398, 403, 405, 409, 413, 419, 423, 424, 426, 431, 434, 437, 438, 439, 440, 442, 444, 446, 447, 449, 452, 459, 462, 464, 475, 487, 494, 495, 496, 498, 500, 506, 510, 522, 523, 531, 532, 533, 541, 543, 545, 551, 556, 557, 558, 560, 564, 568, 576, 577, 586, 590, 595, 603, 607, 610, 612, 614, 615, 617, 618, 619, 623, 624, 625, 626, 628, 630, 632, 634, 639, 645, 646, 648, 651, 652, 653, 656, 659, 669, 673, 677, 681, 686, 688, 690, 696, 698, 702, 705, 708, 709, 711, 714, 720, 722, 725, 726, 728, 735]

    # Alternative: draw a fresh random test set (30% of each class) by
    # replacing the list above with the code below. KNN.txt must then be
    # deleted manually before every run, and classification will take at
    # least 10 minutes each time.
    #
    #     testDoc = []
    #     n = 0
    #
    #     for i in range(len(classes)):
    #         splitCount = int(classes[i][1]*0.3)
    #         testDoc.extend(random.sample(range(n+1, classes[i][1]+n), splitCount))
    #         n += classes[i][1]
    #
    #     testDoc.sort()

    held_out = set(testDoc)

    trainSet, testSet = [], []

    for position, vector in enumerate(documentVector):
        target = testSet if position in held_out else trainSet
        target.append(vector)

    return trainSet,testSet, testDoc

In [309]:
def _majority_label(neighbours):
    """Return the most frequent label among ``(similarity, label)`` pairs, ties going to the nearest."""
    if not neighbours:
        raise ValueError('cannot classify without any training documents')

    votes = {}
    for _, label in neighbours:
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first maximal key and dicts keep insertion order, so
    # among equally frequent labels the one seen first (the nearest) wins.
    return max(votes, key=votes.get)


def classification(trainingSet, testingSet):
    """Label every test document with a 3-nearest-neighbour cosine vote.

    Predictions are cached in ``KNN.txt`` in the current directory, one
    label per line. A cache holding exactly one label per test document is
    returned as is; otherwise the predictions are recomputed and the file
    is rewritten.

    Parameters
    ----------
    trainingSet, testingSet : list of tuple
        ``(vector, label)`` pairs.

    Returns
    -------
    tuple
        ``(actual_labels, predicted_labels)``, both in ``testingSet`` order.

    Raises
    ------
    ValueError
        If predictions must be computed but ``trainingSet`` is empty.
    """
    actual_labels = [entry[1] for entry in testingSet]

    if os.path.isfile('KNN.txt'):
        with open('KNN.txt','r') as cache:
            # Strip only the newline, so a final line without one is not
            # truncated by a character.
            predicted_labels = [line.rstrip('\n') for line in cache]

        if len(predicted_labels) == len(actual_labels):
            return actual_labels, predicted_labels
        # Otherwise the cache is stale or partial: fall through and rebuild.

    neighbour_count = 3

    # Training norms do not depend on the test document; compute them once.
    train_norms = [norm(entry[0]) for entry in trainingSet]

    predicted_labels = []

    for test_entry in testingSet:
        test_vector = test_entry[0]
        test_norm = norm(test_vector)

        cosSim = []
        for train_entry, train_norm in zip(trainingSet, train_norms):
            denominator = train_norm*test_norm
            # A zero vector has no direction; treat its similarity as 0
            # rather than producing NaN, which would corrupt the sort.
            if denominator:
                similarity = dot(train_entry[0], test_vector)/denominator
            else:
                similarity = 0.0
            cosSim.append((similarity, train_entry[1]))

        cosSim.sort(reverse = True)

        predicted_labels.append(_majority_label(cosSim[0:neighbour_count]))

    # Write the cache only after every prediction succeeded, so that an
    # interrupted run cannot leave a partial file behind.
    with open('KNN.txt','w') as cache:
        cache.writelines(label + '\n' for label in predicted_labels)

    return actual_labels, predicted_labels


In [283]:
def check_accuracy(actual_label,predicted_label):
    """Print the number of correct and incorrect predictions and the accuracy.

    Parameters
    ----------
    actual_label : list
        True labels.
    predicted_label : list
        Predicted labels; must hold at least as many entries as
        ``actual_label``.

    An empty ``actual_label`` reports an accuracy of 0.0 instead of raising
    ZeroDivisionError.
    """
    total = len(actual_label)

    correct = 0
    for i in range(total):
        if actual_label[i] == predicted_label[i]:
            correct += 1

    incorrect = total - correct

    # Guard the division: there is nothing to score when no labels exist.
    accuracy = correct/total if total else 0.0

    print('Correct :',correct)
    print('Incorrect :',incorrect)
    print('Accuracy :',accuracy)

In [311]:
def main():
    """Run the whole pipeline and print the per-document results and accuracy.

    Steps: read the corpus, build the vocabulary, compute tf, idf and tf-idf
    scores, assemble labelled document vectors, split them into training and
    test sets, classify the test set, then print a table of test document
    numbers with actual and predicted labels followed by the accuracy summary.
    """
    doc_count, corpus, class_sizes = read_file()
    vocabulary = make_dictionary(corpus)
    tf_table = calculate_tf(corpus, vocabulary)
    idf_table = calculate_idf(tf_table, doc_count)
    score_table = tf_idf_scoring(tf_table, idf_table)
    vectors = get_document_vector(doc_count, score_table, class_sizes)
    train_part, test_part, test_ids = split_dataset(vectors, doc_count, class_sizes)
    actual, predicted = classification(train_part, test_part)

    # Joined on the default integer index, so row i describes test item i.
    report = (
        pd.DataFrame({'Test Documents' : test_ids})
        .join(pd.DataFrame({'Actual Label' : actual}))
        .join(pd.DataFrame({'Predicted Label' : predicted}))
    )
    print(report)

    check_accuracy(actual, predicted)
    

In [312]:
# Runs the whole pipeline. The first run takes 10 to 15 minutes; later runs
# take 3 to 4 minutes (presumably because predictions are then read from KNN.txt).
main()

     Test Documents Actual Label Predicted Label
0                 2    athletics       athletics
1                 4    athletics       athletics
2                 8    athletics       athletics
3                 9    athletics       athletics
4                11    athletics       athletics
5                16    athletics       athletics
6                22    athletics       athletics
7                26    athletics       athletics
8                29    athletics       athletics
9                33    athletics       athletics
10               36    athletics       athletics
11               37    athletics       athletics
12               43    athletics       athletics
13               46    athletics       athletics
14               48    athletics       athletics
15               51    athletics       athletics
16               52    athletics       athletics
17               57    athletics       athletics
18               62    athletics       athletics
19               65 