In [2]:
import nltk
import string

from os import listdir
from os.path import isfile, join

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
INPUT_DIR = "./data"

## Preprocessing

In [4]:
def returnListOfFilePaths(folderPath):
    fileInfo = []
    listOfFileNames = [fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    listOfFilePaths = [join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(INPUT_DIR)
print("File Names:", fileNames, "\n", "File Paths:" ,filePaths)

File Names: ['news1.txt', 'news2.txt', 'news3.txt'] 
 File Paths: ['./data\\news1.txt', './data\\news2.txt', './data\\news3.txt']


In [5]:
# Get document contents
def create_docContentDict(filePaths):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict(filePaths)
print(rawContentDict)

{'./data\\news1.txt': 'Channel tunnel operator Eurotunnel on Monday announced details of a deal giving bank creditors 45.5 percent of the company in return for wiping out 1.0 billion pounds ($1.6 billion) of its massive debts.\nThe long-awaited but highly complex restructuring of nearly nearly nine billion pounds of debt and unpaid interest throws the company a lifeline which could secure what is still likely to be a difficult future.\nThe deal, announced simultaneously in Paris and London, brings the company back from the brink of bankruptcy but leaves current shareholders, who have already seen their investment dwindle, owning only 54.5 percent of the company.\n"We have fixed and capped the interest payments and arranged only to pay what is available in cash," Eurotunnel co-chairman Alastair Morton told reporters at a news conference. "Avoiding having to do this again is the name of the game."\nMorton said the plan provides the Anglo-French company with the medium term financial stab

## Custom Tokenizer

Steps for tokenization:
- tokenize the input
- remove stop words
- perform stemming
- remove punctuations
- lowercase

In [6]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

In [7]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

In [8]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

In [9]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)
    
    # manually add additional punctuation to remove. This is just a sample
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

In [10]:
def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

In [11]:
content_test = rawContentDict[filePaths[0]]
print("Raw Content:", content_test[:300])

Raw Content: Channel tunnel operator Eurotunnel on Monday announced details of a deal giving bank creditors 45.5 percent of the company in return for wiping out 1.0 billion pounds ($1.6 billion) of its massive debts.
The long-awaited but highly complex restructuring of nearly nearly nine billion pounds of debt a


In [12]:
content_test_tokenized = tokenizeContent(content_test)
print("Tokenized content:", content_test_tokenized[:30])

Tokenized content: ['Channel', 'tunnel', 'operator', 'Eurotunnel', 'on', 'Monday', 'announced', 'details', 'of', 'a', 'deal', 'giving', 'bank', 'creditors', '45.5', 'percent', 'of', 'the', 'company', 'in', 'return', 'for', 'wiping', 'out', '1.0', 'billion', 'pounds', '(', '$', '1.6']


In [13]:
nltk.download('stopwords')
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)
print("After stop word removal:", content_test_rmStop[:30])

After stop word removal: ['Channel', 'tunnel', 'operator', 'Eurotunnel', 'Monday', 'announced', 'details', 'deal', 'giving', 'bank', 'creditors', '45.5', 'percent', 'company', 'return', 'wiping', '1.0', 'billion', 'pounds', '(', '$', '1.6', 'billion', ')', 'massive', 'debts', '.', 'The', 'long-awaited', 'highly']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Karishma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)
print("After stemming:", content_test_stemmed[:30])

After stemming: ['channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'detail', 'deal', 'give', 'bank', 'creditor', '45.5', 'percent', 'compani', 'return', 'wipe', '1.0', 'billion', 'pound', '(', '$', '1.6', 'billion', ')', 'massiv', 'debt', '.', 'the', 'long-await', 'highli']


In [15]:
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print("After lowercasing:", content_test_clean_lower[:30])

After lowercasing: ['channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'detail', 'deal', 'give', 'bank', 'creditor', '45.5', 'percent', 'compani', 'return', 'wipe', '1.0', 'billion', 'pound', '1.6', 'billion', 'massiv', 'debt', 'the', 'long-await', 'highli', 'complex', 'restructur', 'nearli', 'nearli']


In [16]:
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)    
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Output
Handles both TFIDF and Cosine Similarity 

In [17]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all(term, values, fileNames):
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    print('                ', end="")   #bank space for formatting output
    for n in range(len(fileNames)):
        print('{0:18}'.format(fileNames[n]), end="")    #file names
    print()
    for i in range(len(term)):
        print('{0:8}'.format(term[i]), end='\t|  ')     #the term
        for j in range(numValues):
            print('{0:.12f}'.format(values[i][j]), end='   ') #the value, corresponding to the file name, for the term
        print()

In [18]:
# write TFIDF values in 'table' format
def write_TFIDF_for_all(term, values, fileNames):
    filePath = "../results/tfid.txt"
    outFile = open(filePath, 'a')
    title = "TFIDF\n"
    outFile.write(title)
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    outFile.write('               \t')   #bank space for formatting output
    for n in range(len(fileNames)):
        outFile.write('{0:18}'.format(fileNames[n]))    #file names
    outFile.write("\n")
    for i in range(len(term)):
        outFile.write('{0:15}'.format(term[i]))     #the term
        outFile.write('\t|  ')
        for j in range(numValues):
            outFile.write('{0:.12f}'.format(values[i][j])) #the value, corresponding to the file name, for the term
            outFile.write('   ')
        outFile.write("\n")

    outFile.close()