In [297]:
import pandas as pd
import numpy as np
import time
from scipy.sparse import csr_matrix
import re
from nltk.stem.porter import *
from collections import defaultdict
import os.path
import math

In [298]:
# directory holding one or more stopword list files
STOPWORDS_PATH = './stopwords'
# matches every character we strip from text: any non-word character that is not
# whitespace (punctuation, symbols), plus runs of digits. Note that '_' counts as a
# word character for \W and is therefore left in place.
to_replace = re.compile(r'(?!\s)(\W|\d+)')
# split on any run of whitespace (raw string: '\s' is an invalid escape in a plain literal)
whitespace_delimiters = re.compile(r'\s+')

In [299]:
# simple utility function to read stopwords from a list of stopwords files, 
# and combine them into one big set
def read_stopwords(files):
    """Merge the stopwords listed in several files into a single set.

    Each line of each file is treated as one candidate word: the characters matched
    by ``to_replace`` are removed, surrounding whitespace is stripped, and the
    remainder (if any) is stored in lowercase.

    Parameters
    ----------
    files : iterable of str
        Paths of the stopword files to read.

    Returns
    -------
    set of str
        Every distinct, lowercased stopword found across all files.
    """
    collected = set()
    for path in files:
        with open(path) as handle:
            # clean each line first, then keep only the ones that still contain text
            cleaned_lines = (re.sub(to_replace, '', raw).strip() for raw in handle)
            collected.update(entry.lower() for entry in cleaned_lines if entry)

    return collected

In [300]:
# defines our set of stopwords to throw out.
# Only regular files are read: os.listdir also returns sub-directories (for example the
# .ipynb_checkpoints folder Jupyter may create), and open() on a directory raises.
STOPWORDS = read_stopwords([
    path
    for path in (os.path.join(STOPWORDS_PATH, entry) for entry in os.listdir(STOPWORDS_PATH))
    if os.path.isfile(path)
])

In [301]:
# Given a chunk of text, returns a mapping from each stemmed, lowercased word (stopwords excluded) to the number of times it occurs in the text
def tokenize(text):
    """Count the stemmed, non-stopword words of ``text``.

    The text is first stripped of everything ``to_replace`` matches, then split on
    ``whitespace_delimiters``. Each resulting word is lowercased; words found in
    ``STOPWORDS`` are dropped and the rest are reduced to their Porter stem.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    collections.defaultdict
        Maps each stem to its number of occurrences; missing stems default to 0.
    """
    stemmer = PorterStemmer()
    counts = defaultdict(int)

    # drop punctuation, digits, etc. before splitting into candidate words
    cleaned = re.sub(to_replace, '', text)
    for raw in re.split(whitespace_delimiters, cleaned):
        # splitting can yield empty strings at the edges of the text
        if not raw:
            continue

        lowered = raw.lower()
        # stopwords are checked before stemming, on the lowercased surface form
        if lowered in STOPWORDS:
            continue

        counts[stemmer.stem(lowered)] += 1

    return counts

In [319]:
# given the term frequencies for a single document (f_j), all document frequencies (d),
# the maximum frequencies for every keyword, and the positions word weights should be placed in,
# and the total number of documents,
# creates a vector which is the size of the vocabulary, and calculates the tf-idf weights
# for this particular set of frequencies.
def vectorize_document(freqs, doc_freqs, max_freqs, positions, n):
    """Compute the tf-idf vector of a single document.

    For every keyword of the document the weight is
    ``(freqs[k] / max_freqs[k]) * log2(n / doc_freqs[k])``; every other slot stays 0.

    Parameters
    ----------
    freqs : mapping of str to int
        Term frequencies of this document.
    doc_freqs : mapping of str to int
        Number of documents each term occurs in.
    max_freqs : mapping of str to int
        Normalising frequency for each term.
    positions : mapping of str to int
        Index in the output vector assigned to each term.
    n : int
        Total number of documents.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(positions)`` holding the tf-idf weights.
    """
    weights = np.zeros(len(positions))
    for term, count in freqs.items():
        normalized_tf = count / max_freqs[term]
        idf = math.log2(n / doc_freqs[term])
        weights[positions[term]] = normalized_tf * idf

    return weights

# given doc_freqs, a dictionary of terms to the number of documents in which the terms occur,
# and name_to_freqs, a dict of dicts mapping of training file names to their respective word frequencies
# computes - for every keyword in doc_freqs - the maximum frequency of the word across all documents 
# we need this for normalizing the keyword frequencies when we vectorize
def max_term_frequencies(doc_freqs, name_to_freqs):
    """Find, for every keyword, its highest frequency in any single document.

    The result is used to normalise keyword frequencies when documents are vectorized.
    The maxima are gathered in one pass over the documents rather than by scanning
    every document once per keyword.

    Parameters
    ----------
    doc_freqs : mapping of str to int
        Keywords of interest (only the keys are used).
    name_to_freqs : mapping of str to (mapping of str to int)
        Per-document keyword frequencies, keyed by document name.

    Returns
    -------
    dict
        One entry per keyword of ``doc_freqs``; a keyword that appears in no document
        gets 0. Neither input mapping is modified.
    """
    running_max = {}
    for freqs in name_to_freqs.values():
        for keyword, count in freqs.items():
            if count > running_max.get(keyword, 0):
                running_max[keyword] = count

    # restrict the result to the requested keywords; .get avoids inserting keys into
    # (or raising on) mappings that never saw the keyword
    return {keyword: running_max.get(keyword, 0) for keyword in doc_freqs}
            
def extract_words(data_file: str):
    """Read ``data_file`` in full and return the result of ``tokenize`` on its text.

    Parameters
    ----------
    data_file : str
        Path of the document to read.
    """
    with open(data_file, 'r') as handle:
        contents = handle.read()

    return tokenize(contents)

def vectorize_dataset(train_directory: str, return_ground_truth: bool = False):
    """Build the tf-idf matrix for every document found under ``train_directory``.

    ``train_directory`` is scanned for sub-directories; each sub-directory name is
    taken as an author and each entry inside it as one of that author's documents.
    Entries of ``train_directory`` that are not directories are skipped.

    Parameters
    ----------
    train_directory : str
        Root directory containing one sub-directory per author.
    return_ground_truth : bool, optional
        When True, also return the file-name -> author table. Defaults to False,
        in which case only the tf-idf DataFrame is returned.

    Returns
    -------
    pandas.DataFrame
        One row per data file (indexed by file name) and one integer-labelled column
        per vocabulary term; column ``i`` corresponds to the ``i``-th term of the
        sorted vocabulary.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Only when ``return_ground_truth`` is True: the tf-idf DataFrame followed by a
        DataFrame indexed by sorted file name with a single ``'Author'`` column.
    """
    # data file -> author mapping so we can create a ground truth table
    authors = {}
    # maps training file name -> dict(keyword -> keyword count)
    name_to_freqs = {}
    # keyword -> number of documents containing it
    doc_freqs = defaultdict(int)
    total_documents = 0

    for author_dir in os.listdir(train_directory):
        fullpath = os.path.join(train_directory, author_dir)
        # stray files next to the author folders would make os.listdir(fullpath) raise
        if not os.path.isdir(fullpath):
            continue

        for data_file in os.listdir(fullpath):
            # NOTE(review): documents are keyed by bare file name, so the same file name
            # under two authors overwrites the earlier entry here while still being
            # counted twice in doc_freqs and total_documents.
            authors[data_file] = author_dir

            # determine the term frequency for all words in this data file
            freqs = extract_words(os.path.join(fullpath, data_file))

            # each distinct word of the document counts once towards its document frequency
            for word in freqs:
                doc_freqs[word] += 1

            # store the word frequencies for this particular data file
            name_to_freqs[data_file] = freqs

            total_documents += 1

    # compute the maximum frequencies for every term
    max_freqs = max_term_frequencies(doc_freqs, name_to_freqs)

    # we are creating a vector which is the length of our vocabulary set,
    # so we must assign each word a unique 'dimension' in this vector
    positions = dict(zip(sorted(doc_freqs), range(len(doc_freqs))))

    rows = [
        vectorize_document(
            name_to_freqs[train_file],
            doc_freqs,
            max_freqs,
            positions,
            total_documents)
        for train_file in name_to_freqs
    ]

    df = pd.DataFrame(rows, index=list(name_to_freqs), columns=list(range(len(doc_freqs))))
    if not return_ground_truth:
        return df

    # ground truth is derived from the files actually read, never from notebook globals
    file_names = sorted(authors)
    ground_truth = pd.DataFrame(
        {'Author': [authors[file_name] for file_name in file_names]},
        index=file_names)
    return df, ground_truth

In [320]:
# Build the tf-idf DataFrame for the training documents under ./C50/C50train/
# (one row per data file, one column per vocabulary term).
d = vectorize_dataset('./C50/C50train/')

In [321]:
# Show the tf-idf DataFrame; the notebook renders a truncated view of its rows and columns.
d

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21712,21713,21714,21715,21716,21717,21718,21719,21720,21721
147604newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196812newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
219316newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251225newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177958newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224725newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
233590newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207278newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
236474newsML.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
