In [297]:
import pandas as pd
import numpy as np
import time
from scipy.sparse import csr_matrix
import re
from nltk.stem.porter import *
from collections import defaultdict
import os.path
import math

In [298]:
# directory whose files are read as stopword lists
STOPWORDS_PATH = './stopwords'
# matches every character we strip from text: any non-word character that is
# not whitespace (punctuation, symbols), as well as runs of digits
to_replace = re.compile(r'(?!\s)(\W|\d+)')
# split on any run of whitespace (raw string, so that `\s` reaches the regex
# engine instead of being an invalid string escape that Python warns about)
whitespace_delimiters = re.compile(r'\s+')

In [299]:
# Builds one combined set of stopwords from an iterable of stopword file paths.
# Every line of every file has the characters matched by `to_replace` removed
# and surrounding whitespace stripped; whatever remains (if anything) is
# lowercased and added to the set.
def read_stopwords(files):
    collected = set()
    for path in files:
        with open(path) as handle:
            for raw_line in handle:
                # drop unwanted characters first, then the surrounding whitespace
                cleaned = to_replace.sub('', raw_line).strip()
                if not cleaned:
                    # nothing usable on this line
                    continue
                collected.add(cleaned.lower())

    return collected

In [300]:
# the set of stopwords to throw out: the union of the words read from every
# entry found directly inside STOPWORDS_PATH
STOPWORDS = read_stopwords([
    os.path.join(STOPWORDS_PATH, entry)
    for entry in os.listdir(STOPWORDS_PATH)
])

In [301]:
# Turns a chunk of text into stem counts.
# Characters matched by `to_replace` are removed, the remainder is split on
# whitespace, and each piece is lowercased. Pieces found in STOPWORDS are
# dropped; the others are stemmed with a Porter stemmer and counted.
# Returns a defaultdict mapping each stem to its number of occurrences.
def tokenize(text):
    porter = PorterStemmer()
    counts = defaultdict(lambda: 0)

    # strip off all the undesirable bits (punctuation, numbers, etc.)
    cleaned = to_replace.sub('', text)
    for token in whitespace_delimiters.split(cleaned):
        # splitting can produce empty strings (e.g. at the edges of the text)
        if not token:
            continue

        # normalize to lowercase before the stopword check
        lowered = token.lower()
        if lowered in STOPWORDS:
            continue

        counts[porter.stem(lowered)] += 1

    return counts

In [306]:
# Builds the tf-idf weight vector for a single document.
#   freqs     - keyword -> count of the keyword within this document
#   doc_freqs - keyword -> number of documents containing the keyword
#   max_freqs - keyword -> value the keyword's count is normalized by
#   positions - keyword -> index of the keyword's slot in the output vector
#   n         - total number of documents
# Returns a numpy array of length len(positions); slots belonging to keywords
# that do not appear in `freqs` stay at zero.
def vectorize_document(freqs, doc_freqs, max_freqs, positions, n):
    weights = np.zeros(len(positions))
    for term, count in freqs.items():
        # term frequency, normalized by the keyword's maximum frequency
        tf = count / max_freqs[term]
        # inverse document frequency: keywords in fewer documents weigh more
        idf = math.log2(n / doc_freqs[term])
        weights[positions[term]] = tf * idf

    return weights

# Computes, for every keyword in doc_freqs, the highest count that keyword
# reaches in any single document. We need this for normalizing the keyword
# frequencies when we vectorize.
#   doc_freqs     - keyword -> number of documents in which the keyword occurs
#                   (only the keys are used here)
#   name_to_freqs - file name -> dict(keyword -> count within that file)
# Returns a dict keyword -> maximum count, with keys in doc_freqs order.
# A keyword that occurs in no document (including the case of an empty
# name_to_freqs) maps to 0.
def max_term_frequencies(doc_freqs, name_to_freqs):
    # start every keyword at 0 so the result has exactly the keys of doc_freqs
    max_freqs = dict.fromkeys(doc_freqs, 0)

    # one pass over each document's own counts, instead of scanning every
    # document once per keyword
    for freqs in name_to_freqs.values():
        for keyword, count in freqs.items():
            # words that are not keys of doc_freqs are ignored
            if keyword in max_freqs and count > max_freqs[keyword]:
                max_freqs[keyword] = count

    return max_freqs
            
# Reads the entire file at `data_file` and returns the result of tokenize()
# applied to its text.
def extract_words(data_file: str):
    with open(data_file, 'r') as source:
        contents = source.read()
    return tokenize(contents)

# Vectorizes every document found below `train_directory` into tf-idf weights.
# The directory is expected to contain one sub-directory per author, each
# holding that author's data files.
#   train_directory     - root directory of the training data
#   return_ground_truth - when True, also return the file -> author table
# Returns a DataFrame with one row per data file (indexed by file name) and one
# integer-labelled column per vocabulary word (words in sorted order).
# With return_ground_truth=True, returns a tuple (df, ground_truth) where
# ground_truth is a DataFrame indexed by sorted file name with an 'Author' column.
# NOTE(review): files are keyed by their bare file name, so two authors with an
# identically named file would overwrite each other - confirm names are unique.
def vectorize_dataset(train_directory: str, return_ground_truth: bool = False):
    # data file -> author mapping so we can create the ground truth
    authors = {}
    # maps training file name -> dict(keyword -> keyword count)
    name_to_freqs = {}
    # keyword -> number of documents the keyword occurs in
    doc_freqs = defaultdict(int)
    total_documents = 0

    for author_dir in os.listdir(train_directory):
        fullpath = os.path.join(train_directory, author_dir)
        # stray non-directory entries in the root would make listdir() fail
        if not os.path.isdir(fullpath):
            continue

        for data_file in os.listdir(fullpath):
            authors[data_file] = author_dir

            # determine the term frequency for all words in this data file
            freqs = extract_words(os.path.join(fullpath, data_file))

            # each distinct word in the document bumps its document frequency
            # by exactly one
            for word in freqs:
                doc_freqs[word] += 1

            # store the word frequencies for this particular data file
            name_to_freqs[data_file] = freqs
            total_documents += 1

    # compute the maximum frequencies for every term
    max_freqs = max_term_frequencies(doc_freqs, name_to_freqs)

    # each vector is as long as the vocabulary, so every word is assigned a
    # unique 'dimension' (its rank in sorted order)
    positions = {word: slot for slot, word in enumerate(sorted(doc_freqs))}

    rows = [
        vectorize_document(freqs, doc_freqs, max_freqs, positions, total_documents)
        for freqs in name_to_freqs.values()
    ]
    df = pd.DataFrame(
        rows,
        index=list(name_to_freqs),
        columns=list(range(len(doc_freqs))))

    if not return_ground_truth:
        return df

    # ground truth is built from `authors` (the file -> author mapping gathered
    # above), not from any notebook-level global
    names = sorted(authors)
    ground_truth = pd.DataFrame(
        {'Author': [authors[name] for name in names]},
        index=names)
    return df, ground_truth

In [307]:
# vectorize the training corpus; `d` is the DataFrame returned by
# vectorize_dataset (one row per training file)
d = vectorize_dataset('./C50/C50train/')

processed: 0 / 2500
processed: 1 / 2500
processed: 2 / 2500
processed: 3 / 2500
processed: 4 / 2500
processed: 5 / 2500
processed: 6 / 2500
processed: 7 / 2500
processed: 8 / 2500
processed: 9 / 2500
processed: 10 / 2500
processed: 11 / 2500
processed: 12 / 2500
processed: 13 / 2500
processed: 14 / 2500
processed: 15 / 2500
processed: 16 / 2500
processed: 17 / 2500
processed: 18 / 2500
processed: 19 / 2500
processed: 20 / 2500
processed: 21 / 2500
processed: 22 / 2500
processed: 23 / 2500
processed: 24 / 2500
processed: 25 / 2500
processed: 26 / 2500
processed: 27 / 2500
processed: 28 / 2500
processed: 29 / 2500
processed: 30 / 2500
processed: 31 / 2500
processed: 32 / 2500
processed: 33 / 2500
processed: 34 / 2500
processed: 35 / 2500
processed: 36 / 2500
processed: 37 / 2500
processed: 38 / 2500
processed: 39 / 2500
processed: 40 / 2500
processed: 41 / 2500
processed: 42 / 2500
processed: 43 / 2500
processed: 44 / 2500
processed: 45 / 2500
processed: 46 / 2500
processed: 47 / 2500
pr

processed: 813 / 2500
processed: 814 / 2500
processed: 815 / 2500
processed: 816 / 2500
processed: 817 / 2500
processed: 818 / 2500
processed: 819 / 2500
processed: 820 / 2500
processed: 821 / 2500
processed: 822 / 2500
processed: 823 / 2500
processed: 824 / 2500
processed: 825 / 2500
processed: 826 / 2500
processed: 827 / 2500
processed: 828 / 2500
processed: 829 / 2500
processed: 830 / 2500
processed: 831 / 2500
processed: 832 / 2500
processed: 833 / 2500
processed: 834 / 2500
processed: 835 / 2500
processed: 836 / 2500
processed: 837 / 2500
processed: 838 / 2500
processed: 839 / 2500
processed: 840 / 2500
processed: 841 / 2500
processed: 842 / 2500
processed: 843 / 2500
processed: 844 / 2500
processed: 845 / 2500
processed: 846 / 2500
processed: 847 / 2500
processed: 848 / 2500
processed: 849 / 2500
processed: 850 / 2500
processed: 851 / 2500
processed: 852 / 2500
processed: 853 / 2500
processed: 854 / 2500
processed: 855 / 2500
processed: 856 / 2500
processed: 857 / 2500
processed:

processed: 1384 / 2500
processed: 1385 / 2500
processed: 1386 / 2500
processed: 1387 / 2500
processed: 1388 / 2500
processed: 1389 / 2500
processed: 1390 / 2500
processed: 1391 / 2500
processed: 1392 / 2500
processed: 1393 / 2500
processed: 1394 / 2500
processed: 1395 / 2500
processed: 1396 / 2500
processed: 1397 / 2500
processed: 1398 / 2500
processed: 1399 / 2500
processed: 1400 / 2500
processed: 1401 / 2500
processed: 1402 / 2500
processed: 1403 / 2500
processed: 1404 / 2500
processed: 1405 / 2500
processed: 1406 / 2500
processed: 1407 / 2500
processed: 1408 / 2500
processed: 1409 / 2500
processed: 1410 / 2500
processed: 1411 / 2500
processed: 1412 / 2500
processed: 1413 / 2500
processed: 1414 / 2500
processed: 1415 / 2500
processed: 1416 / 2500
processed: 1417 / 2500
processed: 1418 / 2500
processed: 1419 / 2500
processed: 1420 / 2500
processed: 1421 / 2500
processed: 1422 / 2500
processed: 1423 / 2500
processed: 1424 / 2500
processed: 1425 / 2500
processed: 1426 / 2500
processed: 

processed: 1901 / 2500
processed: 1902 / 2500
processed: 1903 / 2500
processed: 1904 / 2500
processed: 1905 / 2500
processed: 1906 / 2500
processed: 1907 / 2500
processed: 1908 / 2500
processed: 1909 / 2500
processed: 1910 / 2500
processed: 1911 / 2500
processed: 1912 / 2500
processed: 1913 / 2500
processed: 1914 / 2500
processed: 1915 / 2500
processed: 1916 / 2500
processed: 1917 / 2500
processed: 1918 / 2500
processed: 1919 / 2500
processed: 1920 / 2500
processed: 1921 / 2500
processed: 1922 / 2500
processed: 1923 / 2500
processed: 1924 / 2500
processed: 1925 / 2500
processed: 1926 / 2500
processed: 1927 / 2500
processed: 1928 / 2500
processed: 1929 / 2500
processed: 1930 / 2500
processed: 1931 / 2500
processed: 1932 / 2500
processed: 1933 / 2500
processed: 1934 / 2500
processed: 1935 / 2500
processed: 1936 / 2500
processed: 1937 / 2500
processed: 1938 / 2500
processed: 1939 / 2500
processed: 1940 / 2500
processed: 1941 / 2500
processed: 1942 / 2500
processed: 1943 / 2500
processed: 