In [None]:
# Jupyter Notebook to parse 20_newsgroup data set
# Some code taken from https://github.com/gokriznastic/20-newsgroups_text-classification/blob/master/Multinomial%20Naive%20Bayes-%20BOW%20with%20TF.ipynb

from os import listdir
from os.path import isfile, join
import string
import numpy as np
from random import seed
from random import random
from random import randint
from datetime import datetime

In [49]:
# function to parse 20 newsgroup data set
# input into function is the group we want to focus on - if group == 'sci.space', bags with sci.space will be labeled positive
def parse_20newsgroups(group, random_seed=None):
    """Build multiple-instance-learning bags from the 20_newsgroups corpus.

    Documents are read from the '20_newsgroups' directory (one sub-folder per
    newsgroup). Each bag holds `num_instances` documents. The first
    `num_positive_bags` bags contain at least one document of `group`
    (every further slot is positive with probability `positivity_rate`);
    the remaining bags contain only documents from other groups. A document
    is used at most once across all bags.

    Parameters
    ----------
    group : str
        Folder name of the newsgroup treated as the positive class.
    random_seed : int, float, str, bytes, bytearray or None, optional
        Seed passed to `random.seed`. None (the default) seeds from system
        entropy / time, i.e. every call draws different bags.

    Returns
    -------
    bags : list
        One entry per bag; each bag is a list of per-document feature
        vectors produced by `make_features`.
    labels : list of int
        1 for each positive bag followed by -1 for each negative bag.
    pathnames : list
        Same nesting as `bags`, holding the document path of every instance.
    vocab : list
        Vocabulary returned by `create_vocabulary` for the chosen documents.

    Raises
    ------
    ValueError
        If there are not enough positive or negative documents to fill the
        requested bags (for example when `group` matches no folder).
    """
    # set parameters
    num_bags = 100
    num_positive_bags = 50
    num_instances = 50
    num_features = 200
    positivity_rate = 0.03

    my_path = '20_newsgroups'

    # only sub-directories are newsgroups; stray files next to them are skipped.
    # sorting makes the document order (and so a seeded run) reproducible
    folders = sorted(f for f in listdir(my_path) if not isfile(join(my_path, f)))

    # put all positive docs into one list and negative docs in another
    positive_docs = []
    negative_docs = []
    for folder_name in folders:
        folder_path = join(my_path, folder_name)
        target = positive_docs if folder_name == group else negative_docs
        for file_name in sorted(listdir(folder_path)):
            target.append(join(folder_path, file_name))

    # seeding with a datetime object is rejected by Python 3.11+, so the seed
    # is an explicit, optional argument instead
    seed(random_seed)

    pathnames = []

    # we start with the positive bags
    for _ in range(num_positive_bags):
        # every positive bag is guaranteed one positive document
        path_bag = [_draw_random_document(positive_docs, "positive")]

        # fill the rest of the bag, with positivity rate specified at the top of the function
        for _ in range(num_instances - 1):
            if random() < positivity_rate:
                path_bag.append(_draw_random_document(positive_docs, "positive"))
            else:
                path_bag.append(_draw_random_document(negative_docs, "negative"))
        pathnames.append(path_bag)

    # create negative bags
    for _ in range(num_bags - num_positive_bags):
        pathnames.append([_draw_random_document(negative_docs, "negative")
                          for _ in range(num_instances)])

    # create vocabulary from exactly the documents that were selected
    vocab = create_vocabulary(num_features, pathnames)

    # create bags: one feature vector per document, grouped like pathnames
    bags = [[make_features(vocab, path) for path in paths] for paths in pathnames]

    # create labels: positive bags were generated first
    labels = [1] * num_positive_bags + [-1] * (num_bags - num_positive_bags)

    return bags, labels, pathnames, vocab


def _draw_random_document(pool, kind):
    """Remove and return one randomly chosen path from `pool`.

    The path is removed so the same document is never used twice. `kind` is
    only used to make the error message readable.
    """
    if not pool:
        raise ValueError("not enough " + kind + " documents left to fill the requested bags")
    return pool.pop(randint(0, len(pool) - 1))

In [None]:
#creates the list of words we will use
def create_vocabulary(num_features, pathnames):
    """Return the `num_features` most frequent words over all given documents.

    Parameters
    ----------
    num_features : int
        Maximum number of words to keep.
    pathnames : list of list of str
        Document paths grouped by bag; every path is read once.

    Returns
    -------
    list
        Words ordered by descending frequency (ties broken by descending
        word order). Shorter than `num_features` if fewer distinct words
        exist, and empty if no word survives tokenization.
    """
    all_words = []
    for paths in pathnames:
        for path in paths:
            # the context manager closes the file even if reading fails;
            # undecodable bytes are dropped instead of aborting the whole run
            with open(path, "r", errors="ignore") as f:
                text_lines = remove_metadata(f.readlines())

            # tokenize each line with the helper function tokenize_sentence
            for line in text_lines:
                all_words.extend(tokenize_sentence(line))

    # find unique words and their frequency
    words, counts = np.unique(np.asarray(all_words), return_counts=True)

    # sort (count, word) pairs so the most frequent words come first;
    # an empty corpus simply yields an empty ranking
    ranked = sorted(zip(counts, words), reverse=True)

    # choose n number of top words
    vocab = [word for _, word in ranked[:num_features]]

    return vocab

In [None]:
#make the features for the given pathname
def make_features(vocab, pathname):
    """Turn one document into a word-count vector aligned with `vocab`.

    Parameters
    ----------
    vocab : sequence of str
        Words that define the feature positions.
    pathname : str
        Path of the document to read.

    Returns
    -------
    list
        `features[i]` is how often `vocab[i]` occurs in the document body
        (0 when the word does not occur).
    """
    # the context manager closes the file even if reading fails;
    # undecodable bytes are dropped instead of aborting the whole run
    with open(pathname, "r", errors="ignore") as f:
        text_lines = remove_metadata(f.readlines())

    # tokenize each line with the helper function tokenize_sentence
    doc_words = []
    for line in text_lines:
        doc_words.extend(tokenize_sentence(line))

    # find unique words and their frequency
    words, counts = np.unique(np.asarray(doc_words), return_counts=True)

    # create dictionary words -> counts
    dictionary = dict(zip(words, counts))

    # one count per vocabulary word, in vocabulary order
    return [dictionary.get(term, 0) for term in vocab]

In [None]:
#function to remove metadata
def remove_metadata(lines):
    """Drop the header block of a message and return only its body lines.

    The header is everything up to and including the first line that is
    exactly a newline. Later blank lines belong to the body and are kept.

    Parameters
    ----------
    lines : list of str
        Lines as returned by `readlines()` (newline characters included).

    Returns
    -------
    list of str
        The lines after the first blank line. If there is no blank line the
        input has no header/body separator, so it is treated as header only
        and an empty list is returned.
    """
    for index, line in enumerate(lines):
        if line == '\n':
            return lines[index + 1:]
    # no separator found: previously this crashed on an unbound variable
    return []

In [None]:
#function to convert a sentence into list of words
def tokenize_sentence(line):
    """Split one line of text into cleaned words with stopwords removed.

    Parameters
    ----------
    line : str
        A single line, with or without its trailing newline.

    Returns
    -------
    list of str
        The words of the line after `preprocess` and `remove_stopwords`.
    """
    # strip() already removes the trailing newline; slicing off the last
    # character would eat a real letter when the line has no newline
    # (typically the last line of a file)
    words = line.strip().split(" ")
    words = preprocess(words)
    words = remove_stopwords(words)

    return words

In [None]:
#function to preprocess the words list to remove punctuations

def preprocess(words):
    """Clean a list of raw tokens.

    For every token: tabs and punctuation (except the apostrophe) are
    removed, surrounding apostrophes used as quotes are stripped, purely
    numeric tokens are dropped, the token is lower-cased, and tokens with
    fewer than three characters are dropped.

    Parameters
    ----------
    words : list of str
        Raw tokens, e.g. a line split on spaces.

    Returns
    -------
    list of str
        The cleaned tokens in their original order.
    """
    # the character ' appears in a lot of stopwords and changes the meaning of
    # words if removed, so it is the one punctuation mark that is kept
    punctuations = string.punctuation.replace("'", "")
    # translate() with a deletion table removes tabs and punctuation in one pass
    strip_table = str.maketrans('', '', '\t' + punctuations)

    cleaned = []
    for word in words:
        word = word.translate(strip_table)
        if not word:
            # nothing left after removing punctuation
            continue

        # unquote words: a split quoted phrase yields tokens with a quote on
        # one side only, so leading and trailing quotes are handled separately
        if word[0] == "'" and word[-1] == "'":
            word = word[1:-1]
        elif word[0] == "'":
            word = word[1:]
        elif word[-1] == "'":
            word = word[:-1]

        # just-numeric strings carry no meaning for text classification
        if word.isdigit():
            continue

        # normalize case, then drop empty, 1- and 2-character words
        word = word.lower()
        if len(word) > 2:
            cleaned.append(word)

    return cleaned

In [None]:
#function to remove stopwords
# the stopwords live in a frozenset that is built once, so each lookup is a
# hash probe instead of a scan over a list rebuilt on every call
_STOPWORDS = frozenset([
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
    'each', 'few', 'for', 'from', 'further',
    'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
    'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
    'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
    "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
    # 'ours' and 'ourselves' need the comma between them: without it Python
    # joins the two literals into the single string 'oursourselves'
    'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such',
    'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd",
    "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very',
    'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
    "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'will', 'with', "won't", 'would', "wouldn't",
    'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves',
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
    '4th', '5th', '6th', '7th', '8th', '9th', '10th',
])


def remove_stopwords(words):
    """Return `words` without stopwords, keeping the original order.

    Matching is exact and case-sensitive; the stopword list is lower-case,
    so callers are expected to pass lower-cased words.

    Parameters
    ----------
    words : list of str

    Returns
    -------
    list of str
        A new list containing every word that is not a stopword.
    """
    return [word for word in words if word not in _STOPWORDS]

In [None]:
#a simple helper function to convert a 2D array to 1D, without using numpy
def flatten(list):
    """Concatenate the items of every inner iterable into one new flat list.

    Only one level of nesting is removed and the order of items is kept.
    """
    return [item for inner in list for item in inner]

In [50]:
# Run the parser once with 'alt.atheism' as the positive group and time the call.
time = datetime.now()  # wall-clock start of the run
bags, labels, pathnames, vocab = parse_20newsgroups('alt.atheism')
time = datetime.now() - time  # `time` now holds the elapsed duration, printed in the next cell

In [51]:
# Report the dimensions of the parsed data set and how long parsing took.
print(f"# bags: {len(bags)}")
print(f"# instances: {len(bags[0])}")
print(f"# features: {len(bags[0][0])}")
print(f"# labels: {len(labels)}")
print(f"# of paths per bag: {len(pathnames[0])}")  # This should be equal to the # of instances
print(f"time taken: {time}")

# show one arbitrary instance to see what a feature vector looks like
print(bags[2][24])

# bags: 100
# instances: 50
# features: 200
# labels: 100
# of paths per bag: 50
time taken: 0:00:15.663946
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
