In [1]:
import codecs
import os
import random
from string import punctuation

In [2]:
import numpy as np
import pandas as pd

In [3]:
data_dir = 'data/20_newsgroups/'

In [4]:
def get_targets(data_dir):
    # Assign target values to each of the classes in the dataset
    targets = {}
    for i, newsgroup in enumerate(os.listdir(data_dir)):
        targets[newsgroup] = i
    return targets

#### Assigning a target value to each document class

In [5]:
targets_dict = get_targets(data_dir)
targets_dict

{'alt.atheism': 0,
 'rec.autos': 1,
 'comp.windows.x': 2,
 'sci.med': 3,
 'sci.crypt': 4,
 'comp.os.ms-windows.misc': 5,
 'talk.politics.mideast': 6,
 'talk.politics.misc': 7,
 'sci.electronics': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'comp.graphics': 11,
 'sci.space': 12,
 'talk.politics.guns': 13,
 'comp.sys.mac.hardware': 14,
 'misc.forsale': 15,
 'talk.religion.misc': 16,
 'rec.motorcycles': 17,
 'comp.sys.ibm.pc.hardware': 18,
 'soc.religion.christian': 19}

In [6]:
def get_data_paths(data_dir):
    X_paths, Y = [], []
    targets_dict = get_targets(data_dir)
    for newsgroup_dir in os.listdir(data_dir):
        class_path = os.path.join(data_dir, newsgroup_dir)
        for text_file in os.listdir(class_path):
            X_paths.append(os.path.join(class_path, text_file))
            Y.append(targets_dict.get(newsgroup_dir))
            
    return X_paths, Y
    

In [7]:
X_paths, Y = get_data_paths(data_dir)

In [8]:
print(f'Total data samples: {len(Y)}')

Total data samples: 19997


#### Randomly checking if the data is correct or not

In [9]:
random.sample(X_paths, 5)

['data/20_newsgroups/talk.religion.misc/83722',
 'data/20_newsgroups/sci.electronics/53743',
 'data/20_newsgroups/sci.med/58894',
 'data/20_newsgroups/rec.motorcycles/105048',
 'data/20_newsgroups/sci.med/58084']

In [10]:
random.sample(Y, 5)

[14, 10, 13, 15, 17]

In [11]:
def split_train_test(X, y, test_pct=0.5):
    total_len = len(y)
    train_len = int(test_pct*total_len)
    train_indices = random.sample(range(total_len), train_len)
    test_indices = [k for k in range(total_len) if k not in train_indices]
    X_train, y_train, X_test, y_test = [], [], [], []
    for i in train_indices:
        X_train.append(X[i])
        y_train.append(y[i])
        
    for i in test_indices:
        X_test.append(X[i])
        y_test.append(y[i])
    
    return X_train, y_train, X_test, y_test

In [12]:
X_train, y_train, X_test, y_test = split_train_test(X_paths, Y, test_pct=0.5)

In [13]:
print(f'Training samples: {len(y_train)} || Testing samples: {len(y_test)}')

Training samples: 9998 || Testing samples: 9999


#### Stop Words taken from NLTK corpora
- These words are very common and do not contribute much to the semantic meaning of a text document
- So, I am filtering out these words from the documents

In [14]:
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
 'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
 'each', 'few', 'for', 'from', 'further', 
 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
 "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours' 'ourselves', 'out', 'over', 'own',
 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 
 'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 
 "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
 "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 
 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 
 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
 '4th', '5th', '6th', '7th', '8th', '9th', '10th']

#### Remove headers from the document text
- There are a couple of line breaks after header information in each file
- So, check for that and remove everything above that

In [15]:
def remove_headers(lines):
    for i, line in enumerate(lines):
        # First make sure that the bytecodes read is decoded
        line = line.decode(encoding='utf-8')
        if line == '\n':
            break
    return lines[i+1:]

#### Remove whitespaces and stop words from every line

In [25]:
def remove_digits(word):
    for i in range(10):
        word = word.replace(str(i), '')
    return word

In [17]:
def remove_punctuations(word):
    all_punctuations = punctuation.replace("'", "")
    # Also, add tabs
    all_punctuations += '\t'
    table = str.maketrans('', '', all_punctuations)
    return word.translate(table)

In [55]:
def pre_process(words):
    """
    Takes in a list of words and applies some preprocessing
    1. Remove numbers from string
    2. Remove punctuations
    3. Remove quotes from words if present
    """
    processed_words = []
    for word in words:
        # Remove numbers from words
        word = remove_digits(word)

        # Remove punctuations
        word = remove_punctuations(word)

        # Do not process empty or one character strings
        if len(word) < 2:
            continue

        # Also check for quoted words and remove the quotes
        if word[0] in ["'", '"']:
            word = word[1:]
        if word[-1] in ["'", '"']:
            word = word[:-1]
            
        processed_words.append(word)
    
    return processed_words

In [29]:
def remove_stop_words(words):
    # Remove stop words from a list of words
    # Also, remove empty strings and single char strings
    return [word.lower() for word in words if len(word)>1 and word not in stop_words]

In [43]:
def validate_line(line):
    # Return a list of valid words
    words = line.replace('\n', '').strip().split(' ')
    words = pre_process(words)
    return words

In [62]:
def read_file(file_path):
    try:
        with open(file_path, 'rb') as file:
            lines = file.readlines()
        valid_lines = remove_headers(lines)
        valid_words = []
        for line in valid_lines:
            # Decode byte words to string on each line
            line = line.decode(encoding='utf-8')
            processed_line = validate_line(line)
            for word in processed_line:
                word = word.lower()
                if len(word) > 1 and word not in stop_words:
                    valid_words.append(word)
                    
    except Exception as error:
        # print(f'ERROR: {error} || FILE_NAME: {file_path}')
        return [], 1
    
    return valid_words, 0

In [45]:
read_file('data/20_newsgroups/alt.atheism/54238')

['article',
 'cvmrzdarksideosrheuoknoredu',
 'bilokcforumosrheedu',
 'bill',
 'conner',
 'writes',
 'myth',
 'refer',
 'convoluted',
 'counterfeit',
 'athiests',
 'created',
 'make',
 'religion',
 'appear',
 'absurd',
 'rather',
 'approach',
 'religion',
 'including',
 'christainity',
 'rational',
 'manner',
 'debating',
 'claims',
 'stated',
 'atheists',
 'concoct',
 'outrageous',
 'parodies',
 'hold',
 'religious',
 'accountable',
 'beliefs',
 'accurately',
 'oxymoric',
 'term',
 'like',
 'reasonable',
 'atheist',
 'religious',
 'parodies',
 'atheistic',
 'paradies',
 'please',
 'substantiate',
 'parodies',
 'outrageous',
 'specifically',
 'iup',
 'outrageous',
 'many',
 'religions',
 'private',
 'note',
 'jennifer',
 'fakult',
 'post',
 'may',
 'contain',
 'following',
 'sarcasm',
 'cycnicism',
 'irony',
 'humor',
 'please',
 'aware',
 'possibility',
 'allow',
 'confused',
 'andor',
 'thrown',
 'loop',
 'doubt',
 'assume',
 'owners',
 'account',
 'take',
 'responsiblity',
 'confusio

## Selecting features for the dataset

In [63]:
def get_features(X, n_features=4000):
    """Goes through the entire training set and gets top "n_features" words appeared in the documents along with their frequencies"""
    all_words = []
    file_errors = 0
    for file_path in X:
        words, has_error = read_file(file_path)
        file_errors += has_error
        for w in words:
            all_words.append(w)
            
    words, counts = np.unique(np.array(all_words), return_counts=True)
    freq, words = (list(i) for i in zip(*sorted(zip(counts, words), reverse=True)))
    print(len(words), words[:10], freq[:10])
    print(f'Total file encoding errors: {file_errors}')
    
    # Return the top 4000 words in the whole dataset
    return words[:n_features]

In [66]:
feature_words = get_features(X_train, n_features=4000)

98680 ['writes', 'article', 'just', 'like', 'people', 'know', 'get', 'think', 'also', 'use'] [7291, 6225, 4969, 4920, 4897, 4534, 4163, 3988, 3642, 3241]
Total file encoding errors: 34


In [67]:
def doc_word_freq(X):
    """
        Returns a list of dictionaries that contain the frequencies of words in each document
        --> [{'word1': 3, ...} ...]
    """
    word_freq = []
    for file_path in X:
        words, has_error = read_file(file_path)
        words, counts = np.unique(np.array(words), return_counts=True)
        word_counts = {}
        for i, word in enumerate(words):
            word_counts[word] = counts[i]
        
        word_freq.append(word_counts)
    return word_freq

In [68]:
def create_data(X, feature_words):
    X_data = []
    word_freq = doc_word_freq(X)
    for doc_words in word_freq:
        # doc_words is a dict that contains words in that document along with their number of appearences
        doc_data = []
        for f_word in feature_words:
            if f_word in doc_words.keys():
                # Add the frequency for the word to create a feature vector for training set
                doc_data.append(doc_words[f_word])
            else:
                doc_data.append(0)
        X_data.append(doc_data)
    return np.array(X_data)

In [69]:
X_train_np = create_data(X_train, feature_words)

In [72]:
X_train_np[0][:100]

array([1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 3, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [73]:
X_test_np = create_data(X_test, feature_words)

In [74]:
print(len(X_train_np), len(X_test_np))

9998 9999
