In [51]:
import os
import random
from string import punctuation

In [2]:
import numpy as np
import pandas as pd

In [3]:
data_dir = 'data/20_newsgroups/'

In [4]:
def get_targets(data_dir):
    # Assign target values to each of the classes in the dataset
    targets = {}
    for i, newsgroup in enumerate(os.listdir(data_dir)):
        targets[newsgroup] = i
    return targets

#### Assigning a target value to each document class

In [5]:
targets_dict = get_targets(data_dir)
targets_dict

{'alt.atheism': 0,
 'rec.autos': 1,
 'comp.windows.x': 2,
 'sci.med': 3,
 'sci.crypt': 4,
 'comp.os.ms-windows.misc': 5,
 'talk.politics.mideast': 6,
 'talk.politics.misc': 7,
 'sci.electronics': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'comp.graphics': 11,
 'sci.space': 12,
 'talk.politics.guns': 13,
 'comp.sys.mac.hardware': 14,
 'misc.forsale': 15,
 'talk.religion.misc': 16,
 'rec.motorcycles': 17,
 'comp.sys.ibm.pc.hardware': 18,
 'soc.religion.christian': 19}

In [6]:
def get_data_paths(data_dir):
    X_paths, Y = [], []
    targets_dict = get_targets(data_dir)
    for newsgroup_dir in os.listdir(data_dir):
        class_path = os.path.join(data_dir, newsgroup_dir)
        for text_file in os.listdir(class_path):
            X_paths.append(os.path.join(class_path, text_file))
            Y.append(targets_dict.get(newsgroup_dir))
            
    return X_paths, Y
    

In [7]:
X_paths, Y = get_data_paths(data_dir)

In [10]:
print(f'Total data samples: {len(Y)}')

Total data samples: 19997


#### Randomly checking if the data is correct or not

In [11]:
random.sample(X_paths, 5)

['data/20_newsgroups/talk.politics.misc/178869',
 'data/20_newsgroups/rec.sport.hockey/54132',
 'data/20_newsgroups/misc.forsale/75994',
 'data/20_newsgroups/sci.crypt/16067',
 'data/20_newsgroups/sci.crypt/15217']

In [12]:
random.sample(Y, 5)

[11, 12, 18, 19, 5]

In [29]:
def split_train_test(X, y, test_pct=0.5):
    total_len = len(y)
    train_len = int(test_pct*total_len)
    train_indices = random.sample(range(total_len), train_len)
    test_indices = [k for k in range(total_len) if k not in train_indices]
    X_train, y_train, X_test, y_test = [], [], [], []
    for i in train_indices:
        X_train.append(X[i])
        y_train.append(y[i])
        
    for i in test_indices:
        X_test.append(X[i])
        y_test.append(y[i])
    
    return X_train, y_train, X_test, y_test

In [28]:
X_train, y_train, X_test, y_test = split_train_test(X_paths, Y, test_pct=0.5)

9998
[15460, 10377, 1494, 6048, 14801, 12374, 4360, 6435, 15024, 986]
9999


In [18]:
print(f'Training samples: {len(y_train)} || Testing samples: {len(y_test)}')

Training samples: 9998 || Testing samples: 12073


#### Stop Words taken from NLTK corpora
- These words are very common and do not contribute much to the semantic meaning of a text document
- So, I am filtering out these words from the documents

In [33]:
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
 'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
 'each', 'few', 'for', 'from', 'further', 
 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
 "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours' 'ourselves', 'out', 'over', 'own',
 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 
 'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 
 "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
 "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 
 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 
 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
 '4th', '5th', '6th', '7th', '8th', '9th', '10th']

#### Remove headers from the document text
- There are a couple of line breaks after header information in each file
- So, check for that and remove everything above that

In [34]:
def remove_headers(lines):
    for i, line in enumerate(lines):
        if line == '\n':
            break
    return lines[i+1:]

#### Remove whitespaces and stop words from every line

In [50]:
def has_num(word):
    for i in range(10):
        if str(i) in word:
            return True
    return False

In [52]:
def remove_punctuations(word):
    all_punctuations = string.punctuation.replace("'", "")
    # Also, add tabs
    all_punctuations += '\t'
    table = str.maketrans('', '', all_punctuations)
    return word.translate(table)

In [57]:
def pre_process(words):
    """
    Takes in a list of words and applies some preprocessing
    1. Remove numbers from string
    2. Remove punctuations
    3. Remove quotes from words if present
    """
    processed_words = []
    for word in words:
        # Remove numbers from words
        if has_num(word):
            continue
        # Remove punctuations
        word = remove_punctuations(word)
        
        # Also check for quoted words and remove the quotes
        if word[0] in ["'", '"']:
            word = word[1:]
        if word[-1] in ["'", '"']:
            word = word[:-1]
            
        processed_words.append(word)
    
    return processed_words

In [58]:
def remove_stop_words(words):
    # Remove stop words from a list of words
    # Also, remove empty strings and single char strings
    return [word.lower() for word in words if len(word)>1 and word not in stop_words]

In [59]:
def validate_line(line):
    words = line.replace('\n', '').strip().split(' ')
    words = pre_process(words)
    return remove_stop_words(words)

In [66]:
def read_file(file_path):
    with open(file_path, 'rb') as file:
        lines = file.readlines()
        print(lines)
        valid_lines = remove_headers(lines)
        valid_words = []
        for line in valid_lines:
            valid_words.append(validate_line(line))
            
    return valid_words

In [67]:
read_file('data/20_newsgroups/alt.atheism/49960')

[b'Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49960 alt.atheism.moderated:713 news.answers:7054 alt.answers:126\n', b'Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!bb3.andrew.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!magnus.acs.ohio-state.edu!usenet.ins.cwru.edu!agate!spool.mu.edu!uunet!pipex!ibmpcug!mantis!mathew\n', b'From: mathew <mathew@mantis.co.uk>\n', b'Newsgroups: alt.atheism,alt.atheism.moderated,news.answers,alt.answers\n', b'Subject: Alt.Atheism FAQ: Atheist Resources\n', b'Summary: Books, addresses, music -- anything related to atheism\n', b'Keywords: FAQ, atheism, books, music, fiction, addresses, contacts\n', b'Message-ID: <19930329115719@mantis.co.uk>\n', b'Date: Mon, 29 Mar 1993 11:57:19 GMT\n', b'Expires: Thu, 29 Apr 1993 11:57:19 GMT\n', b'Followup-To: alt.atheism\n', b'Distribution: world\n', b'Organization: Mantis Consultants, Cambridge. UK.\n', b'Approved: news-answers-request@mit.edu\n', b'Supersedes: <19930301143317@mantis.co.uk>\n', b'Lines: 290\n'

[]