In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import os
import random
from string import punctuation

In [3]:
import numpy as np
import pandas as pd

In [4]:
data_dir = 'data/20_newsgroups/'

In [5]:
def get_targets(data_dir):
    # Assign target values to each of the classes in the dataset
    targets = {}
    for i, newsgroup in enumerate(os.listdir(data_dir)):
        targets[newsgroup] = i
    return targets

# Assigning a target value to each document class

In [6]:
targets_dict = get_targets(data_dir)
targets_dict

{'alt.atheism': 0,
 'rec.autos': 1,
 'comp.windows.x': 2,
 'sci.med': 3,
 'sci.crypt': 4,
 'comp.os.ms-windows.misc': 5,
 'talk.politics.mideast': 6,
 'talk.politics.misc': 7,
 'sci.electronics': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'comp.graphics': 11,
 'sci.space': 12,
 'talk.politics.guns': 13,
 'comp.sys.mac.hardware': 14,
 'misc.forsale': 15,
 'talk.religion.misc': 16,
 'rec.motorcycles': 17,
 'comp.sys.ibm.pc.hardware': 18,
 'soc.religion.christian': 19}

In [7]:
def get_data_paths(data_dir):
    X_paths, Y = [], []
    targets_dict = get_targets(data_dir)
    for newsgroup_dir in os.listdir(data_dir):
        class_path = os.path.join(data_dir, newsgroup_dir)
        for text_file in os.listdir(class_path):
            file_path = os.path.join(class_path, text_file)
            try:
                with open(file_path, 'r') as fp:
                    x = fp.readlines()
            except UnicodeDecodeError:
                print(f'DecodeError, ignoring -- {file_path}')
                os.remove(file_path)
                continue
            X_paths.append(file_path)
            Y.append(targets_dict.get(newsgroup_dir))
            
    return X_paths, Y
    

In [8]:
X_paths, Y = get_data_paths(data_dir)

In [9]:
print(f'Total data samples: {len(Y)}')

Total data samples: 19924


#### Randomly checking if the data is correct or not

In [10]:
random.sample(X_paths, 5)

['data/20_newsgroups/misc.forsale/74749',
 'data/20_newsgroups/talk.politics.guns/53324',
 'data/20_newsgroups/talk.politics.misc/178729',
 'data/20_newsgroups/comp.graphics/38571',
 'data/20_newsgroups/comp.os.ms-windows.misc/10072']

In [11]:
random.sample(Y, 5)

[1, 19, 7, 11, 3]

In [12]:
def split_train_test(X, y, test_pct=0.5):
    total_len = len(y)
    train_len = int(test_pct*total_len)
    train_indices = random.sample(range(total_len), train_len)
    test_indices = [k for k in range(total_len) if k not in train_indices]
    X_train, y_train, X_test, y_test = [], [], [], []
    for i in train_indices:
        X_train.append(X[i])
        y_train.append(y[i])
        
    for i in test_indices:
        X_test.append(X[i])
        y_test.append(y[i])
    
    return X_train, y_train, X_test, y_test

#### Stop Words taken from NLTK corpora
- These words are very common and do not contribute much to the semantic meaning of a text document
- So, I am filtering out these words from the documents

In [13]:
# stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
#  'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
#  'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
#  'each', 'few', 'for', 'from', 'further', 
#  'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
#  'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
#  'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
#  "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
#  'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours' 'ourselves', 'out', 'over', 'own',
#  'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 
#  'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 
#  "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
#  'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
#  "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 
#  'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 
#  'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
#  '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [14]:
stop_words = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]

In [15]:
# stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

#### Remove headers from the document text
- There are a couple of line breaks after header information in each file
- So, check for that and remove everything above that

In [16]:
def remove_headers(lines):
    for i, line in enumerate(lines):
        # First make sure that the bytecodes read is decoded
        line = line.decode(encoding='utf-8')
        if line == '\n':
            break
    return lines[i+1:]

#### Remove whitespaces and stop words from every line

In [17]:
def remove_digits(word):
    for i in range(10):
        word = word.replace(str(i), '')
    return word

In [18]:
def remove_punctuations(word):
    all_punctuations = punctuation.replace("'", "")
    # Also, add tabs
    all_punctuations += '\t'
    table = str.maketrans('', '', all_punctuations)
    return word.translate(table)

In [19]:
def pre_process(words):
    """
    Takes in a list of words and applies some preprocessing
    1. Remove numbers from string
    2. Remove punctuations
    3. Remove quotes from words if present
    """
    processed_words = []
    for word in words:
        # Remove numbers from words
        word = remove_digits(word)

        # Remove punctuations
        word = remove_punctuations(word)

        # Do not process empty or one character strings
        if len(word) < 2:
            continue

        # Also check for quoted words and remove the quotes
        if word[0] in ["'", '"']:
            word = word[1:]
        if word[-1] in ["'", '"']:
            word = word[:-1]
            
        processed_words.append(word)
    
    return processed_words

In [20]:
def validate_line(line):
    # Return a list of valid words
    words = line.replace('\n', '').strip().split(' ')
    words = pre_process(words)
    return words

In [21]:
def read_file(file_path):
    try:
        with open(file_path, 'rb') as file:
            lines = file.readlines()
        valid_lines = remove_headers(lines)
        valid_words = []
        for line in valid_lines:
            # Decode byte words to string on each line
            line = line.decode(encoding='utf-8')
            processed_line = validate_line(line)
            for word in processed_line:
                word = word.lower()
                if len(word) > 1 and word not in stop_words:
                    valid_words.append(word)
                    
    except Exception as error:
        # print(f'ERROR: {error} || FILE_NAME: {file_path}')
        return [], 1
    
    return valid_words, 0

In [22]:
read_file('data/20_newsgroups/alt.atheism/54238')[0][:10]

['article',
 'cvmrzdarksideosrheuoknoredu',
 'bilokcforumosrheedu',
 'conner',
 'writes',
 'myth',
 'refer',
 'convoluted',
 'counterfeit',
 'athiests']

## Selecting features for the dataset

In [23]:
def get_features(X, n_features=4000, reject_words=0):
    """Goes through the entire training set and gets top "n_features" words appeared in the documents along with their frequencies"""
    all_words = []
    file_errors = 0
    for file_path in X:
        words, has_error = read_file(file_path)
        file_errors += has_error
        for w in words:
            all_words.append(w)
            
    words, counts = np.unique(np.array(all_words), return_counts=True)
    freq, words = (list(i) for i in zip(*sorted(zip(counts, words), reverse=True)))
    # print(len(words), words[:10], freq[:10])
    # print(f'Total file encoding errors: {file_errors}')
    
    # Return the 4000 words removing the first reject_words (as they are very common and won't be useful in differentiating among the documents)
    # in the whole dataset
    return words[reject_words:n_features]

In [24]:
def doc_word_freq(X):
    """
        Returns a list of dictionaries that contain the frequencies of words in each document
        --> [{'word1': 3, ...} ...]
    """
    word_freq = []
    for file_path in X:
        words, has_error = read_file(file_path)
        words, counts = np.unique(np.array(words), return_counts=True)
        word_counts = {}
        for i, word in enumerate(words):
            word_counts[word] = counts[i]
        
        word_freq.append(word_counts)
    return word_freq

In [25]:
def create_data(X, feature_words):
    X_data = []
    word_freq = doc_word_freq(X)
    for doc_words in word_freq:
        # doc_words is a dict that contains words in that document along with their number of appearences
        doc_data = []
        for f_word in feature_words:
            if f_word in doc_words.keys():
                # Add the frequency for the word to create a feature vector for training set
                doc_data.append(doc_words[f_word])
            else:
                doc_data.append(0)
        X_data.append(doc_data)
    return np.array(X_data)

In [26]:
def get_train_test(X_train, y_train, X_test, y_test, n_features=4000, reject_words=0):
    feature_words = get_features(X_train, n_features, reject_words)
    X_train = create_data(X_train, feature_words)
    X_test = create_data(X_test, feature_words)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    print(f'Train samples: {len(X_train)} || Test samples: {len(X_test)}')
    return X_train, y_train, X_test, y_test

# Text Classification with Naive Bayes

In [27]:
class NaiveBayes:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.prior = None
        self._is_trained = False
    
    def fit(self, X_train, y_train):
        n = X_train.shape[0]
        
        # Separate data in X_train by its classes
        X_by_class = np.array([X_train[y_train == c] for c in np.unique(y_train)])
        self.prior = np.array([len(X_class)/n for X_class in X_by_class])
        
        # Get word counts
        self.word_counts = np.array([row.sum(axis=0) for row in X_by_class]) + self.alpha
        self.lk_word = self.word_counts / self.word_counts.sum(axis=1).reshape(-1, 1)
        
        self._is_trained = True
        return self
    
    def predict(self, X):
        return self.predict_prob(X).argmax(axis=1)
    
    def score(self, X_test, y_test):
        y_pred = self.predict_prob(X_test).argmax(axis=1)
        return np.mean(y_pred == y_test)
    
    def predict_prob(self, X):
        if not self._is_trained:
            print('Model not trained yet!!')
            
        # Go through each input vector to calculate the conditional probabilities
        class_nums = np.zeros(shape=(X.shape[0], self.prior.shape[0]))
        for i, x in enumerate(X):
            word_exists = x.astype(bool)
            lk_words_present = self.lk_word[:, word_exists] ** x[word_exists]
            lk_message = (lk_words_present).prod(axis=1)
            class_nums[i] = lk_message * self.prior
            
        normalize_term = class_nums.sum(axis=1).reshape(-1, 1)
        conditional_probs = class_nums / normalize_term
        return conditional_probs

In [31]:
def k_fold(X_paths, Y, k=5, n_features=5000, increment=1000, reject_words=0):
    for i in range(k):
        print(f'\nfold: {i} || n_features: {n_features} || reject_words: {reject_words}')
        X_train_list, y_train_list, X_test_list, y_test_list = split_train_test(X_paths, Y, test_pct=0.5)
        X_train, y_train, X_test, y_test = get_train_test(X_train_list, y_train_list, X_test_list, y_test_list, n_features, reject_words)
        alpha = random.uniform(0.5, 1.0)
        print(f'Alpha chosen: {alpha}')
        clf = NaiveBayes(alpha=alpha)
        clf.fit(X_train, y_train)
        print(f'Acc: {clf.score(X_test, y_test)}')
        #n_features += increment    

# Test some hyperparameters

In [29]:
k_fold(X_paths, Y, k=10, reject_words=900)


fold: 0 || n_features: 5000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.8131513439076901
Acc: 0.6844007227464365

fold: 1 || n_features: 6000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.8182395813367933
Acc: 0.6923308572575788

fold: 2 || n_features: 7000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.7050027565002497
Acc: 0.7020678578598675

fold: 3 || n_features: 8000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.8772637911906764
Acc: 0.6997590845211805

fold: 4 || n_features: 9000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.7751594113962157
Acc: 0.7087934149769123

fold: 5 || n_features: 10000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.8205057389889763
Acc: 0.7035735796024895

fold: 6 || n_features: 11000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen

# k-Fold Cross Validation

In [32]:
k_fold(X_paths, Y, k=10, n_features=14000, reject_words=900)


fold: 0 || n_features: 14000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.72379781297586
Acc: 0.7114033326641237

fold: 1 || n_features: 14000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.9437328497264166
Acc: 0.7073880746837984

fold: 2 || n_features: 14000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.5413693747693333
Acc: 0.7186307970287091

fold: 3 || n_features: 14000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.5612603118958358
Acc: 0.7143143946998595

fold: 4 || n_features: 14000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.5382103066097286
Acc: 0.711704477012648

fold: 5 || n_features: 14000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chosen: 0.669042708208162
Acc: 0.7115037141136318

fold: 6 || n_features: 14000 || reject_words: 900
Train samples: 9962 || Test samples: 9962
Alpha chose

# For comparison

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [35]:
X_train_list, y_train_list, X_test_list, y_test_list = split_train_test(X_paths, Y, test_pct=0.5)
X_train, y_train, X_test, y_test = get_train_test(X_train_list, y_train_list, X_test_list, y_test_list, n_features=14000, reject_words=900)

Train samples: 9962 || Test samples: 9962


In [36]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print(clf.score(X_test, y_test))

0.782975306163421
