In [1]:
import os
import random

In [2]:
import numpy as np
import pandas as pd

In [3]:
data_dir = 'data/20_newsgroups/'

In [4]:
def get_targets(data_dir):
    # Assign target values to each of the classes in the dataset
    targets = {}
    for i, newsgroup in enumerate(os.listdir(data_dir)):
        targets[newsgroup] = i
    return targets

#### Assigning a target value to each document class

In [5]:
targets_dict = get_targets(data_dir)
targets_dict

{'alt.atheism': 0,
 'rec.autos': 1,
 'comp.windows.x': 2,
 'sci.med': 3,
 'sci.crypt': 4,
 'comp.os.ms-windows.misc': 5,
 'talk.politics.mideast': 6,
 'talk.politics.misc': 7,
 'sci.electronics': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'comp.graphics': 11,
 'sci.space': 12,
 'talk.politics.guns': 13,
 'comp.sys.mac.hardware': 14,
 'misc.forsale': 15,
 'talk.religion.misc': 16,
 'rec.motorcycles': 17,
 'comp.sys.ibm.pc.hardware': 18,
 'soc.religion.christian': 19}

In [6]:
def get_data_paths(data_dir):
    X_paths, Y = [], []
    targets_dict = get_targets(data_dir)
    for newsgroup_dir in os.listdir(data_dir):
        class_path = os.path.join(data_dir, newsgroup_dir)
        for text_file in os.listdir(class_path):
            X_paths.append(os.path.join(class_path, text_file))
            Y.append(targets_dict.get(newsgroup_dir))
            
    return X_paths, Y
    

In [7]:
X_paths, Y = get_data_paths(data_dir)

In [10]:
print(f'Total data samples: {len(Y)}')

Total data samples: 19997


#### Randomly checking if the data is correct or not

In [11]:
random.sample(X_paths, 5)

['data/20_newsgroups/talk.politics.misc/178869',
 'data/20_newsgroups/rec.sport.hockey/54132',
 'data/20_newsgroups/misc.forsale/75994',
 'data/20_newsgroups/sci.crypt/16067',
 'data/20_newsgroups/sci.crypt/15217']

In [12]:
random.sample(Y, 5)

[11, 12, 18, 19, 5]

In [29]:
def split_train_test(X, y, test_pct=0.5):
    total_len = len(y)
    train_len = int(test_pct*total_len)
    train_indices = random.sample(range(total_len), train_len)
    test_indices = [k for k in range(total_len) if k not in train_indices]
    X_train, y_train, X_test, y_test = [], [], [], []
    for i in train_indices:
        X_train.append(X[i])
        y_train.append(y[i])
        
    for i in test_indices:
        X_test.append(X[i])
        y_test.append(y[i])
    
    return X_train, y_train, X_test, y_test

In [28]:
X_train, y_train, X_test, y_test = split_train_test(X_paths, Y, test_pct=0.5)

9998
[15460, 10377, 1494, 6048, 14801, 12374, 4360, 6435, 15024, 986]
9999


In [18]:
print(f'Training samples: {len(y_train)} || Testing samples: {len(y_test)}')

Training samples: 9998 || Testing samples: 12073
