In [1]:
import glob
import os
import string
from os import listdir
from os.path import isfile, join

## Input data

In [2]:
# Root of the "bydate" 20 Newsgroups download (trailing slash included on purpose,
# because the split names are appended by plain string concatenation).
path = '20_newsgroup/20news-bydate/'
# Train and test splits live in sibling sub-directories of the root.
trainPath, testPath = (path + split for split in ('20news-bydate-train', '20news-bydate-test'))

In [3]:
# List every entry directly under each split directory (one glob per split);
# each entry is later treated as one class folder.
trainFolders, testFolders = [glob.glob(root + '/*') for root in (trainPath, testPath)]

In [4]:
# textClass: class (newsgroup) names, taken from the training folder names.
# trainClassArticles / testClassArticles: class name -> list of article file paths.
textClass = []
trainClassArticles = {}
testClassArticles = {}

# os.path.basename is used instead of splitting on '/' so the class name is
# extracted correctly whatever separator glob returns (e.g. '\\' on Windows).
# glob gives no ordering guarantee, so folders and files are sorted to make
# the resulting lists reproducible between runs and machines.
for folder in sorted(trainFolders):
    className = os.path.basename(folder)
    textClass.append(className)
    trainClassArticles[className] = sorted(glob.glob(folder + '/*'))

# Class names are collected from the training split only; the test split just
# contributes its article paths.
for folder in sorted(testFolders):
    testClassArticles[os.path.basename(folder)] = sorted(glob.glob(folder + '/*'))
    

## Word Preprocessing and tokenization -- Example

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re

#### Read text

In [6]:
# Single sample article used to demonstrate the preprocessing steps below.
# Note: this rebinds `path`, which previously held the dataset root directory.
path = '20_newsgroup/20news-bydate/20news-bydate-train/talk.politics.mideast/76248'
# Use a context manager so the file handle is closed as soon as the text is read
# (the bare open() previously left it open for the lifetime of the kernel).
with open(path, encoding="latin1") as fin:
    text = fin.read()

#### 1. Convert text to lower case

In [7]:
text = text.lower()

#### 2. Replace numbers with the token NUM

In [8]:
# Replace every standalone (whitespace-delimited) run of digits with the token NUM.
# Lookarounds are used instead of matching literal spaces so that
#   * the surrounding whitespace is kept (previously "a 12 b" became "aNUMb"),
#   * numbers at the start/end of the text or next to a newline/tab are also caught,
#   * consecutive numbers ("1 2 3") are all replaced, since no separator is consumed.
text = re.sub(r'(?<!\S)\d+(?!\S)', 'NUM', text)

#### 3. Remove punctuation, newlines and tabs

In [9]:
punctuations = (string.punctuation).replace("'", "") ## Keep "'" in text as removing it may change the meaning of the word completely
# Delete every punctuation character except the apostrophe.
text = text.translate(str.maketrans("", "", punctuations))
# Turn newlines and tabs into spaces rather than deleting them: deleting them
# glued the last word of one line onto the first word of the next.
text = text.replace('\n', ' ').replace('\t', ' ').strip()

#### Word Tokenization

In [10]:
tokenized_word = word_tokenize(text)

#### Remove stop words

In [11]:
# English stop-word list from NLTK (kept as a list under its original name).
stop_words = stopwords.words('english')
# Membership is tested once per token, so look words up in a set (O(1))
# instead of scanning the list for every token.
stop_word_set = set(stop_words)
tokenized_word = [word for word in tokenized_word if word not in stop_word_set]

#### Word Stemming

In [12]:
# Reduce each remaining token to its stem with NLTK's Porter stemmer.
stemmer = PorterStemmer()
stemmed_word = list(map(stemmer.stem, tokenized_word))

## Prepare data for training/testing set, and encode document label

In [22]:
# Parallel containers filled by dataPreparation(): raw article texts and their
# class labels, kept separately for the train and test splits.
trainData = []
testData = []
trainLabel = []
testLabel = []

In [23]:
def dataPreparation(articles, data, label):
    """Read every article file and append its text and class to the output lists.

    Parameters
    ----------
    articles : dict
        Maps a class name to an iterable of file paths belonging to that class.
    data : list
        Output list; the full text of each file is appended to it.
    label : list
        Output list; the class name of each file is appended to it, in the
        same order as ``data``.

    Files are decoded as latin1. Nothing is returned: ``data`` and ``label``
    are extended in place.
    """
    for className in articles:
        for articlePath in articles[className]:
            with open(articlePath, encoding="latin1") as handle:
                content = handle.read()
            data.append(content)
            label.append(className)

In [24]:
dataPreparation(trainClassArticles, trainData, trainLabel)

In [25]:
dataPreparation(testClassArticles, testData, testLabel)

In [28]:
from sklearn import preprocessing
# Encode the string class labels (the newsgroup folder names collected by
# dataPreparation) as integers.
le = preprocessing.LabelEncoder()
# Learn the label -> integer mapping from the training labels and encode them.
encodedTrainLabel = le.fit_transform(trainLabel)
# Encode the test labels with the mapping fitted on the training labels above.
encodedTestLabel = le.transform(testLabel)

## Shuffle training data 

In [30]:
from random import shuffle

In [31]:
shuffleArr = list(zip(trainData, encodedTrainLabel))

In [32]:
shuffle(shuffleArr)

In [33]:
# Unzip the shuffled (text, label) pairs back into two parallel sequences.
# zip() yields tuples, so both names are rebound to tuples here.
# NOTE(review): trainLabel (the raw string labels) is not reordered by this
# cell; from here on only encodedTrainLabel is aligned with trainData.
trainData, encodedTrainLabel = zip(*shuffleArr)

## Model training

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [43]:
from sklearn.linear_model import RidgeClassifier, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
#from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [40]:
tfidfVectorizer = TfidfVectorizer(encoding='latin1')
countVectorizer = CountVectorizer(encoding='latin1')

In [44]:
lrc = RidgeClassifier()
svc = LinearSVC()
mnb = MultinomialNB()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

In [38]:
# Chain TF-IDF feature extraction with the ridge classifier so raw documents can
# be passed straight to fit()/predict().
# `vectorizer` is not defined until a later cell, so on a fresh kernel this cell
# raised NameError; use the TF-IDF vectorizer created above instead.
trial1 = Pipeline([('vectorizer', tfidfVectorizer), ('classifier', lrc)])

In [39]:
trial1.fit(trainData, encodedTrainLabel)



Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='latin1', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pat...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

Training/Testing Data Feature Extraction

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

In [13]:
vectorizer = TfidfVectorizer(encoding='latin1', stop_words='english')
le = preprocessing.LabelEncoder()

In [14]:
# Learn the TF-IDF vocabulary from the training documents and vectorise them.
x_train = vectorizer.fit_transform(trainData)

In [15]:
# Fit the (freshly created) encoder on the training class names.
le.fit(trainLabel)
# trainData was shuffled together with encodedTrainLabel earlier in the notebook,
# while trainLabel kept its original order. Encoding trainLabel directly would
# therefore pair each row of x_train with the wrong class. Recover the class
# names in the shuffled order from encodedTrainLabel and encode those instead,
# so y_train stays aligned with x_train. If no shuffle happened, the result is
# identical to le.fit_transform(trainLabel).
y_train = le.transform(le.inverse_transform(encodedTrainLabel))

In [16]:
# Vectorise the test documents with the vocabulary learned from the training set.
x_test = vectorizer.transform(testData)

In [17]:
y_test = le.transform(testLabel)

In [18]:
x_train.shape

(11314, 133065)

In [19]:
len(y_train)

11314

Shuffle training data

In [20]:
from random import shuffle

In [21]:
# Shuffle features and labels together with a single shared permutation.
# sklearn.utils.shuffle works directly on the sparse TF-IDF matrix, so there is
# no need to call x_train.toarray(): densifying the (11314, 133065) matrix needs
# roughly 12 GB of RAM, and zipping/unzipping it turned x_train into a tuple of
# per-row arrays. The import is aliased so it does not replace random.shuffle.
from sklearn.utils import shuffle as sklearn_shuffle

# Fixed seed so the row order (and therefore the scores below) is reproducible.
x_train, y_train = sklearn_shuffle(x_train, y_train, random_state=0)

Text Classification

In [24]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

Benchmarking classifiers (SGD, Perceptron, Passive-Aggressive, Multinomial Naive Bayes, Ridge, Random Forest, Linear SVC)

In [294]:
sgd = SGDClassifier()

In [296]:
sgd.fit(x_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [297]:
sgd.score(x_train, y_train)

0.9928407283012197

In [298]:
sgd.score(x_test, y_test)

0.8385554965480616

In [299]:
pc = Perceptron()

In [305]:
pc.fit(x_train, y_train)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [301]:
pc.score(x_train, y_train)

0.9994696835778681

In [302]:
pc.score(x_test, y_test)

0.8174455655868296

In [303]:
pac = PassiveAggressiveClassifier()

In [304]:
pac.fit(x_train, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [306]:
pac.score(x_train, y_train)

0.999734841788934

In [307]:
pac.score(x_test, y_test)

0.8385554965480616

In [308]:
mnb = MultinomialNB()

In [309]:
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [310]:
mnb.score(x_train, y_train)

0.9516528195156443

In [311]:
mnb.score(x_test, y_test)

0.8167817312798725

In [255]:
rc = RidgeClassifier()

In [280]:
rc.fit(x_train, y_train)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [281]:
rc.score(x_train, y_train)

0.9969948736079194

In [282]:
rc.score(x_test, y_test)

0.8450610727562401

In [283]:
from sklearn.ensemble import RandomForestClassifier

In [284]:
rfc = RandomForestClassifier()

In [285]:
rfc.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [286]:
rfc.score(x_train, y_train)

0.9981438925225384

In [287]:
rfc.score(x_test, y_test)

0.6367498672331386

In [288]:
from sklearn.svm import LinearSVC

In [289]:
svc = LinearSVC()

In [290]:
svc.fit(x_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [291]:
svc.score(x_train, y_train)

0.9992045253668022

In [292]:
svc.score(x_test, y_test)

0.8458576739245884