In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 
import re

In [2]:
from os import listdir
from os.path import isfile, join
import glob
import string
import numpy as np

Input data

In [3]:
path = '20_newsgroup/20news-bydate/'
trainPath = path+'20news-bydate-train'
testPath = path+'20news-bydate-test'

In [4]:
trainFolders = glob.glob(trainPath+'/*')
testFolders = glob.glob(testPath+'/*')

In [5]:
textClass = []
trainClassArticles = {}
testClassArticles = {}
for p in trainFolders:
    textClass.append(p.split('/')[-1])
    trainClassArticles[p.split('/')[-1]] = glob.glob(p+'/*')

for p in testFolders:
#    textClass.append(p.split('/')[-1])
    testClassArticles[p.split('/')[-1]] = glob.glob(p+'/*')
    

Word Preprocessing and Tokenization

In [6]:
# path = '20_newsgroup/20news-bydate/20news-bydate-train/talk.politics.mideast/76248'
# fin = open(path, encoding="latin1")
# text = fin.read()

In [7]:
# punctuations = (string.punctuation).replace("'", "") 
# text2 = text.replace('\n', '').replace('\t', '').lower()
# text2 = re.sub(r' \d+ ', 'NUM', text2)
# text2 = text2.translate(str.maketrans("", "", punctuations)).strip()

In [8]:
# tokenized_word = word_tokenize(text2)

In [9]:
# stop_words = stopwords.words('english')

In [10]:
# tokenized_word = [word for word in tokenized_word if not word in stop_words]

In [11]:
# stemmer= PorterStemmer()

In [12]:
# stemmed_word = [stemmer.stem(word) for word in tokenized_word]

In [13]:
# trainData, testData = [], []
# trainLabel, testLabel = [], []
trainData, testData = {}, {}
trainLabel, testLabel = [], []

In [14]:
punctuations = (string.punctuation).replace("'", "") 
stop_words = stopwords.words('english')
stemmer= PorterStemmer()
def wordPreprocess(articles, data, label):
    
    data['tokenized'] = []
    data['stop_word'] = []
    data['stemmed'] = []
    
    for k, v in articles.items():
        for path in v:

            doc = []
            with open(path, encoding="latin1") as fin:
                text = fin.read()
                ## change to lowercase
                lowered = text.replace('\n', ' ').replace('\t', ' ').lower()
                ## replace any numerical token with NUM
                numsub = re.sub(r' \d+ ', 'NUM', lowered)
                ## remove all punctuation except "'" because it may change the meaning
                nopunc = numsub.translate(str.maketrans("", "", punctuations)).strip()
                ## tokenize words
                tokenized_word = word_tokenize(nopunc)
                ## collect data for tokenized words
                data['tokenized'].append(' '.join(tokenized_word))
                ## remove all stop words
                stop_word = [word for word in tokenized_word if not word in stop_words]
                ## collect tokenized words after removing stop words
                data['stop_word'].append(' '.join(stop_word))
                ## word stemming
                stemmed_word = [stemmer.stem(word) for word in stop_word]
                ## collect tokenized words after removing stop words and stemming
                data['stemmed'].append(' '.join(stemmed_word))
                ##data.append(' '.join(stemmed_word))
                label.append(k)

In [15]:
wordPreprocess(trainClassArticles, trainData, trainLabel)

In [16]:
wordPreprocess(testClassArticles, testData, testLabel)

#### Training/Testing Data Feature Extraction

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

In [18]:
vectorizer = TfidfVectorizer(encoding='latin1', stop_words='english')
le = preprocessing.LabelEncoder()

In [19]:
x_train, x_test = {}, {}
y_train, y_test = {}, {}

In [20]:
x_train['tokenized'] = vectorizer.fit_transform(doc for doc in trainData['tokenized'])
x_test['tokenized'] = vectorizer.transform(doc for doc in testData['tokenized'])

x_train['stop_word'] = vectorizer.fit_transform(doc for doc in trainData['stop_word'])
x_test['stop_word'] = vectorizer.transform(doc for doc in testData['stop_word'])

x_train['stemmed'] = vectorizer.fit_transform(doc for doc in trainData['stemmed'])
x_test['stemmed'] = vectorizer.transform(doc for doc in testData['stemmed'])

In [21]:
y_train['tokenized'] = le.fit_transform(trainLabel)
y_test['tokenized'] = le.transform(testLabel)
y_train['stop_word'] = le.fit_transform(trainLabel)
y_test['stop_word'] = le.transform(testLabel)
y_train['stemmed'] = le.fit_transform(trainLabel)
y_test['stemmed'] = le.transform(testLabel)

In [22]:
# x_train = vectorizer.fit_transform(doc for doc in trainData)
# x_test = vectorizer.transform(doc for doc in testData)
# y_train = le.fit_transform(trainLabel)
# y_test = le.transform(testLabel)

#### train with tokenized words (without removing stop words and stemming)

Shuffle training data

In [23]:
from random import shuffle

In [24]:
def shuffleTrainData(x_train, y_train):
    shuffleArr = list(zip(x_train.toarray(), y_train))
    shuffle(shuffleArr)
    return zip(*shuffleArr)

In [25]:
x_train['tokenized'], y_train['tokenized'] = shuffleTrainData(x_train['tokenized'], y_train['tokenized'])
x_train['stop_word'], y_train['stop_word'] = shuffleTrainData(x_train['stop_word'], y_train['stop_word'])
x_train['stemmed'], y_train['stemmed'] = shuffleTrainData(x_train['stemmed'], y_train['stemmed'])
#x_train, y_train = shuffleTrainData(x_train, y_train)

Text Classification

In [26]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

Benchmarking with Ridge Classifier

Compare modeling performance on different training sets -- tokenized, removing stop-words, and after stemming

In [68]:
rc = RidgeClassifier()
rc2 = RidgeClassifier()
rc3 = RidgeClassifier()

In [69]:
rc.fit(x_train['tokenized'], y_train['tokenized'])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [70]:
rc.score(x_train['tokenized'], y_train['tokenized'])

0.9974368039596959

In [71]:
rc.score(x_test['tokenized'], y_test['tokenized'])

0.8505045140732873

In [72]:
rc2.fit(x_train['stop_word'], y_train['stop_word'])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [73]:
rc2.score(x_train['stop_word'], y_train['stop_word'])

0.9974368039596959

In [74]:
rc2.score(x_test['stop_word'], y_test['stop_word'])

0.8503717472118959

In [75]:
rc3.fit(x_train['stemmed'], y_train['stemmed'])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [76]:
rc3.score(x_train['stemmed'], y_train['stemmed'])

0.9973484178893406

In [77]:
rc3.score(x_test['stemmed'], y_test['stemmed'])

0.8522304832713755

There is not much difference by just removing stop-words, however, prediction on test set is slightly better after applying stemming. 

#### Try different model on 'stemmed' dataset

In [20]:
pc = Perceptron()

In [21]:
pc.fit(x_train['stemmed'], y_train['stemmed'])

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [22]:
pc.score(x_train['stemmed'], y_train['stemmed'])

0.9993812975075128

In [23]:
pc.score(x_test['stemmed'], y_test['stemmed'])

0.8198353690918747

In [20]:
mnb = MultinomialNB()

In [21]:
mnb.fit(x_train['stemmed'], y_train['stemmed'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
mnb.score(x_train['stemmed'], y_train['stemmed'])

0.9505921866713806

In [23]:
mnb.score(x_test['stemmed'], y_test['stemmed'])

0.8131970260223048

In [24]:
rfc = RandomForestClassifier()

In [25]:
rfc.fit(x_train['stemmed'], y_train['stemmed'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
rfc.score(x_train['stemmed'], y_train['stemmed'])

0.9976135761004066

In [27]:
rfc.score(x_test['stemmed'], y_test['stemmed'])

0.6107275624004248

In [29]:
svc = LinearSVC()

In [30]:
svc.fit(x_train['stemmed'], y_train['stemmed'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [31]:
svc.score(x_train['stemmed'], y_train['stemmed'])

0.9992045253668022

In [32]:
svc.score(x_test['stemmed'], y_test['stemmed'])

0.8506372809346787