In [26]:
from __future__ import print_function
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
import pickle
from nltk import word_tokenize
from sklearn import linear_model, neural_network

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import nbimporter
from frameworks import SelfLearningModel, SKTSVM
from methods import evaluate_and_plot

In [2]:
#Remove metadata 
remove = ('headers', 'footers', 'quotes') 
RANDOM_STATE = 10

# Load dataset
print("Loading 20 newsgroups dataset")
newsdata = fetch_20newsgroups(subset='all')
len(newsdata.data)

Loading 20 newsgroups dataset


18846

In [3]:
X_remaining, X_test, Y_remaining, Y_test = train_test_split(newsdata.data, newsdata.target, test_size=0.20, random_state=RANDOM_STATE)
sss = StratifiedShuffleSplit(n_splits=1, test_size= 0.625,random_state=RANDOM_STATE)
X_remaining=np.array(X_remaining)
Y_remaining=np.array(Y_remaining)
for train_index, test_index in sss.split(X_remaining,Y_remaining):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_dev = X_remaining[train_index], X_remaining[test_index]
    y_train, y_dev = Y_remaining[train_index], Y_remaining[test_index]
    
print('Data loaded')
print()
print('Training data documents:', len(X_train))
print('Development data documents:', len(X_dev))
print('Test data documents:', len(X_test))
print()
print('Total Newsgroups :', newsdata.target_names)

TRAIN: [ 4830  5487  1131 ...  6543 10054  9924] TEST: [ 2647  7876  4857 ... 13387  2034  8962]
Data loaded

Training data documents: 5653
Development data documents: 9423
Test data documents: 3770

Total Newsgroups : ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:

print ("Saving data")
np.save('data/train_data.npy', X_train)
np.save('data/dev_data.npy', X_dev)
np.save('data/test_data.npy', X_test)

np.save('data/train_label.npy', y_train)
np.save('data/dev_label.npy', y_dev)
np.save('data/test_label.npy', Y_test)

print ("Data saved!")

Saving data
Data saved!


In [4]:
y_minus=np.full(len(y_dev), -1, dtype=int)

X_concat=np.concatenate((X_train, X_dev), axis=None)
y_concat=np.concatenate((y_train, y_minus), axis=None)
y_true=np.concatenate((y_train, y_dev), axis=None)
print (X_concat.shape, y_concat.shape, y_true.shape)

(15076L,) (15076L,) (15076L,)


In [7]:
print ("Saving data")
np.save('data/X_concat.npy', X_concat)
np.save('data/y_concat.npy', y_concat)
np.save('data/y_true.npy',y_true)
print ("Data saved!")

Saving data
Data saved!


In [5]:
def Stem_tokenize(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

In [6]:
Vect = TfidfVectorizer(analyzer= 'word', tokenizer=Stem_tokenize,
                                stop_words=stopwords.words('english') + list(string.punctuation),
                                lowercase=True, strip_accents='ascii', ngram_range=(1,2),
                                min_df=5, max_df= 0.75)

In [17]:
data_train=Vect.fit_transform(X_concat)
classifier_NB = MultinomialNB(alpha=0.01)
classifier_NB.fit(data_train[:len(X_train)], y_true[:len(y_train)])


MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [18]:
print ("self-learning NB. score", classifier_NB.score(data_train[len(X_train):], y_dev))

self-learning NB. score 0.8805051469807916


In [25]:
basemodel = linear_model.SGDClassifier(loss='hinge', penalty='l1') # scikit logistic regression
basemodel.fit(data_train[:len(X_train)], y_true[:len(y_train)])
print ("supervised SVM. score", basemodel.score(data_train[len(X_train):], y_dev))

supervised SVM. score 0.8109943754642895


In [30]:
basemodel_NN = neural_network.MLPClassifier(hidden_layer_sizes=6) # scikit logistic regression
basemodel_NN.fit(data_train[:len(X_train)], y_true[:len(y_train)])
print ("supervised NN. score", basemodel_NN.score(data_train[len(X_train):], y_dev))

supervised NN. score 0.821182213732357


In [24]:
with open('model/NB_Vect.pkl', 'wb') as fid:
    pickle.dump(classifier_NB, fid)

In [32]:
X_transform =Vect.fit_transform(X_concat)
X=X_transform.toarray()
X.shape

(15076L, 100553L)

In [25]:
np.save('data/X_vect.npy',X)

In [None]:
ssmodel = SelfLearningModel(basemodel_NN)
ssmodel.fit(X, y_concat)
print ("Self-learning NB. score", ssmodel.score(X, y_true))

In [None]:
kernel = "rbf"
lbl =  "S3VM"
model = SKTSVM(kernel=kernel)
model.fit(X, y_concat)
evaluate_and_plot(model, X, y_concat, y_true, lbl, 2)