In [25]:
from __future__ import print_function
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
import pickle

import nbimporter
from frameworks import SelfLearningModel

In [77]:

# Sections of each post that count as metadata.
# NOTE(review): `remove` is defined here but is not passed to
# fetch_20newsgroups below, so nothing is stripped by this cell. Confirm
# whether `remove=remove` was intended; enabling it would presumably change
# every score recorded further down.
remove = ('headers', 'footers', 'quotes') 
# Seed shared by the splitting cell so the partitions are repeatable.
RANDOM_STATE = 10

# Load dataset (subset='all' requests the whole corpus rather than one subset).
print("Loading 20 newsgroups dataset")
newsdata = fetch_20newsgroups(subset='all')
# Last expression of the cell: displays the number of documents loaded.
len(newsdata.data)

Loading 20 newsgroups dataset


18846

In [78]:
# Hold out 20% of the corpus as the final test set.
X_remaining, X_test, Y_remaining, Y_test = train_test_split(
    newsdata.data,
    newsdata.target,
    test_size=0.20,
    random_state=RANDOM_STATE,
)

# Convert to ndarrays so the integer index arrays below can select documents.
X_remaining = np.array(X_remaining)
Y_remaining = np.array(Y_remaining)

# One stratified split of the remainder: 37.5% becomes the training set and
# 62.5% becomes the development set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.625, random_state=RANDOM_STATE)
train_index, test_index = next(sss.split(X_remaining, Y_remaining))
print("TRAIN:", train_index, "TEST:", test_index)
X_train, y_train = X_remaining[train_index], Y_remaining[train_index]
X_dev, y_dev = X_remaining[test_index], Y_remaining[test_index]

# Summary of the three partitions and the class names.
print('Data loaded')
print()
print('Training data documents:', len(X_train))
print('Development data documents:', len(X_dev))
print('Test data documents:', len(X_test))
print()
print('Total Newsgroups :', newsdata.target_names)

TRAIN: [ 4830  5487  1131 ...  6543 10054  9924] TEST: [ 2647  7876  4857 ... 13387  2034  8962]
Data loaded

Training data documents: 5653
Development data documents: 9423
Test data documents: 3770

Total Newsgroups : ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [80]:
# Build the semi-supervised inputs: the labeled training documents followed by
# the development documents, whose labels are hidden from the learner.

# Placeholder labels for the development rows; -1 is presumably the
# "unlabeled" marker expected by SelfLearningModel -- confirm against frameworks.
y_minus = np.full(len(y_dev), -1, dtype=int)

# axis=None flattens before joining, giving 1-D arrays of documents / labels.
X_concat = np.concatenate((X_train, X_dev), axis=None)
y_concat = np.concatenate((y_train, y_minus), axis=None)

# Ground-truth labels for every row of X_concat, in the same order.
# The development rows' true labels are y_dev; the previously referenced
# `y_unlabel` is not defined anywhere in this notebook and raised NameError
# on a fresh kernel.
y_true = np.concatenate((y_train, y_dev), axis=None)
print (X_concat.shape, y_concat.shape, y_true.shape)

(15076L,) (15076L,) (15076L,)


In [81]:
# Word-level bag-of-words vectorizer shared by the cells below. Each later
# fit_transform call on it re-learns (replaces) its vocabulary.
CV = CountVectorizer(analyzer= 'word')

In [82]:
# Supervised baseline: Multinomial Naive Bayes trained on the labeled training
# documents only and evaluated on the development documents.

# Learn the vocabulary from the training documents only.
data_train = CV.fit_transform(X_train)
# Reuse that vocabulary for the development documents. Calling fit_transform
# here would re-fit the vectorizer and produce a matrix whose columns do not
# correspond to the features the classifier is trained on.
data_dev = CV.transform(X_dev)

classifier_NB = MultinomialNB(alpha=0.01)
classifier_NB.fit(data_train, y_train)
print ("Supervised NB score", classifier_NB.score(data_dev, y_dev))

Supervised NB score 0.8449538363578478


In [84]:
# Re-fit the vectorizer on the labeled and unlabeled documents together; this
# replaces whatever vocabulary CV held before this cell.
X_transform = CV.fit_transform(X_concat)
# NOTE(review): toarray() materializes a dense matrix. With the shape recorded
# below (15076 x 138572) that is a very large allocation -- confirm whether
# SelfLearningModel can take the sparse matrix directly.
X=X_transform.toarray()
# Last expression of the cell: displays (n_documents, vocabulary_size).
X.shape

(15076L, 138572L)

In [85]:
# Self-training wrapper around the Naive Bayes classifier. It is fit on all
# rows of X, with y_concat holding real labels for the training rows and -1
# for the development rows.
ssmodel = SelfLearningModel(classifier_NB)
ssmodel.fit(X, y_concat)

# Score over every row of X. This includes the labeled training rows the model
# was fit on, so it is not directly comparable to the supervised baseline,
# which is scored on the development documents only.
print ("Self-learning NB. score", ssmodel.score(X, y_true))

# Comparable figure: score only the rows that were unlabeled during fitting.
# X was built from X_train followed by X_dev, so those rows start at
# len(X_train) and their true labels are y_dev.
n_labeled = len(X_train)
print ("Self-learning NB. score (unlabeled rows only)", ssmodel.score(X[n_labeled:], y_dev))

Self-learning NB. score 0.8735738922791191
