In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize as opt
import sklearn.linear_model
import sklearn.model_selection

Load the dataset

In [3]:
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()
text, label = data['data'][0], data['target_names'][data['target'][0]]

print(list(data)) #see list of attributes in this class.
print('---')
print(list(data.target_names)) # list 20 class under this dataset.
print('---')
print(len(data['target'])) #no of rows = 11314

['data', 'filenames', 'target_names', 'target', 'DESCR']
---
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
---
11314


Use `HashingVectorizer` to encode the text into sparse features:

In [4]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=5000, binary=True)
features = vectorizer.fit_transform(data['data'])
targets = data['target']
newfeatures = features.todense()
print(targets) #todense return a matrix.

[7 4 4 ... 3 1 8]


In [16]:
# features have a dimension of 11314 samples * 5000 features
print(newfeatures.shape)
print(targets.shape)
print(type(newfeatures))
print(type(targets))

(11314, 5000)
(11314,)
<class 'numpy.matrix'>
<class 'numpy.ndarray'>


Use the K-Fold cross-validation to split the dataset into training and test parts:

In [12]:
Kfolddata = sklearn.model_selection.KFold(n_splits=10)

Experiment with different models (L1, L2, ...)

In [13]:
scores = []
classification_model = sklearn.linear_model.SGDClassifier(loss='log', penalty='l1')
for train_index, test_index in Kfolddata.split(newfeatures):
    print(newfeatures.shape, ' ' , targets.shape)
    X_train, X_test, y_train, y_test = newfeatures[train_index], newfeatures[test_index], targets[train_index], targets[test_index]
    classification_model.fit(X_train, y_train)
    scores.append(classification_model.score(X_test, y_test))

print(np.mean(scores)) 
#100features, 100 folds: 0.19037571805620246
#1000 features, 10 folds : 0.6530836715374306

(11314, 5000)   (11314,)


KeyboardInterrupt: 

In [21]:
#Simpler methods
classification_modelL1 = sklearn.linear_model.SGDClassifier(loss='squared_loss', penalty='l1', alpha=0.0001)
sklearn.model_selection.cross_val_score(classification_modelL1, newfeatures, targn_jobs=-1)
mean(avg_score)
#5000 features, 5 folds, 0.0001 alpha : 0.7206110055547025
#1000 features, 5 folds, 0.001 alpha : 0.2530836715374306
#1000 features, 5 folds, 0.00001 alpha :0.4664129796483858

In [66]:
from statistics import mean
classification_modelL2 = sklearn.linear_model.SGDClassifier(loss='log', penalty='l2', alpha=0.00001)
avg_score = sklearn.model_selection.cross_val_score(classification_modelL2, newfeatures, targets, cv=5)
mean(avg_score)
#5000 features, 5 folds, 0.0001 alpha : 0.8168636814194283
#1000 features, 5 folds, 0.00001 alpha : 0.7180491690998038
#5000 features, 5 folds, 0.00001 alpha : 0.8568143271238033

0.8568143271238033

In [67]:
from statistics import mean
classification_modelL2 = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', alpha=0.00001)
avg_score = sklearn.model_selection.cross_val_score(classification_modelL2, newfeatures, targets, cv=5)
mean(avg_score)
#5000 features, 5 folds, 0.0001 alpha : 0.801838400627009
#1000 features, 5 folds, 0.00001 alpha : 0.7206110055547025
#5000 features, 5 folds, 0.00001 alpha : 0.8538985478537797

0.8538985478537797

What model worked best?

L2 and elasticnet works similar, L1 is significantly weaker