# 20 News Groups

Preprocessing: loading the dataset and vectorizing in word counts (using sickit learn).

In [2]:
import numpy as np
import scipy.io
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.utils.validation import check_array

## Direct use of fetch_20newsgroups_vectorized

The following code is based on the sickit learn test function: scikit-learn/benchmarks/bench_20newsgroups.py

See help page for the fetch_20newsgroups_vectorized function [here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups_vectorized.html?highlight=fetch_20newsgroups_vectorized).

In [3]:
data_train = fetch_20newsgroups_vectorized(subset="train")
data_test = fetch_20newsgroups_vectorized(subset="test")

In [4]:
#Check if non-empty 2D array containing only finite values.
X_train = check_array(data_train.data, dtype=np.float,
                      accept_sparse="csc")
X_test = check_array(data_test.data, dtype=np.float, accept_sparse="csr")

y_train = data_train.target
y_test = data_test.target


In [5]:
print("20 newsgroups")
print("=============")
print("X_train.shape = {0}".format(X_train.shape))
print("X_train.format = {0}".format(X_train.format))
print("X_train.dtype = {0}".format(X_train.dtype))
print("X_train density = {0}"
      "".format(X_train.nnz / np.product(X_train.shape)))
print("y_train {0}".format(y_train.shape))
print("X_test {0}".format(X_test.shape))
print("X_test.format = {0}".format(X_test.format))
print("X_test.dtype = {0}".format(X_test.dtype))
print("y_test {0}".format(y_test.shape))
#print()

20 newsgroups
X_train.shape = (11314, 130107)
X_train.format = csc
X_train.dtype = float64
X_train density = 0.001214353154362896
y_train (11314,)
X_test (7532, 130107)
X_test.format = csr
X_test.dtype = float64
y_test (7532,)


In [6]:
X_train.data

array([0.19050019, 0.05025189, 0.02277438, ..., 0.08111071, 0.03889549,
       0.05679618])

In [9]:
y_train[:30]

array([17,  7, 10, 10,  7,  0, 12, 15,  9,  0, 11,  1,  3,  5,  2,  1,  7,
       14,  1, 13,  1, 10, 10, 14,  3,  0, 10,  6, 19,  2])

#### Save to MATLAB

In [55]:
scipy.io.savemat('20newsgroups_train.mat', dict(A=X_train, y=y_train))

## From GAP conf paper (comp.graphics vs. talk.religion.misc TF-IDF)

The following code is based on the sickit learn function: scikit-learn/benchmarks/bench_20newsgroups.py

It implements the simulation set-up used in the Fercoq et al. 2015 paper "Mind the duality gap".

See section 4.3. pg. 8:
> "(...) dataset obtained with bag of words features extracted from the 20newsgroup dataset (comp.graphics vs. talk.religion.misc with TF-IDF removing English stop words and words occurring only once or more than 95% of the time). Text feature extraction was done using Scikit-Learn"

In [63]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer

### 2 classes

In [64]:
# 20 news groups
#def load_news():
data = datasets.fetch_20newsgroups(categories=['comp.graphics',
                                               'talk.religion.misc'])
vect = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english') #Vectorizer

X = vect.fit_transform(data.data)
X = X.astype(np.float)

y = data.target.astype(np.float)
y[y == 0] = -1.

#return X, y

# Leukemia
# def load_leukemia():
#     data = datasets.fetch_mldata('leukemia')
#     X = data.data
#     y = data.target
#     X = X.astype(float)
#     y = y.astype(float)
#     y /= linalg.norm(y)
#     return X, y


Dictionary X is (document vs. word).
Input vector y contains the class (+1 or -1 value)

In [47]:
print("20 newsgroups")
print("=============")
print("X.shape = {0}".format(X.shape))
print("X.format = {0}".format(X.format))
print("X.dtype = {0}".format(X.dtype))
print("X density = {0}"
      "".format(X.nnz / np.product(X.shape)))
print("y.shape {0}".format(y.shape))
print("y.dtype = {0}".format(y.dtype))

20 newsgroups
X.shape = (961, 10094)
X.format = csr
X.dtype = float64
X density = 0.009014019517266107
y.shape (961,)
y.dtype = float64


In [38]:
X.data

array([1, 1, 1, ..., 2, 1, 1])

#### Save to Matlab

In [65]:
scipy.io.savemat('20newsgroups_Tfidf.mat', dict(A=X, y=y, words=vect.get_feature_names()))

## COUNT DATA : My custom vectorization


### 2 classes, docs vs. words

In [123]:
from sklearn.feature_extraction.text import CountVectorizer

In [126]:
data = datasets.fetch_20newsgroups(categories=['comp.graphics',
                                               'talk.religion.misc'])
vect = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vect.fit_transform(data.data)
X = X.astype(np.float)

In [127]:
print("20 newsgroups")
print("=============")
print("X.shape = {0}".format(X.shape))
print("X.format = {0}".format(X.format))
print("X.dtype = {0}".format(X.dtype))
print("X density = {0}"
      "".format(X.nnz / np.product(X.shape)))
print("y {0}".format(y.shape))

20 newsgroups
X.shape = (961, 10094)
X.format = csr
X.dtype = float64
X density = 0.009014019517266107
y (961,)


In [74]:
print(X.toarray())
X.data
#print(vect.get_feature_names())

[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


array([1., 1., 1., ..., 2., 1., 1.])

### Save to Matlab

In [93]:
scipy.io.savemat('20newsgroups_Count_2classes.mat', dict(X=X, words=vect.get_feature_names()))

## My custom vectorization (less sparse)

### 2 classes, docs vs. words

In [88]:
data = datasets.fetch_20newsgroups(categories=['comp.graphics',
                                               'talk.religion.misc'])
vect = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
X = vect.fit_transform(data.data)
X = X.astype(np.float)

In [89]:
print("20 newsgroups")
print("=============")
print("X.shape = {0}".format(X.shape))
print("X.format = {0}".format(X.format))
print("X.dtype = {0}".format(X.dtype))
print("X density = {0}"
      "".format(X.nnz / np.product(X.shape)))
print("y {0}".format(y.shape))

20 newsgroups
X.shape = (961, 4140)
X.format = csr
X.dtype = float64
X density = 0.018039783438145652
y (961,)


#### Save to Matlab

In [None]:
scipy.io.savemat('20newsgroups_Count_2classes_reduced.mat', dict(X=X, words=vect.get_feature_names()))

### All classes, words vs. docs

> **THIS IS THE ONE THAT IS USED IN EXPERIMENTS!!!**

In [151]:
data = datasets.fetch_20newsgroups(subset="all") #(subset="all") to get test and train subsets
vect = CountVectorizer(max_df=0.95, min_df=0.054, stop_words='english') #min_df=0.076 for 100 words
X = vect.fit_transform(data.data)
X = X.astype(np.float)

In [152]:
print("20 newsgroups")
print("=============")
print("X.shape = {0}".format(X.shape))
print("X.format = {0}".format(X.format))
print("X.dtype = {0}".format(X.dtype))
print("X density = {0}"
      "".format(X.nnz / np.product(X.shape)))
print("y {0}".format(y.shape))

20 newsgroups
X.shape = (18846, 204)
X.format = csr
X.dtype = float64
X density = 0.10848455905762495
y (961,)


In [117]:
print(vect.get_feature_names())

['10', '12', '14', '15', '16', '1993', '20', 'actually', 'article', 'believe', 'best', 'better', 'bit', 'ca', 'case', 'com', 'come', 'computer', 'course', 'cs', 'david', 'day', 'did', 'didn', 'different', 'distribution', 'does', 'doesn', 'don', 'edu', 'end', 'fact', 'far', 'going', 'good', 'got', 'great', 'group', 'help', 'host', 'information', 'just', 'keywords', 'know', 'let', 'like', 'little', 'll', 'long', 'look', 'lot', 'mail', 'make', 'need', 'new', 'news', 'nntp', 'number', 'old', 'people', 'point', 'possible', 'post', 'posting', 'probably', 'problem', 'question', 'read', 'real', 'really', 'reply', 'right', 'said', 'say', 'says', 'science', 'software', 'state', 'sure', 'tell', 'thanks', 'thing', 'things', 'think', 'time', 'true', 'try', 'university', 'usa', 'use', 'used', 'using', 've', 'version', 'want', 'way', 'work', 'world', 'writes', 'year', 'years']


### Save to Matlab

In [144]:
scipy.io.savemat('20newsgroups_Count_100words.mat', dict(X=X, words=vect.get_feature_names()))