In [43]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import GridSearchCV

In [2]:
# Load every available post (subset='all') from the two newsgroups we want
# to tell apart. The first call downloads the corpus, so it may be slow.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [61]:
# Raw documents and their targets, taken straight from the fetched dataset.
X, y = newsgroups.data, newsgroups.target

# TF-IDF vectoriser with inverse-document-frequency weighting enabled.
tfidf = TfidfVectorizer(use_idf=True)

In [62]:
# Fit the vectoriser on the documents and turn them into a sparse
# TF-IDF matrix in a single pass.
data = tfidf.fit_transform(X)

# Column index -> term lookup, used later to name the heaviest SVM weights.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when it exists and fall back to the old
# method on older installations. The new method returns an ndarray of str
# instead of a list; integer indexing and str methods work on both.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_mapping = tfidf.get_feature_names_out()
else:
    feature_mapping = tfidf.get_feature_names()

print(data)

  (0, 12328)	0.01370453937738971
  (0, 3083)	0.18354582904899466
  (0, 16212)	0.13087071621284083
  (0, 26748)	0.3586414018510131
  (0, 10446)	0.0877895428145851
  (0, 5173)	0.14700744118874529
  (0, 25602)	0.11312702126571896
  (0, 9436)	0.1603221752648857
  (0, 6206)	0.18354582904899466
  (0, 24745)	0.01370453937738971
  (0, 21441)	0.031971150014803995
  (0, 15606)	0.17973817843104967
  (0, 22911)	0.10589006137266588
  (0, 24461)	0.08966035046275328
  (0, 20381)	0.0657737589879958
  (0, 19110)	0.01430815397323633
  (0, 8823)	0.08966035046275328
  (0, 9768)	0.09732962706472326
  (0, 16346)	0.01372756577212755
  (0, 1668)	0.053216416571889205
  (0, 14361)	0.04446778160006921
  (0, 4890)	0.021242159802601628
  (0, 1191)	0.10682888989503368
  (0, 12512)	0.059332575743481546
  (0, 6741)	0.09587314565925223
  :	:
  (1785, 8616)	0.0961857077737838
  (1785, 11782)	0.05563580903402357
  (1785, 10058)	0.0742115230560897
  (1785, 970)	0.04188477714101907
  (1785, 16405)	0.055955356753644464
  (

In [52]:
# Candidate values for the regularisation parameter C: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=241)

# Linear-kernel SVM; C is filled in by the grid search.
clf = SVC(kernel='linear', random_state=241)

# Pick the C with the best cross-validated accuracy.
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(data, newsgroups.target)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [63]:
# Take the estimator the grid search selected (the C with the best
# cross-validated accuracy). The repr of the fitted search shows refit=True.
svc = gs.best_estimator_

In [65]:
# Fit the selected SVC on the whole TF-IDF matrix and its targets.
# NOTE(review): the search repr shows refit=True, so best_estimator_ was
# presumably already fitted on this same data — confirm whether this extra
# fit is needed.
svc.fit(data, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [86]:
# svc.coef_ is read through its sparse .indices / .data attributes:
# .indices are taken as the feature (column) ids of the stored weights and
# .data as the weights themselves.
# NOTE(review): this presumes coef_ is a single-row sparse matrix — confirm.
coef = svc.coef_
stored_columns = np.array(coef.indices)
abs_weights = np.abs(np.array(coef.data))

# argsort is ascending, so the last ten positions are the ten weights with
# the largest absolute value; map them back to feature ids.
top10idx = stored_columns[abs_weights.argsort()[-10:]]

In [89]:
# Look up the term for each top-weighted feature, lower-case it, and order
# the terms alphabetically.
a = sorted(feature_mapping[idx].lower() for idx in top10idx)

# Emit the terms on one line, each followed by a comma (including the last).
print(''.join(word + ',' for word in a), end='')

atheism,atheists,bible,god,keith,moon,religion,sci,sky,space,