In [12]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [13]:
# Fetch both the train and test portions ('all') of the two newsgroups
# that this notebook tells apart.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

In [25]:
# Documents go into `corpus`; their class labels go into `cls`.
corpus = newsgroups.data
cls = newsgroups.target

In [26]:
# Peek at the first ten class labels.
cls[:10]

array([0, 0, 1, 1, 1, 1, 0, 1, 0, 1])

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [28]:
from sklearn.model_selection import KFold
import warnings
# Silence only deprecation-style chatter rather than every warning, so that
# other categories (e.g. UserWarning and its subclasses) still surface.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [29]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

In [30]:
# Build the TF-IDF feature matrix; the labels are used unchanged.
# NOTE(review): the vectorizer is fitted on the full corpus before the folds
# are split, so the IDF statistics also cover each held-out fold.
y = cls
X = vectorizer.fit_transform(corpus)

# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Shuffled 5-fold CV and a linear SVM, both with a fixed seed.
cv = KFold(random_state=241, shuffle=True, n_splits=5)
clf = SVC(random_state=241, kernel='linear')

# Search over C by accuracy; the fitted search object is the cell's output.
gs = GridSearchCV(estimator=clf, param_grid=grid, cv=cv, scoring='accuracy')
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=241,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [31]:
# Parameter setting selected by the grid search.
gs.best_params_

{'C': 1.0}

In [32]:
# Cross-validated accuracy achieved by the selected setting.
gs.best_score_

0.9932810750279956

In [34]:
# Train the final linear SVM on the whole data set using the C chosen by the
# grid search. Reading it from `gs.best_params_` (instead of hand-copying the
# value from a previous cell's output) keeps this cell correct if the data or
# the search grid ever changes.
svm_c = SVC(kernel='linear', random_state=241, **gs.best_params_)
svm_c.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=241,
  shrinking=True, tol=0.001, verbose=False)

In [35]:
# Learned weights of the linear SVM.
svm_c.coef_

<1x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 18404 stored elements in Compressed Sparse Row format>

In [49]:
# Spot-check the stored weights for feature columns 1000-1009.
print(svm_c.coef_[0, 1000:1010])

  (0, 0)	0.004134168873656052
  (0, 4)	-0.02046670419861847
  (0, 9)	-0.015531537656736176
  (0, 1)	-0.11088005179895934
  (0, 3)	-0.019641185437769923


In [52]:
# Look up the token behind feature column 10000.
# `get_feature_names` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out`; prefer the new accessor and fall back to the old
# one so the cell runs on either version.
(vectorizer.get_feature_names_out()
 if hasattr(vectorizer, 'get_feature_names_out')
 else vectorizer.get_feature_names())[10000]

'do'

In [53]:
# Column index -> token lookup table.
# `get_feature_names` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out`; prefer the new accessor and fall back to the old
# one. `list(...)` keeps the result a plain list on both code paths.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_mapping = list(vectorizer.get_feature_names_out())
else:
    feature_mapping = list(vectorizer.get_feature_names())

In [59]:
# Ten largest absolute weights, in ascending order.
np.sort(np.abs(svm_c.coef_.toarray()[0]))[-10:]

array([1.02930693, 1.09709365, 1.13061234, 1.13908084, 1.1801316 ,
       1.20161118, 1.24918001, 1.25468995, 1.9203794 , 2.66316479])

In [76]:
# First (and, per the 1xN shape shown earlier, only) weight row as a dense array.
svm_c.coef_.toarray()[0]

array([ 0.29258057, -0.12314757,  0.        , ...,  0.01972862,
        0.05831336, -0.00297347])

In [77]:
# Same weights via todense(); indexing row 0 of the result still gives a 2-D matrix.
svm_c.coef_.todense()[0]

matrix([[ 0.29258057, -0.12314757,  0.        , ...,  0.01972862,
          0.05831336, -0.00297347]])

In [79]:
# Column positions of the ten weights with the largest magnitude
# (ascending by magnitude, so the strongest feature comes last).
indexes = np.abs(svm_c.coef_.toarray()[0]).argsort()[-10:]

In [83]:
# Translate the selected column positions back into their tokens.
words = list(map(feature_mapping.__getitem__, indexes))

In [84]:
# Selected words in alphabetical order.
sorted(words)

['atheism',
 'atheists',
 'bible',
 'god',
 'keith',
 'moon',
 'religion',
 'sci',
 'sky',
 'space']

In [85]:
# Emit the alphabetically ordered words on one line, separated by spaces.
print(*sorted(words))

atheism atheists bible god keith moon religion sci sky space
