In [2]:
from sklearn import datasets
import sklearn.svm
import sklearn.model_selection
import numpy as np
import sklearn.feature_extraction

In [3]:
# Load every post (subset='all') from the two newsgroups being compared.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [4]:
# Class labels come straight from the dataset; the raw texts are turned into
# a TF-IDF matrix by a vectorizer fitted on all loaded documents.
target = newsgroups.target
transformer = sklearn.feature_extraction.text.TfidfVectorizer()
features = transformer.fit_transform(newsgroups.data)

In [5]:
# Select the regularisation strength C for a linear SVM by grid search:
# candidates are 10^-5 ... 10^5, scored by accuracy on a shuffled 5-fold split.
grid = {'C': 10.0 ** np.arange(-5, 6)}
fold = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=241)
clf = sklearn.svm.SVC(kernel='linear', random_state=241)
gs = sklearn.model_selection.GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=fold,
)
gs.fit(features, target)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [23]:
# Pick the candidate C whose mean cross-validated test score is highest.
best_index = np.argmax(gs.cv_results_['mean_test_score'])
C = gs.cv_results_['param_C'][best_index]
C

1.0

In [22]:
# Train the final linear SVM on the entire data set using the C selected by
# the grid search. The value is read from the fitted search object rather than
# hard-coded, so the model stays correct if the grid, data or seed changes and
# does not depend on which cell was executed first.
clf = sklearn.svm.SVC(C=gs.best_params_['C'], kernel='linear', random_state=241)
clf.fit(features, target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [125]:
# Absolute values of the trained SVM's weights: densify the coefficient
# matrix and keep its first (only used) row.
coefs = np.abs(clf.coef_.toarray()[0])

In [126]:
# Find the indices of the ten largest coefficients, largest first.
# A stable sort on the negated values keeps the lowest index first among ties
# (the same order repeated argmax would give) and leaves `coefs` untouched,
# so this cell can be re-run without changing its result.
indexes = np.argsort(-coefs, kind='stable')[:10].tolist()

In [127]:
# extract words from indexes
voc = transformer.get_feature_names()
vip_words = []
for index in indexes:
    vip_words.append(voc[index])
vip_words.sort()
# answer for question
print(vip_words)

['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion', 'sci', 'sky', 'space']
