In [27]:
import os
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2, SelectKBest
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import math
import re
import enchant
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

In [11]:
def log_progress(sequence, every=None, size=None):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display
    
    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = size / 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)
    
    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [12]:
# vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
ch2 = SelectKBest(chi2, k=1000)
# categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
#               'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
#               'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
#               'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
#               'talk.politics.misc', 'talk.religion.misc']
initial_categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'comp.windows.x', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.electronics', 'soc.religion.christian', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
comp_class = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                   'comp.sys.mac.hardware', 'comp.windows.x', 'sci.electronics']
religion_class = ['alt.atheism', 'talk.religion.misc', 'soc.religion.christian']
sport_class = ['rec.sport.baseball', 'rec.sport.hockey']
auto_class = ['rec.autos', 'rec.motorcycles']
polit_class = ['talk.politics.mideast', 'talk.politics.misc']
target_classes_lbls = [religion_class, comp_class, sport_class, auto_class, polit_class]
eng_dict = enchant.Dict('en_US')

In [13]:
ng_train = fetch_20newsgroups(subset='train', categories=initial_categories,
                              remove=('headers', 'footers', 'quotes')
                             )
print ng_train.keys()

['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']


In [14]:
print ng_train['target_names']

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.electronics', 'soc.religion.christian', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [15]:
data_prep = []
for el in ng_train.data:
    filtered = re.findall(u'(?u)\\b\\w\\w+\\b', el)
    filtered = [filt for filt in filtered if eng_dict.check(filt)]
    el_prep = ' '.join(filtered)
    data_prep.append(el_prep)

In [16]:
# vectors_train = vectorizer.fit_transform(ng_train.data)
vectors_train = vectorizer.fit_transform(data_prep)
vectors_train

<8401x31912 sparse matrix of type '<type 'numpy.float64'>'
	with 446852 stored elements in Compressed Sparse Row format>

In [17]:
svd = TruncatedSVD(n_components=15, random_state=10)

In [18]:
X = vectors_train.copy()
y = ng_train['target'].copy()
for i_el, el in enumerate(y):
    cat_name = ng_train['target_names'][el]
    for i_c, cl in enumerate(target_classes_lbls):
        if cat_name in cl:
            y[i_el] = i_c
            break
lbls = ['religion', 'computers', 'sport', 'auto-moto', 'politics']

In [19]:
X_svd = svd.fit_transform(X, y)

In [21]:
X_svd.shape

(8401, 15)

In [31]:
phi = svd.components_
phi.shape

(15, 31912)

In [45]:
for i_t, topic in enumerate(phi, start=1):
    ind = np.argsort(topic)
    print 'Hiden category #{}: {}'.format(i_t, np.asarray(vectorizer.get_feature_names())[ind[-6:-1]][:])
#     break

Hiden category #1: [u'people' u'does' u'don' u'just' u'like']
Hiden category #2: [u'drive' u'dos' u'file' u'card' u'thanks']
Hiden category #3: [u'season' u'players' u'games' u'year' u'game']
Hiden category #4: [u'bus' u'controller' u'card' u'drives' u'scsi']
Hiden category #5: [u'info' u'advance' u'does' u'know' u'mail']
Hiden category #6: [u'games' u'windows' u'team' u'card' u'game']
Hiden category #7: [u'cards' u'bus' u'monitor' u'drivers' u'video']
Hiden category #8: [u'motif' u'like' u'just' u'use' u'car']
Hiden category #9: [u'server' u'armenian' u'motif' u'israel' u'scsi']
Hiden category #10: [u'new' u'armenian' u'good' u'jesus' u'god']
Hiden category #11: [u'mouse' u'problem' u'thanks' u'drive' u'windows']
Hiden category #12: [u'dos' u'mail' u'com' u'use' u'windows']
Hiden category #13: [u'armenian' u'window' u'know' u'does' u'car']
Hiden category #14: [u'use' u'jews' u'israeli' u'car' u'does']
Hiden category #15: [u'file' u'color' u'thanks' u'monitor' u'scsi']


In [23]:
rf = RandomForestClassifier(n_estimators=500, random_state=10)

In [28]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_svd, y, test_size=0.3, random_state=10
)

In [29]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [30]:
print np.mean(rf.predict(X_train) != y_train)
print np.mean(rf.predict(X_test) != y_test)

0.0181972789116
0.181277270924
