In [1]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
# Two-step pipeline: PCA for dimensionality reduction, then an SVC classifier.
estimators = [
    ('reduce_dim', PCA()),
    ('clf', SVC()),
]
pipe = Pipeline(steps=estimators)

In [2]:
# Show the repr of the pipeline built above (its steps and their parameters).
pipe

Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [6]:
# Display `pipe` again; the object has not been modified since it was created.
pipe 


Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Binarizer
# make_pipeline names the steps automatically (see the 'binarizer' and
# 'multinomialnb' step names in the output below).
make_pipeline(Binarizer(), MultinomialNB()) 

Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [5]:
# Display the PCA + SVC pipeline `pipe` once more.
pipe

Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [7]:
# Author: Matt Terry <matt.terry@gmail.com>
#
# License: BSD 3 clause
from __future__ import print_function

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick one field out of feature-grouped data by key.

    Input data is indexed by feature first and by sample second, so that

    >> len(data[key]) == n_samples

    This is the reverse of the usual scikit-learn feature-matrix layout, in
    which the first index runs over samples.

    The only requirement on the container is support for ``data[key]``
    lookup; a dict of lists, a 2D numpy array, a Pandas DataFrame or a numpy
    record array all qualify.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    Sample-grouped data (e.g. a list of dicts) is not supported; for that
    layout consider a transformer along the lines of
    `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        Key whose value ``transform`` looks up in the data it is given.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected


class TextStats(BaseEstimator, TransformerMixin):
    """Summarise each document as a small dict of statistics.

    The result is a list of dicts (one per document), the input format
    DictVectorizer works with.
    """

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, posts):
        """Return one ``{'length', 'num_sentences'}`` dict per item of ``posts``.

        ``length`` is ``len(text)`` and ``num_sentences`` is the number of
        ``'.'`` characters in the text.
        """
        stats = []
        for doc in posts:
            stats.append({'length': len(doc),
                          'num_sentences': doc.count('.')})
        return stats


class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Split usenet posts into subject and body in one pass over the input.

    ``transform`` takes a sequence of raw post strings and returns a record
    array with two object fields, `subject` and `body`, holding one entry
    per post.
    """
    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, posts):
        """Build the ``subject`` / ``body`` record array for ``posts``."""
        prefix = 'Subject:'
        n_posts = len(posts)
        features = np.recarray(shape=(n_posts,),
                               dtype=[('subject', object), ('body', object)])

        for idx, raw in enumerate(posts):
            # Everything before the first blank line is treated as headers,
            # everything after it as the body.
            header_block, _, body = raw.partition('\n\n')

            # Pass the body through the two newsgroup clean-up helpers
            # (footer first, then quoting).
            features['body'][idx] = strip_newsgroup_quoting(
                strip_newsgroup_footer(body))

            # Subject is the remainder of the first header line that starts
            # with "Subject:"; empty string when there is no such line.
            features['subject'][idx] = next(
                (hdr[len(prefix):] for hdr in header_block.split('\n')
                 if hdr.startswith(prefix)),
                '')

        return features


pipeline = Pipeline(steps=[
    # Step 1: split each raw post into its subject and body fields
    ('subjectbody', SubjectBodyExtractor()),

    # Step 2: compute three feature blocks and combine them with FeatureUnion
    ('union', FeatureUnion(
        transformer_list=[

            # Block A: tf-idf of the subject line (terms with min_df=50)
            ('subject', Pipeline(steps=[
                ('selector', ItemSelector(key='subject')),
                ('tfidf', TfidfVectorizer(min_df=50)),
            ])),

            # Block B: tf-idf bag-of-words of the body, reduced to 50
            # components with TruncatedSVD
            ('body_bow', Pipeline(steps=[
                ('selector', ItemSelector(key='body')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ])),

            # Block C: ad hoc statistics of the body
            ('body_stats', Pipeline(steps=[
                ('selector', ItemSelector(key='body')),
                ('stats', TextStats()),  # one dict of stats per post
                ('vect', DictVectorizer()),  # dicts -> feature matrix
            ])),

        ],

        # per-block weights applied by FeatureUnion
        transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
        },
    )),

    # Step 3: SVC with a linear kernel on the combined features
    ('svc', SVC(kernel='linear')),
])

# Restrict the dataset to two categories so the example runs faster.
categories = ['alt.atheism', 'talk.religion.misc']
train = fetch_20newsgroups(random_state=1,
                           subset='train',
                           categories=categories,
                           )
test = fetch_20newsgroups(random_state=1,
                          subset='test',
                          categories=categories,
                          )

# Fit on the training posts, then predict labels for the held-out test posts.
pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)

# classification_report takes (y_true, y_pred): ground truth goes first.
# Passing the predictions first would swap precision and recall per class.
print(classification_report(test.target, y))

KeyboardInterrupt: 

In [8]:
import pandas as pd

In [10]:
# The place number appears twice in the path: as the directory name and as the
# file-name prefix.
placeNum = str(786)
statistical_picklePath = "".join(
    ["../code/data/", placeNum, "/", placeNum, "_mpframe3.p"])
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
mpframe3 = pd.read_pickle(statistical_picklePath)

In [11]:
# Peek at the first two rows of the loaded frame.
mpframe3.head(2)

Unnamed: 0,date_device_id,logs,traj,ts,dwell_time,hour_start,time_start,ts_end,hour_end,time_end,...,18,19,20,21,22,23,24,25,26,27
0,16675_017699a4395352e941f6ed271f5fd1cd,"[4547961, 4546624, 4546613, 4546612]","[out, out, in, 1f]","[1440741504, 1440747786, 1440747820, 1440747820]","[0, 34, 0, 0]","[14, 16, 16, 16]","[14:58:24, 16:43:06, 16:43:40, 16:43:40]","[1440741504, 1440747820, 1440747820, 1440747820]","[14, 16, 16, 16]","[14:58:24, 16:43:40, 16:43:40, 16:43:40]",...,,inf,,0,0,0,0,1,0,0
1,16675_02614c7588f7f8eaa0d3b9047ac08410,"[4545933, 4545741, 4545737, 4545720, 4545716, ...","[out, in, 1f, 1f-right, 2f, 2f-left, 2f-right,...","[1440750511, 1440751250, 1440751260, 144075126...","[857, 118, 42, 108, 27, 20, 8, 0, 0]","[17, 17, 17, 17, 17, 17, 17, 17, 17]","[17:28:31, 17:40:50, 17:41:00, 17:41:00, 17:41...","[1440751368, 1440751368, 1440751302, 144075136...","[17, 17, 17, 17, 17, 17, 17, 17, 17]","[17:42:48, 17:42:48, 17:41:42, 17:42:48, 17:42...",...,0.0,0.0,0.0,0,0,0,0,1,0,0


In [12]:
# Inspect a random 10% of the rows. The seed is fixed so that re-running the
# cell displays the same sample.
mpframe3.sample(frac=0.1, random_state=0)

Unnamed: 0,date_device_id,logs,traj,ts,dwell_time,hour_start,time_start,ts_end,hour_end,time_end,...,18,19,20,21,22,23,24,25,26,27
90597,17051_04231782fdb61c01818b330fb604b81c,"[366875, 366775, 366711]","[out, in, 1f, 1f-right]","[1473245260, 1473245629, 1473245926, 1473245926]","[983, 297, 0, 0]","[19, 19, 19, 19]","[19:47:40, 19:53:49, 19:58:46, 19:58:46]","[1473246243, 1473245926, 1473245926, 1473245926]","[20, 19, 19, 19]","[20:04:03, 19:58:46, 19:58:46, 19:58:46]",...,,inf,0.0,0,0,1,0,0,0,0
2353,16685_19f9058380240eeaa6dc8a65d7260017,"[4449649, 4449579]","[out, in]","[1441616649, 1441616842]","[195, 0]","[18, 18]","[18:04:09, 18:07:22]","[1441616844, 1441616842]","[18, 18]","[18:07:24, 18:07:22]",...,inf,inf,,1,0,0,0,0,0,0
11600,16723_3927a8c16dc280d85ca73058e4cc3860,"[4068437, 4068436, 4068435]","[out, in, 1f]","[1444898495, 1444898495, 1444898495]","[44, 0, 0]","[17, 17, 17]","[17:41:35, 17:41:35, 17:41:35]","[1444898539, 1444898495, 1444898495]","[17, 17, 17]","[17:42:19, 17:41:35, 17:41:35]",...,,inf,,0,0,0,1,0,0,0
37379,16826_2bc80935bb4e77abe03738546ef12471,"[2909012, 2909005, 2908988]","[out, in, 1f]","[1453797228, 1453797234, 1453797255]","[47, 20, 0]","[17, 17, 17]","[17:33:48, 17:33:54, 17:34:15]","[1453797275, 1453797254, 1453797255]","[17, 17, 17]","[17:34:35, 17:34:14, 17:34:15]",...,,inf,0.0,0,1,0,0,0,0,0
46014,16864_e986782efd38316b298f34a724f5c214,"[2488573, 2488569, 2488568, 2488557, 2488531, ...","[out, in, 1f, 1f-right, 2f, 2f-inner, 2f-right...","[1457079267, 1457079282, 1457079282, 145707928...","[471, 383, 0, 0, 257, 0, 166, 0]","[17, 17, 17, 17, 17, 17, 17, 17]","[17:14:27, 17:14:42, 17:14:42, 17:14:42, 17:15...","[1457079738, 1457079665, 1457079282, 145707928...","[17, 17, 17, 17, 17, 17, 17, 17]","[17:22:18, 17:21:05, 17:14:42, 17:14:42, 17:19...",...,,0.000000,0.0,0,0,0,0,1,0,0
37457,16826_5628418990fc38984567bfbe29456c39,"[2908939, 2908897, 2908896]","[out, in, 1f]","[1453797470, 1453797645, 1453797645]","[189, 0, 0]","[17, 17, 17]","[17:37:50, 17:40:45, 17:40:45]","[1453797659, 1453797645, 1453797645]","[17, 17, 17]","[17:40:59, 17:40:45, 17:40:45]",...,,inf,,0,1,0,0,0,0,0
87302,17036_b0798af2d227e294fe5962d84334e788,"[536076, 535967, 535966]","[out, in, 1f, 1f-left]","[1471937493, 1471937815, 1471937815, 1471937815]","[384, 0, 0, 0]","[16, 16, 16, 16]","[16:31:33, 16:36:55, 16:36:55, 16:36:55]","[1471937877, 1471937815, 1471937815, 1471937815]","[16, 16, 16, 16]","[16:37:57, 16:36:55, 16:36:55, 16:36:55]",...,,inf,,0,1,0,0,0,0,0
93151,17063_63103e8b5dad6c070b9c077a50f8c50c,"[248980, 248961, 248960, 248958]","[out, in, 2f, 2f-left, 1f-right]","[1474280235, 1474280311, 1474280311, 147428031...","[103, 4, 0, 0, 0]","[19, 19, 19, 19, 19]","[19:17:15, 19:18:31, 19:18:31, 19:18:31, 19:18...","[1474280338, 1474280315, 1474280311, 147428031...","[19, 19, 19, 19, 19]","[19:18:58, 19:18:35, 19:18:31, 19:18:31, 19:18...",...,inf,,0.0,1,0,0,0,0,0,0
96361,17075_8e81148ffd17545cea5b9d6074cfb136,"[108329, 108266, 108265, 106878]","[out, in, 1f, out]","[1475310250, 1475310468, 1475310468, 1475315267]","[219, 0, 0, 0]","[17, 17, 17, 18]","[17:24:10, 17:27:48, 17:27:48, 18:47:47]","[1475310469, 1475310468, 1475310468, 1475315267]","[17, 17, 17, 18]","[17:27:49, 17:27:48, 17:27:48, 18:47:47]",...,,inf,,0,0,0,0,0,1,0
60914,16922_c720239dc1557e5366874bf022b7dc98,"[1809379, 1809320, 1809319]","[out, in, 1f]","[1462080382, 1462080608, 1462080608]","[238, 12, 12]","[14, 14, 14]","[14:26:22, 14:30:08, 14:30:08]","[1462080620, 1462080620, 1462080620]","[14, 14, 14]","[14:30:20, 14:30:20, 14:30:20]",...,0.000000,inf,0.0,0,0,0,0,0,0,1


In [43]:
# NOTE(review): in the visible notebook `kf` and `X` are only assigned in the
# following cell (In [41]), so on a fresh top-to-bottom run this cell would
# fail unless they are defined earlier -- confirm and reorder.
# The call only produces a generator (see the output below).
kf.split(X, X['revisit_intention'])

<generator object _BaseKFold.split at 0x119d81678>

In [41]:
from sklearn.model_selection import KFold, StratifiedKFold
# Stratified 10-fold split on the `revisit_intention` column.
kf = StratifiedKFold(n_splits=10)
X = mpframe3

# split() yields positional row indices, so rows must be selected with .iloc.
# .ix (used previously) looked integers up by label on an integer-typed index
# and has since been removed from pandas.
for train_index, test_index in kf.split(X, X['revisit_intention']):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]

    # Print the class counts of the train side, then of the test side.
    print(X_train['revisit_intention'].value_counts())

    print(X_test['revisit_intention'].value_counts())
    
    

0.0    57795
1.0    31298
Name: revisit_intention, dtype: int64
0.0    6422
1.0    3478
Name: revisit_intention, dtype: int64
0.0    57795
1.0    31298
Name: revisit_intention, dtype: int64
0.0    6422
1.0    3478
Name: revisit_intention, dtype: int64
0.0    57795
1.0    31298
Name: revisit_intention, dtype: int64
0.0    6422
1.0    3478
Name: revisit_intention, dtype: int64
0.0    57795
1.0    31298
Name: revisit_intention, dtype: int64
0.0    6422
1.0    3478
Name: revisit_intention, dtype: int64
0.0    57795
1.0    31298
Name: revisit_intention, dtype: int64
0.0    6422
1.0    3478
Name: revisit_intention, dtype: int64
0.0    57795
1.0    31298
Name: revisit_intention, dtype: int64
0.0    6422
1.0    3478
Name: revisit_intention, dtype: int64
0.0    57795
1.0    31299
Name: revisit_intention, dtype: int64
0.0    6422
1.0    3477
Name: revisit_intention, dtype: int64
0.0    57796
1.0    31299
Name: revisit_intention, dtype: int64
0.0    6421
1.0    3477
Name: revisit_intention, dtype