In [114]:
# Author: Matt Terry <matt.terry@gmail.com>
#
# License: BSD 3 clause
from __future__ import print_function

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
# import codecs
# import json

# def read_dataset(path):
#   with codecs.open(path, 'r', 'utf-8') as myFile:
#     content = myFile.read()
#   dataset = json.loads(content)
#   return dataset

# if __name__ == '__main__':
#   path = 'data/pizza_request_dataset.json'
#   dataset = read_dataset(path)
  
#   print 'The dataset contains %d samples.' %(len(dataset))
#   print 'Available attributes: ', sorted(dataset[0].keys())
#   print 'First post:'
#   print json.dumps(dataset[0], sort_keys=True, indent=2)

#   successes = [r['requester_received_pizza'] for r in dataset]
#   success_rate = 100.0 * sum(successes) / float(len(successes))
#   print 'The average success rate is: %.2f%%' %(success_rate)

In [117]:
import pandas as pd
import json
import codecs
import re
import random
import numpy as np

###load JSON file
file_name = 'data/pizza_request_dataset.json'
dataset = read_dataset(file_name)
df = pd.read_json(json.dumps(dataset, sort_keys=True, indent=2))
###create random 90-10 split
X = df
rows = random.sample(X.index, int(0.9*len(X)) + 1)
X_train = X.ix[rows]
X_test = X.drop(rows)
y_train = X_train.requester_received_pizza.astype(int)
y_test = X_test.requester_received_pizza.astype(int)

# print "Data loading and train-test splits = DONE!"
###

In [119]:
###Model 1 - a) n-grams

### build pipeline; fit train; predict on test
import nltk
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm

##vectorizer arguments blah!

tokenizer=None#word_tokenize
# stop_words=nltk.corpus.stopwords.words("english")#None
ngram_range=(1, 2)
lowercase=True
max_features=500
binary=False
dtype=np.float64

###create unigram vectorizer
uniVect = CountVectorizer(decode_error="ignore",
#                                tokenizer=tokenizer,
#                                stop_words=stop_words,
                               ngram_range=(1,1),
                               lowercase=lowercase,
                               binary=binary,
                               dtype=dtype,
                               max_features=max_features)

###create bigram vect
biVect = CountVectorizer(decode_error="ignore",
#                                tokenizer=tokenizer,
#                                stop_words=stop_words,
                               ngram_range=(2,2),
                               lowercase=lowercase,
                               binary=binary,
                               dtype=dtype,
                               max_features=max_features)

# load custom features and FeatureUnion with Vectorizer
features = []
features.append(('unigram', uniVect))
features.append(('bigram', biVect))
all_features = FeatureUnion(features)

linear_svc = svm.SVC(kernel='linear', probability=True)

###create pipeline
text_clf = Pipeline([
                     ('all', all_features),
#                      ('tfidf', TfidfTransformer()),
                    ('clf', linear_svc),
                    ])

# text_clf = Pipeline([
#                      ('vect', vectorizer),
# #                      ('tfidf', TfidfTransformer()),
#                     ('clf', linear_svc),
#                     ])

###fit training data
text_clf = text_clf.fit(X_train.request_text_edit_aware, y_train)

### predict on test data
predicted = text_clf.predict(X_test.request_text_edit_aware)
np.mean(predicted == y_test)
# print "NO!"
probas = text_clf.predict_proba(X_test.request_text_edit_aware)
# print "YO!"
###get performance metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
accuracy = accuracy_score(y_test,predicted)
#         accuracy = np.mean(y_CVtest == predicted)
precision, recall, fscore, sup = precision_recall_fscore_support(y_test, predicted, average='binary', pos_label=0)
print precision, recall, fscore, accuracy# np.mean(predicted == y_test)
from sklearn.metrics import roc_auc_score
# roc_auc = roc_auc_score(y_test, predicted)
roc_auc = roc_auc_score(y_test, probas[:,1])
print "ROC = ", roc_auc

SyntaxError: invalid syntax (<ipython-input-119-9ad50ecf7942>, line 74)

ROC =  0.548414105595


In [None]:
# Activity and Reputation features

###Model 1 - a) n-grams

### build pipeline; fit train; predict on test
import nltk
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC


###create pipeline
text_clf = Pipeline([
                     ('vect', vectorizer),
                    ('clf', LinearSVC(C=0.1)),
                    ])

###fit training data
text_clf = text_clf.fit(X_train.request_text_edit_aware, y_train)

### predict on test data
predicted = text_clf.predict(X_test.request_text_edit_aware)
np.mean(predicted == y_test)

In [47]:
from sklearn.base import BaseEstimator, TransformerMixin

class ActivityExtractor(BaseEstimator, TransformerMixin):

    def __init__(self, vars):
        self.vars = vars  # e.g. pass in a column name to extract

    def transform(self, X, y=None):
#         return do_something_to(X, self.vars)  # where the actual feature extraction happens
        return X[self.vars]
    def fit(self, X, y=None):
        return self  # generally does nothing

# class SampleExtractor(BaseEstimator, TransformerMixin):

#     def __init__(self, vars):
#         self.vars = vars  # e.g. pass in a column name to extract

#     def transform(self, X, y=None):
#         return do_something_to(X, self.vars)  # where the actual feature extraction happens

#     def fit(self, X, y=None):
#         return self  # generally does nothing

from sklearn.pipeline import Pipeline, FeatureUnion
pipeline = Pipeline([
    ('feats', FeatureUnion([
        ('act', ActivityExtractor("requester_subreddits_at_request")), # can pass in either a pipeline
#         ('rep', ReputationExtractor()) # or a transformer
    ])),
    ('clf', LinearSVC(C=0.1))  # classifier
])

###fit training data
text_clf = text_clf.fit(X_train, y_train)

### predict on test data
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

ValueError: Found arrays with inconsistent numbers of samples: [  33 5104]

In [45]:
pipeline

Pipeline(steps=[('feats', FeatureUnion(n_jobs=1,
       transformer_list=[('act', ActivityExtractor(vars='requester_subreddits_at_request'))],
       transformer_weights=None)), ('clf', LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [112]:
# Author: Matt Terry <matt.terry@gmail.com>
#
# License: BSD 3 clause
from __future__ import print_function

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
    
# ('selector', ItemSelector(key='subject'))

from sklearn.pipeline import Pipeline, FeatureUnion
# pipeline = Pipeline([
#     ('feats', FeatureUnion([
#         ('downvotes', ItemSelector("number_of_downvotes_of_request_at_retrieval")), # can pass in either a pipeline
#         ('upvotes', ItemSelector("number_of_upvotes_of_request_at_retrieval")) # or a transformer
#     ])),
#     ('clf', LinearSVC(C=0.1))  # classifier
# ])
pipeline = Pipeline([
    ('downvotes', ItemSelector("number_of_downvotes_of_request_at_retrieval")),
    ('clf', LinearSVC(C=0.1)),  # classifier
])

# post_was_edited
# requester_account_age_in_days_at_request

###fit training data
pipeline.fit(X_train, y_train)

### predict on test data
predicted = pipeline.predict(X_test)
np.mean(predicted == y_test)



ValueError: Found arrays with inconsistent numbers of samples: [   1 5105]

In [89]:
ds = ItemSelector(key='number_of_downvotes_of_request_at_retrieval')
ds.transform(data)

TypeError: list indices must be integers, not str

In [94]:
y_train = pd.Da

2596    0
5266    0
4544    0
5061    0
1578    1
410     0
644     0
2779    0
3695    0
2618    0
1949    0
4684    0
2855    0
1735    1
417     0
...
156     1
5277    0
5044    0
1742    1
2203    0
4528    0
3503    0
2696    0
1633    1
3188    0
3902    0
4683    0
3609    0
3699    0
1676    1
Name: requester_received_pizza, Length: 5104, dtype: int64

In [111]:
X_train

Unnamed: 0,giver_username_if_known,in_test_set,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,requester_account_age_in_days_at_retrieval,requester_days_since_first_post_on_raop_at_request,requester_days_since_first_post_on_raop_at_retrieval,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,requester_number_of_comments_in_raop_at_request,requester_number_of_comments_in_raop_at_retrieval,requester_number_of_posts_at_request,requester_number_of_posts_at_retrieval,Unnamed: 21
2596.0,,False,0,7,0,t3_jf027,9,A pizza and hopefully a story about your life ...,A pizza and hopefully a story about your life ...,"[Request] Hi RAOP, I do not have a financial c...",143.496586,991.951250,64.735069,913.189734,33,39,3,6,2,4,...
5266.0,,True,1,5,1380483094,t3_1nbrku,5,I wasn't sure if this was considered a contest...,I wasn't sure if this was considered a contest...,[Request] (USA) Thought I'd go about this a bi...,479.848322,548.397627,280.757454,349.306759,969,990,4,5,20,22,...
4544.0,,True,7,12,0,t3_11s3p3,0,"I just got home from working a 15 hour shift, ...","I just got home from working a 15 hour shift, ...",[Request] Yuba City California,210.859850,623.121875,0.000000,412.220359,107,112,0,0,18,22,...
5061.0,,True,1,1,0,t3_iypmh,0,"I am currently at work, surrounded by people w...","I am currently at work, surrounded by people w...",[REQUEST] Hardworking family could use a pizza...,0.000000,865.303681,0.000000,865.303681,0,0,0,0,0,1,...
1578.0,,False,3,6,0,t3_s71nl,3,Biochemistry is a hell of a major.,Biochemistry is a hell of a major.,[REQUEST] 2 broke university students in Ottaw...,25.329248,627.684803,0.000000,602.313889,3,79,0,7,0,8,...
410.0,,False,0,4,0,t3_k7vh5,4,"Before making this post, I did read through se...","Before making this post, I did read through se...",[Request] Pepperoni pizza (GA),2.979225,823.536007,0.000000,820.515116,1,64,0,24,0,1,...
644.0,,False,1,2,0,t3_1mjf8o,0,Hope I get a response!,Hope I get a response!,[Request] Came home from work tired and discov...,518.985995,599.312650,0.000000,80.284988,69,69,0,0,9,10,...
2779.0,,False,7,5,0,t3_1nehkl,3,It's not about the food. I have dinner already...,It's not about the food. I have dinner already...,"[REQUEST] USA Illinois, just a kid who wants t...",315.675752,383.070220,0.000000,67.061134,26,27,0,0,50,54,...
3695.0,,False,3,4,0,t3_1996pd,0,Hey there guys I was just flicking through ran...,Hey there guys I was just flicking through ran...,"[REQUEST] Starving student from Dunedin, NZ!",366.230394,649.081227,0.000000,282.850833,889,995,0,0,257,307,...
2618.0,,False,1,9,0,t3_opnwu,0,My girlfriend and I live together and usually ...,My girlfriend and I live together and usually ...,(Request) Broke College Student down on luck. ...,325.177396,1010.455278,0.000000,685.277882,91,241,0,0,4,25,...
