In [1]:
import os
import json
from csv import DictReader, DictWriter

import numpy as np
from numpy import array

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import accuracy_score

# Seed passed as random_state to train_test_split so the train/test split is repeatable.
SEED = 5

In [74]:
'''
The ItemSelector class was created by Matt Terry to help with using
Feature Unions on Heterogeneous Data Sources

All credit goes to Matt Terry for the ItemSelector class below

For more information:
http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
'''
class ItemSelector(BaseEstimator, TransformerMixin):
    """Select one entry of a dict-like input by key.

    Intended as the first step of a pipeline so that the following steps
    only receive the value stored under ``key``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Print a trace message and return ``self``; nothing is learned."""
        print("SelectorFit")
        return self

    def transform(self, data_dict):
        """Print the keys of ``data_dict`` and return ``data_dict[self.key]``."""
        print(data_dict.keys())
        selected = data_dict[self.key]
        return selected



In [75]:

"""
This is an example of a custom feature transformer. The constructor is used
to store the state (e.g., if you need to store certain words/vocab), the
fit method is used to update the state based on the training data, and the
transform method is used to transform the data into the new feature(s). In
this example, we simply use the length of the movie review as a feature. This
requires no state, so the constructor and fit method do nothing.
"""
class TextLengthTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that emits the length of each example.

    The output is a single-column float matrix whose i-th row holds
    ``len(examples[i])``.
    """

    def __init__(self):
        pass

    def fit(self, examples, y=None):
        """Print a trace line and return ``self``; nothing is learned.

        Parameters
        ----------
        examples : sequence
            Training examples; only the first one is inspected, for the trace.
        y : ignored
            Accepted so that callers which pass labels (``fit(X, y)``) do not
            fail with a TypeError.
        """
        # Guard the trace so an empty training set does not raise IndexError.
        if len(examples) > 0:
            print("TextLengthFit", len(examples[0]))
        return self

    def transform(self, examples):
        """Return an array of shape ``(len(examples), 1)`` with each example's length."""
        lengths = [len(ex) for ex in examples]
        # Explicit row count keeps the (0, 1) shape for empty input.
        return np.array(lengths, dtype=float).reshape(len(lengths), 1)



In [76]:
# TODO: Add custom feature transformers for the movie review data
class CountTransformer(BaseEstimator, TransformerMixin):
    """Bag-of-words feature transformer that wraps a ``CountVectorizer``."""

    def __init__(self):
        self.vectorizer = CountVectorizer()
        # Set to the fitted vectorizer by fit(); None until then.
        self.transformer = None
        # Not used by this class; kept so existing attribute access still works.
        self.x_train = None

    def fit(self, examples, y=None):
        """Fit the wrapped vectorizer on ``examples`` and return ``self``.

        Parameters
        ----------
        examples : sequence of str
            Training documents.
        y : ignored
            Accepted so that callers which pass labels (``fit(X, y)``) do not
            fail with a TypeError.
        """
        print(examples[:1])
        # The original stored this under a misspelled attribute name, which made
        # transform() fail, and then indexed self.x_train while it was still None.
        self.transformer = self.vectorizer.fit(examples)
        return self

    def transform(self, examples):
        """Return the count features the wrapped vectorizer produces for ``examples``."""
        # Delegate to the vectorizer itself (the same object fit() stores in
        # self.transformer), so an unfitted call surfaces the vectorizer's own error.
        return self.vectorizer.transform(examples)
    

In [77]:
# TODO: Add custom feature transformers for the movie review data
class TfidfTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF feature transformer that wraps a ``TfidfVectorizer``.

    The vectorizer is configured with sublinear term frequency, ``max_df=0.5``
    and the built-in English stop-word list.
    """

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        # Set to the fitted vectorizer by fit(); None until then. The original
        # initialised a misspelled attribute here, leaving this one undefined.
        self.transformer = None

    def fit(self, examples, y=None):
        """Fit the wrapped vectorizer on ``examples`` and return ``self``.

        Parameters
        ----------
        examples : sequence of str
            Training documents.
        y : ignored
            Accepted so that callers which pass labels (``fit(X, y)``) do not
            fail with a TypeError.
        """
        print("tfidf fit", examples[:1])
        self.transformer = self.tfidf_vectorizer.fit(examples)
        return self

    def transform(self, examples):
        """Return the TF-IDF features the wrapped vectorizer produces for ``examples``."""
        print("tfidf transform", examples[:1])
        # Delegate to the vectorizer itself (the same object fit() stores in
        # self.transformer), so an unfitted call surfaces the vectorizer's own error.
        features = self.tfidf_vectorizer.transform(examples)
        # Only trace the first row when there is one; an empty batch has no row 0.
        if features.shape[0] > 0:
            print(features[0])
        return features
    

In [78]:
class Featurizer:
    """Bundle every feature pipeline into a single ``FeatureUnion``.

    Each pipeline first selects the ``'text'`` entry of the input dict with an
    ``ItemSelector`` and then applies one feature transformer to it.
    """

    def __init__(self):
        # To add new features, build another pipeline and add it to the union.
        # TODO: Add any new feature transformers or other features to the FeatureUnion
        length_pipeline = Pipeline([
            ('selector', ItemSelector(key='text')),
            ('text_length', TextLengthTransformer()),
        ])
        tfidf_pipeline = Pipeline([
            ('selector', ItemSelector(key='text')),
            ('tfidf', TfidfTransformer()),
        ])
        self.all_features = FeatureUnion([
            ('text_stats', length_pipeline),
            ('text_stats2', tfidf_pipeline),
        ])

    def train_feature(self, examples):
        """Fit the union on ``examples`` and return the resulting features."""
        fitted_features = self.all_features.fit_transform(examples)
        return fitted_features

    def test_feature(self, examples):
        """Return features for ``examples`` using the already-fitted union."""
        features = self.all_features.transform(examples)
        return features

In [79]:

# Read in data: each record under data['data'] supplies a 'text' and a 'label'.
with open('../data/movie_review_data.json') as f:
    data = json.load(f)

records = data['data']
dataset_x = [record['text'] for record in records]
dataset_y = [record['label'] for record in records]

# Split dataset: 30% held out for testing, seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_x,
    dataset_y,
    test_size=0.3,
    random_state=SEED,
)


In [80]:
# Display the first training example (the 'text' of one record) as the cell output.
X_train[0]

'note : some may consider portions of the following text to be spoilers . be forewarned .  " all the world\'s a stage and all the men and women merely players they have their exits and their entrances and one man in his time plays many parts " - excerpt from as you like it , act ii , scene 7 when william shakespeare penned this passage , he could not have possibly envisioned a world in which the domestic activites in an abode would be broadcast across the continent , or where women would install webcams in their apartments in order to convert voyeurism into cash . this is the world of today , and it is the perfect climate to unveil a prototypical high-concept project like the truman show . truman burbank ( jim carrey ) seems to have the perfect life . he has a pretty , doting wife meryl ( laura linney ) , a comfortable insurance sales position , an immaculate suburban home in the idyllic island community of seahaven , a reliable childhood buddy marlon ( noah emmerich ) -- except for th

In [81]:

feat = Featurizer()

# Distinct training labels, in order of first appearance.
labels = list(dict.fromkeys(y_train))

print("Label set: %s\n" % str(labels))

# The featurizer takes a dict of named input pieces; the ItemSelector steps
# pick the piece they need by key. 'text' holds the plaintext examples.
train_inputs = {'text': list(X_train)}
test_inputs = {'text': list(X_test)}

# Fit the feature pipelines on the training data, then reuse them on the test data.
feat_train = feat.train_feature(train_inputs)
feat_test = feat.test_feature(test_inputs)

print(feat_train)
print(set(y_train))



Label set: [1, 0]

SelectorFit
dict_keys(['text'])
TextLengthFit 7257
SelectorFit
dict_keys(['text'])
tfidf fit ['note : some may consider portions of the following text to be spoilers . be forewarned .  " all the world\'s a stage and all the men and women merely players they have their exits and their entrances and one man in his time plays many parts " - excerpt from as you like it , act ii , scene 7 when william shakespeare penned this passage , he could not have possibly envisioned a world in which the domestic activites in an abode would be broadcast across the continent , or where women would install webcams in their apartments in order to convert voyeurism into cash . this is the world of today , and it is the perfect climate to unveil a prototypical high-concept project like the truman show . truman burbank ( jim carrey ) seems to have the perfect life . he has a pretty , doting wife meryl ( laura linney ) , a comfortable insurance sales position , an immaculate suburban home i

In [82]:

# Train classifier
# random_state=SEED makes the shuffled SGD run repeatable.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' instead of
# 'log' — confirm against the installed version before changing it.
lr = SGDClassifier(loss='log', penalty='l2', alpha=0.01, max_iter=100, shuffle=True,
                   verbose=2, random_state=SEED)

lr.fit(feat_train, y_train)

# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the reported
# numbers are unchanged by putting the arguments in the documented order.
y_pred = lr.predict(feat_train)
accuracy = accuracy_score(y_train, y_pred)
print("Accuracy on training set =", accuracy)
y_pred = lr.predict(feat_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set =", accuracy)

# EXTRA CREDIT: Replace the following code with scikit-learn cross validation

-- Epoch 1
Norm: 218.87, NNZs: 25217, Bias: -0.144211, T: 1400, Avg. loss: 1287714.812042
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 100.14, NNZs: 30378, Bias: -0.166552, T: 2800, Avg. loss: 222468.445218
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 56.11, NNZs: 32307, Bias: -0.178242, T: 4200, Avg. loss: 127309.596050
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 50.14, NNZs: 33128, Bias: -0.184788, T: 5600, Avg. loss: 88911.818882
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 36.19, NNZs: 33506, Bias: -0.192887, T: 7000, Avg. loss: 66764.622624
Total training time: 0.02 seconds.
-- Epoch 6
Norm: 23.68, NNZs: 33745, Bias: -0.197856, T: 8400, Avg. loss: 53592.746203
Total training time: 0.02 seconds.
-- Epoch 7
Norm: 15.41, NNZs: 33854, Bias: -0.203774, T: 9800, Avg. loss: 51355.720944
Total training time: 0.03 seconds.
-- Epoch 8
Norm: 2.81, NNZs: 33897, Bias: -0.206870, T: 11200, Avg. loss: 41088.726185
Total training time: 0.03 seconds.
-- Epoch 9
Norm: 2