In [1]:
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
import pandas as pd

In [2]:
cd ../data

/Users/Alvin/Repos/amazon_review_summarizer/data


### Polarity Data

In [3]:
df = pd.read_csv('labeled_random_reviews.csv')

In [4]:
# df = df[['Rating', 'TextBlob', 'VADER', 'Afinn', 'Label']]
df = df[['Rating', 'TextBlob', 'Afinn', 'Label']]

In [5]:
from sklearn.cross_validation import train_test_split

y = df.pop('Label')
X = df.values

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
clf = tree.DecisionTreeClassifier(max_depth=3)
forest = RandomForestClassifier(max_depth=3, n_estimators=100)

In [7]:
clf.fit(X_train, y_train)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
y_pred = clf.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)

y_pred2 = forest.predict(X_test)
print metrics.accuracy_score(y_test, y_pred2)

0.692
0.72


In [9]:
clf2 = tree.DecisionTreeClassifier(max_depth=3)
clf2.fit(X, y)
y_pred = clf2.predict(X)
print metrics.accuracy_score(y, y_pred)

0.714


### My decision tree-like predictor for polarity

In [10]:
def polarity_class(rating, blob, afinn):
        '''
        INPUT: str, int
        OUTPUT: None

        Args:
            aspect: aspect dictionary to use
            review: review to score polarity on

        Function that determines polarity of a review.

        Assigns positive/negative polarity class to polarity according to the
        following custom rules:

            pol_blob = polarity score from TextBlob
            pol_afin = polarity score from Afinn

            POS:      rating == 5 AND pol_blob > 0.1
                   OR rating == 4 AND pol_blob > 0.45
                   OR rating == 4 AND pol_blob > 0.2 and pol_afin >= 4
                   OR rating == 3 AND pol_blob > 0.7
            NEG:      rating == 1 AND pol_blob < 0
                   OR rating == 1 AND pol_blob <= 0.2 AND pol_afin < 0
                   OR rating == 2 AND pol_blob < 0
                   OR rating == 2 AND pol_blob <= 0.175 AND pol_afin < 0
                   OR rating == 3 AND pol_blob < 0
            MIXED:    all other cases

        Adds tuple of (review_txt, aspect_idx, rating, pol_blob) to
        self.aspect_pol_list object.
        '''
        pol_blob = blob
        pol_afin = afinn

        if rating == 5 and pol_blob > 0.1:
            result = 1
        elif rating == 4 and pol_blob > 0.45:
            result = 1
        elif rating == 4 and pol_blob > 0.2:
            result = 1 if pol_afin >= 4 else 0
        elif rating == 3 and pol_blob > 0.7:
            result = 1
        elif rating == 3 and pol_blob < 0:
            result = -1
        elif rating == 2 and pol_blob < 0:
            result = -1
        elif rating == 2 and pol_blob <= 0.175:
            result = -1 if pol_afin < 0 else 0
        elif rating == 1 and pol_blob < 0:
            result = -1
        elif rating == 1 and pol_blob <= 0.2:
            result = -1 if pol_afin < 0 else 0
        else:
            result = 0

        return result

In [11]:
df['mymodel'] = df.apply(lambda x: polarity_class(x.Rating, x.TextBlob, x.Afinn), axis=1)

In [12]:
print metrics.accuracy_score(y, df.mymodel.values)

0.723


### Unigram Data

In [13]:
df2 = pd.read_csv('labeled_unigrams.csv')
# df2 = df2[['amod_pct', 'avg_polar_word_pct', 'is_aspect']]
df2 = df2[['amod_pct', 'is_aspect']]

In [14]:
y = df2.pop('is_aspect')
X = df2.values

In [15]:
clf3 = tree.DecisionTreeClassifier(max_depth=1)
clf3.fit(X, y)
y_pred = clf3.predict(X)

print metrics.accuracy_score(y, y_pred)
print metrics.f1_score(y, y_pred)

0.739423076923
0.783026421137


In [16]:
y_pred = X >= 0.094
print metrics.accuracy_score(y, y_pred)
print metrics.f1_score(y, y_pred)

0.739423076923
0.783026421137


### Bigram Data

In [17]:
df3 = pd.read_csv('labeled_bigrams.csv')
df3 = df3[['Avg_Dist', 'PMI', 'is_aspect']]
df3 = df3.dropna()

In [18]:
y = df3.pop('is_aspect')
X = df3.values

In [19]:
clf4 = tree.DecisionTreeClassifier(max_depth=2)
clf4.fit(X, y)
y_pred = clf4.predict(X)

print metrics.accuracy_score(y, y_pred)
print metrics.f1_score(y, y_pred)

0.834388185654
0.763197586727


In [20]:
y_pred = (X[:,0] < 2) & (X[:,1] >= 0.0003)
print metrics.accuracy_score(y, y_pred)
print metrics.f1_score(y, y_pred)

0.824894514768
0.761494252874


In [21]:
y_pred = (X[:,0] < 1.565) & (X[:,1] >= 0.0003)
print metrics.accuracy_score(y, y_pred)
print metrics.f1_score(y, y_pred)

0.830168776371
0.754946727549


### Export Tree

In [22]:
cols = ['rating', 'blob', 'afinn']

with open('sentiment_tree.dot', 'w') as dotfile:
    tree.export_graphviz(
        clf2,
        dotfile,
        feature_names=cols)

In [23]:
cols = ['avg_dist', 'pmi']

with open('bigram_tree.dot', 'w') as dotfile:
    tree.export_graphviz(
        clf4,
        dotfile,
        feature_names=cols)