In [3]:
# Another workbook since the other one was slowing down.
# load some prereq libraries
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TKAgg')
from matplotlib import pyplot as plt
from sklearn import cross_validation

In [23]:
from stop_words import get_stop_words
import collections
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import word2vec
from lda.lda import LDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import zero_one_loss
from pybrain.structure import FeedForwardNetwork
from pybrain.structure import RecurrentNetwork
from sklearn.svm import SVC
from pybrain.structure import LinearLayer, SigmoidLayer
from sklearn.metrics.pairwise import chi2_kernel
from pybrain.structure import FullConnection
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities           import percentError
from pybrain.tools.shortcuts     import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules   import SoftmaxLayer

In [4]:
############################### Library Functions ###########################################

In [5]:
def process_text(e):
    """Decode, lower-case, tokenize, stop-word filter and stem one document.

    Uses the notebook-level ``tokenizer``, ``en_stop`` and ``p_stemmer``
    objects, which must exist before the first call.

    :param e: object with a ``decode`` method holding UTF-8 encoded text.
    :return: list of stemmed tokens in document order, stop words removed.
    """
    words = tokenizer.tokenize(e.decode('utf-8').lower())
    stems = []
    for word in words:
        if word in en_stop:
            # stop words carry no signal for the models downstream
            continue
        stems.append(p_stemmer.stem(word))
    return stems

In [6]:
def process_text_no_stem(e):
    """Decode, lower-case and tokenize one document.

    Unlike ``process_text`` this keeps stop words and does not stem.
    Uses the notebook-level ``tokenizer``.

    :param e: object with a ``decode`` method holding UTF-8 encoded text.
    :return: list of tokens in document order.
    """
    return tokenizer.tokenize(e.decode('utf-8').lower())

In [7]:
def label_to_num(ys):
    """Translate sentiment label strings into integer class ids.

    'Positive' -> 0, 'Neutral' -> 1, 'Negative' -> 2.  Any other value is
    silently dropped, so the result can be shorter than ``ys``.

    :param ys: iterable of sentiment labels.
    :return: list of integer class ids, in input order.
    """
    class_names = ('Positive', 'Neutral', 'Negative')
    return [
        class_id
        for label in ys
        for class_id, name in enumerate(class_names)
        if label == name
    ]

In [8]:
def create_feature_vector(tokens, candidate, location, subject):
    """Build the compact feature vector for one tweet.

    Layout: one presence flag (0/1) per entry of ``vocabulary_set``, followed
    by three slots holding the positions of ``candidate``, ``location`` and
    ``subject`` inside ``candidate_set``, ``location_set`` and ``subject_set``.

    :param tokens: iterable of tokens; each must be present in ``vocabulary_set``.
    :param candidate: value to look up in ``candidate_set``.
    :param location: value to look up in ``location_set``.
    :param subject: value to look up in ``subject_set``.
    :return: 1-D numpy array of length ``len(vocabulary_set) + 3``.
    :raises ValueError: if any lookup value is missing from its list.
    """
    base = len(vocabulary_set)
    features = np.zeros(base + 3)
    for word in tokens:
        features[vocabulary_set.index(word)] = 1
    # the three trailing slots store ordinal ids, not one-hot flags
    trailing = (
        candidate_set.index(candidate),
        location_set.index(location),
        subject_set.index(subject),
    )
    features[base:] = trailing
    return features

In [9]:
def create_feature_vector_expanded(tokens, candidate, location, subject):
    """Build the fully one-hot feature vector for one tweet.

    Layout, in order: vocabulary presence flags, one-hot candidate, one-hot
    location, one-hot subject.  Lookups go through the notebook-level
    ``vocabulary_set``, ``candidate_set``, ``location_set`` and ``subject_set``.

    :param tokens: iterable of tokens; each must be present in ``vocabulary_set``.
    :param candidate: value to look up in ``candidate_set``.
    :param location: value to look up in ``location_set``.
    :param subject: value to look up in ``subject_set``.
    :return: 1-D numpy array whose length is the sum of the four list lengths.
    :raises ValueError: if any lookup value is missing from its list.
    """
    n_vocab = len(vocabulary_set)
    n_cand = len(candidate_set)
    n_loc = len(location_set)
    features = np.zeros(n_vocab + n_cand + len(subject_set) + n_loc)

    for word in tokens:
        features[vocabulary_set.index(word)] = 1

    # start offsets of the three one-hot segments
    cand_start = n_vocab
    loc_start = cand_start + n_cand
    subj_start = loc_start + n_loc

    features[cand_start + candidate_set.index(candidate)] = 1
    features[loc_start + location_set.index(location)] = 1
    features[subj_start + subject_set.index(subject)] = 1
    return features

In [10]:
############################## End Library Functions #########################################

In [11]:
# Shared pre-processing objects used by process_text / process_text_no_stem.
# Tokenizer driven by the regular expression \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
# Porter stemmer applied to every token that survives stop-word filtering.
p_stemmer = PorterStemmer()

# English stop-word collection; tokens found in it are discarded.
en_stop = get_stop_words('en')

In [12]:
# Load the labelled data set.  Later cells read its 'text', 'sentiment',
# 'candidate', 'subject_matter' and 'tweet_location' columns.
dftrain = pd.read_csv('output/Sentiment.csv')

In [13]:
# Pre-process every document once.  Each entry of texts_all is that document's
# list of stemmed, stop-word-free tokens.  This reuses process_text rather than
# repeating its body inline, so the vocabulary and the feature builders are
# guaranteed to share the same pre-processing.
df = dftrain['text']
texts_all = [process_text(e) for e in df]

In [14]:
# Flatten the per-document token lists into one long list of tokens ...
vocabulary = []
for doc_tokens in texts_all:
    vocabulary.extend(doc_tokens)
# print vocabulary
# ... then keep each distinct token once; a token's position in this list is
# its column in the feature vectors.
vocabulary_set = list(set(vocabulary))

In [15]:
# For each categorical column keep the raw values (``*_list``) and the distinct
# values as a list (``*_set``); a value's position in ``*_set`` is its id.
candidate_list = list(dftrain['candidate'])
candidate_set = list(set(candidate_list))

subject_list = list(dftrain['subject_matter'])
subject_set = list(set(subject_list))

location_list = list(dftrain['tweet_location'])
location_set = list(set(location_list))

In [16]:
# get a partial set
# Hold out 40% of the rows for testing; random_state=0 fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(dftrain['text'], dftrain['sentiment'],
                                                                    test_size=0.4, random_state=0)

In [25]:
# build an LDA model using this X_train set
# Binary bag-of-words matrix: one row per training document, one column per
# vocabulary entry.  vocabulary_set holds distinct tokens, so a token -> column
# dict built once gives the same column as vocabulary_set.index(t) without a
# linear scan of the vocabulary for every token.
vocab_index = {tok: col for col, tok in enumerate(vocabulary_set)}
m = np.zeros((len(X_train), len(vocabulary_set)))
for i in range(len(X_train)):
    tokens = process_text(X_train.iloc[i])
    for t in tokens:
        m[i, vocab_index[t]] = 1

In [32]:
# We have a matrix; let's use LDA on it.
# Fit a 10-topic model with a fixed seed on the integer-cast matrix ``m``.
model = LDA(n_topics=10, random_state=0)
model.fit(m.astype(int))

<lda.lda.LDA instance at 0x10b700b00>

In [55]:
# Sanity check on the fitted model's document/topic matrix dimensions.
model.doc_topic_.shape

(8322, 10)

In [41]:
# We have the LDA vectors, which represent the probability of each topic.
# Let's feed these in as a matrix to a classifier and see how the classifier
# performs given the dimension reduction that has happened.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(model.doc_topic_, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [46]:
# Stratified 10-fold splitter built from the raw sentiment label strings, plus
# an accumulator for the mean per-fold error.
skf = StratifiedKFold(dftrain['sentiment'],n_folds=10)
averageError = 0.0

In [75]:
# Stratified k-fold evaluation: bag-of-words -> LDA topics -> random forest.
k = 10
skf = StratifiedKFold(label_to_num(dftrain['sentiment']), n_folds=k)

# vocabulary_set holds distinct tokens, so this dict returns the same column as
# vocabulary_set.index(tok) without scanning the whole list for every token.
vocab_index = {tok: col for col, tok in enumerate(vocabulary_set)}


def _bag_of_words(docs):
    """Return the integer presence matrix (len(docs) x vocabulary size) for ``docs``.

    ``docs`` is a pandas Series of raw documents; row ``r`` of the result has
    a 1 in the column of every token process_text yields for ``docs.iloc[r]``.
    """
    matrix = np.zeros((len(docs), len(vocab_index)), dtype=int)
    for row in range(len(docs)):
        for tok in process_text(docs.iloc[row]):
            matrix[row, vocab_index[tok]] = 1
    return matrix


averageError = 0.0
for train_index, test_index in skf:
    # the splitter yields row positions, so select positionally
    X_train, X_test = dftrain['text'].iloc[train_index], dftrain['text'].iloc[test_index]
    y_train, y_test = dftrain['sentiment'].iloc[train_index], dftrain['sentiment'].iloc[test_index]

    # fit the topic model on the training fold only ...
    m = _bag_of_words(X_train)
    model = LDA(n_topics=13, random_state=0)
    model.fit(m)

    # ... then project the held-out fold into the same topic space
    m_test = _bag_of_words(X_test)
    clf_X_test = model.transform(m_test)

    clf = RandomForestClassifier(random_state=0)
    clf.fit(model.doc_topic_, y_train)
    y_pred = clf.predict(clf_X_test)
    error = zero_one_loss(label_to_num(y_test), label_to_num(y_pred))
    print(error)
    averageError += (1./k) * error
# zero_one_loss is a fraction in [0, 1], hence the scaling to percent here
print("Average error: %4.2f%s" % (100 * averageError, '%'))

0.434125269978
0.449244060475
0.438040345821
0.466474405191
0.459985580389
0.483056957462
0.395382395382
0.47113997114
0.406204906205
0.40404040404
Average error: 44.08%


In [61]:
# Number of rows in the sentiment column.
print len(dftrain['sentiment'])

13871


In [63]:
# Number of rows in the text column.
print len(dftrain['text'])

13871


In [96]:
# Debug check: print the subject_matter value of the first training row of
# every fold.  Relies on ``k`` having been set by a previously executed cell.
skf = StratifiedKFold(label_to_num(dftrain['sentiment']),n_folds=k)
for train_index, test_index in skf:
    print dftrain['subject_matter'][train_index[0]]

None of the above
None of the above
None of the above
None of the above
None of the above
None of the above
None of the above
None of the above
None of the above
None of the above


In [126]:
# Stratified k-fold evaluation: one-hot expanded features -> LDA topics ->
# random forest.
k = 10
skf = StratifiedKFold(label_to_num(dftrain['sentiment']), n_folds=k)


def _expanded_features(row_ids):
    """Stack create_feature_vector_expanded rows for the given dftrain rows.

    ``row_ids`` are the row indices produced by the splitter; the result has
    one row per id, in the same order.
    """
    width = len(vocabulary_set) + len(candidate_set) + len(location_set) + len(subject_set)
    matrix = np.zeros((len(row_ids), width))
    for row, idx in enumerate(row_ids):
        matrix[row] = create_feature_vector_expanded(
            process_text(dftrain['text'][idx]),
            dftrain['candidate'][idx],
            dftrain['tweet_location'][idx],
            dftrain['subject_matter'][idx])
    return matrix


averageError = 0.0
for train_index, test_index in skf:
    X_train, X_test = dftrain['text'][train_index], dftrain['text'][test_index]
    y_train, y_test = dftrain['sentiment'][train_index], dftrain['sentiment'][test_index]

    # fit the topic model on the training fold only ...
    m = _expanded_features(train_index)
    model = LDA(n_topics=13, random_state=0)
    model.fit(m.astype(int))

    # ... then project the held-out fold into the same topic space
    m_test = _expanded_features(test_index)
    clf_X_test = model.transform(m_test.astype(int))

    clf = RandomForestClassifier(random_state=0)
    clf.fit(model.doc_topic_, y_train)
    y_pred = clf.predict(clf_X_test)
    error = zero_one_loss(label_to_num(y_test), label_to_num(y_pred))
    print(error)
    averageError += (1./k) * error
# zero_one_loss is a fraction in [0, 1], hence the scaling to percent here
print("Average error: %4.2f%s" % (100 * averageError, '%'))

0.429085673146
0.462922966163
0.448847262248
0.452775775054
0.433309300649
0.381398702235
0.383838383838
0.4329004329
0.430014430014
0.450937950938
Average error: 43.06%


In [114]:
# Stratified k-fold evaluation: random forest trained directly on the compact
# feature vectors (vocabulary flags + three ordinal ids); no LDA step.
k = 10
skf = StratifiedKFold(label_to_num(dftrain['sentiment']), n_folds=k)


def _ordinal_features(row_ids):
    """Stack create_feature_vector rows for the given dftrain rows.

    ``row_ids`` are the row indices produced by the splitter; the result has
    one row per id, in the same order.
    """
    matrix = np.zeros((len(row_ids), len(vocabulary_set) + 3))
    for row, idx in enumerate(row_ids):
        matrix[row] = create_feature_vector(
            process_text(dftrain['text'][idx]),
            dftrain['candidate'][idx],
            dftrain['tweet_location'][idx],
            dftrain['subject_matter'][idx])
    return matrix


averageError = 0.0
for train_index, test_index in skf:
    X_train, X_test = dftrain['text'][train_index], dftrain['text'][test_index]
    y_train, y_test = dftrain['sentiment'][train_index], dftrain['sentiment'][test_index]

    m = _ordinal_features(train_index)
    print(m)

    m_test = _ordinal_features(test_index)
    print(m_test)

    clf = RandomForestClassifier(random_state=0)
    clf.fit(m, y_train)
    y_pred = clf.predict(m_test)
    error = zero_one_loss(label_to_num(y_test), label_to_num(y_pred))
    print(error)
    averageError += (1./k) * error
# zero_one_loss is a fraction in [0, 1], hence the scaling to percent here
print("Average error: %4.2f%s" % (100 * averageError, '%'))

[[    0.     0.     0. ...,     8.  1962.    12.]
 [    0.     0.     0. ...,    11.  2078.    12.]
 [    0.     0.     0. ...,     8.  3307.    12.]
 ..., 
 [    0.     0.     0. ...,    11.     0.    12.]
 [    0.     0.     0. ...,     5.     0.    11.]
 [    0.     0.     0. ...,    11.  2201.    12.]]
[[    0.     0.     0. ...,     8.     0.    12.]
 [    0.     0.     0. ...,     4.     0.    12.]
 [    0.     0.     0. ...,     8.     0.    12.]
 ..., 
 [    0.     0.     0. ...,     8.  1670.    12.]
 [    0.     0.     0. ...,     5.  2691.    12.]
 [    0.     0.     0. ...,     8.     0.    11.]]
0.347732181425
[[    0.     0.     0. ...,     8.     0.    12.]
 [    0.     0.     0. ...,     4.     0.    12.]
 [    0.     0.     0. ...,     8.     0.    12.]
 ..., 
 [    0.     0.     0. ...,    11.     0.    12.]
 [    0.     0.     0. ...,     5.     0.    11.]
 [    0.     0.     0. ...,    11.  2201.    12.]]
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   8

In [None]:
# Stratified k-fold evaluation: chi-squared-kernel SVM trained directly on the
# compact feature vectors (vocabulary flags + three ordinal ids); no LDA step.
k = 10
skf = StratifiedKFold(label_to_num(dftrain['sentiment']), n_folds=k)


def _ordinal_features(row_ids):
    """Stack create_feature_vector rows for the given dftrain rows.

    ``row_ids`` are the row indices produced by the splitter; the result has
    one row per id, in the same order.
    """
    matrix = np.zeros((len(row_ids), len(vocabulary_set) + 3))
    for row, idx in enumerate(row_ids):
        matrix[row] = create_feature_vector(
            process_text(dftrain['text'][idx]),
            dftrain['candidate'][idx],
            dftrain['tweet_location'][idx],
            dftrain['subject_matter'][idx])
    return matrix


averageError = 0.0
for train_index, test_index in skf:
    X_train, X_test = dftrain['text'][train_index], dftrain['text'][test_index]
    y_train, y_test = dftrain['sentiment'][train_index], dftrain['sentiment'][test_index]

    m = _ordinal_features(train_index)
    print(m)

    m_test = _ordinal_features(test_index)
    print(m_test)

    # the kernel is passed as a callable, so SVC evaluates chi2_kernel itself
    clf = SVC(kernel=chi2_kernel)
    clf.fit(m, y_train)
    y_pred = clf.predict(m_test)
    error = zero_one_loss(label_to_num(y_test), label_to_num(y_pred))
    print(error)
    averageError += (1./k) * error
# zero_one_loss is a fraction in [0, 1], hence the scaling to percent here
print("Average error: %4.2f%s" % (100 * averageError, '%'))

In [35]:
## building a neural network system to test with here as well.
def createNetwork(inputDim, hiddenDim, outputDim):
    """Assemble the pybrain network used by the neural-network experiment.

    Topology: linear input -> three sigmoid hidden layers of equal width ->
    softmax output, fully connected layer to layer, plus one recurrent full
    connection from the last hidden layer back to the first.

    :param inputDim: width of the input layer.
    :param hiddenDim: width of each of the three hidden layers.
    :param outputDim: width of the softmax output layer.
    :return: the RecurrentNetwork after ``sortModules()`` has been called.
    """
    net = RecurrentNetwork()

    # modules, registered in forward order
    net.addInputModule(LinearLayer(inputDim, name='in'))
    for layer_name in ('hidden', 'hidden1', 'hidden2'):
        net.addModule(SigmoidLayer(hiddenDim, name=layer_name))
    net.addOutputModule(SoftmaxLayer(outputDim, name='out'))

    # forward connections c1..c4 link each layer to its successor
    chain = ('in', 'hidden', 'hidden1', 'hidden2', 'out')
    for pos in range(len(chain) - 1):
        source, target = net[chain[pos]], net[chain[pos + 1]]
        net.addConnection(FullConnection(source, target, name='c%d' % (pos + 1)))

    # feedback edge from the last hidden layer to the first
    net.addRecurrentConnection(FullConnection(net['hidden2'], net['hidden'], name='c5'))

    net.sortModules()
    return net

In [None]:
# Stratified k-fold evaluation of the pybrain network on the one-hot expanded
# feature vectors.
k = 10
skf = StratifiedKFold(label_to_num(dftrain['sentiment']), n_folds=k)
input_dim = len(vocabulary_set) + len(candidate_set) + len(location_set) + len(subject_set)


def _build_dataset(row_ids):
    """Return a 3-class ClassificationDataSet for the given dftrain rows.

    ``row_ids`` are the row indices produced by the splitter.  Each sample
    pairs the expanded feature vector of a row with its numeric sentiment id.
    """
    dataset = ClassificationDataSet(input_dim, nb_classes=3)
    for idx in row_ids:
        features = create_feature_vector_expanded(
            process_text(dftrain['text'][idx]),
            dftrain['candidate'][idx],
            dftrain['tweet_location'][idx],
            dftrain['subject_matter'][idx])
        label = label_to_num([dftrain['sentiment'][idx]])[0]
        dataset.addSample(features, [label])
    return dataset


averageError = 0.0
for train_index, test_index in skf:
    trndata = _build_dataset(train_index)
    print(trndata)

    testdata = _build_dataset(test_index)
    print(testdata)

    # convert to one of many
    trndata._convertToOneOfMany()
    testdata._convertToOneOfMany()

    # train the network
    n = createNetwork(trndata.indim, 100, trndata.outdim)
    print(n)
    trainer = BackpropTrainer(n, dataset=trndata, momentum=0.1, verbose=True, weightdecay=0.01)

    # 20 rounds of 5 epochs each, reporting train/test error after every round
    for round_no in range(20):
        print("Training Epoch: {0}".format(round_no))
        trainer.trainEpochs(5)
        trnresult = percentError(trainer.testOnClassData(), trndata['class'])
        tstresult = percentError(trainer.testOnClassData(dataset=testdata), testdata['class'])

        print("epoch: %4d   train error: %5.2f%%   test error: %5.2f%%"
              % (trainer.totalepochs, trnresult, tstresult))

    error = percentError(trainer.testOnClassData(dataset=testdata), testdata['class'])
    averageError += (1./k) * error
# percentError already returns a percentage (see the per-round prints above),
# so the average must not be multiplied by 100 again.
print("Average error: %4.2f%s" % (averageError, '%'))

input: dim(16382, 19668)
[[ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]]

target: dim(16382, 1)
[[0]
 [0]
 [0]
 ..., 
 [0]
 [2]
 [0]]

class: dim(0, 1)
[]


input: dim(2046, 19668)
[[ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  1.  0.]]

target: dim(2046, 1)
[[1]
 [0]
 [1]
 ..., 
 [2]
 [2]
 [2]]

class: dim(0, 1)
[]


RecurrentNetwork-20
   Modules:
    [<LinearLayer 'in'>, <SigmoidLayer 'hidden'>, <SigmoidLayer 'hidden1'>, <SigmoidLayer 'hidden2'>, <SoftmaxLayer 'out'>]
   Connections:
    [<FullConnection 'c1': 'in' -> 'hidden'>, <FullConnection 'c2': 'hidden' -> 'hidden1'>, <FullConnection 'c3': 'hidden1' -> 'hidden2'>, <FullConnection 'c4': 'hidden2' -> 'out'>]
   Recurrent Connections:
    