In [2]:
import collections
import heapq
import matplotlib.pyplot as plt
import numpy as np
import os

from matplotlib import offsetbox
from scipy.sparse import csr_matrix
from sklearn import cluster, datasets, decomposition, ensemble, lda, manifold, random_projection
from sklearn.decomposition import TruncatedSVD
from time import time

%matplotlib inline

In [3]:
# Toggle: when True, the parts that take a long time are skipped.
SKIP_LONG_PARTS = True

# Toggle: when True, a dense representation of the data is created.
CREATE_DENSE_ARRAY = False

# Dataset sizes. NUM_SPEECHES is used as the row count of the speech data;
# NUM_DEBATES is presumably the number of debates covered -- TODO confirm.
NUM_SPEECHES, NUM_DEBATES = 2740, 38

In [4]:
WORD_INDEX = 0
WORD_FREQ = 1

# Building blocks of the CSR matrix: row pointers, column indices and values.
sparse_indptr = [0]
sparse_indices = []
sparse_data = []
vocabulary = {}

# Maps a column index to the set of row (line) numbers in which that column
# holds a strictly positive value.
inverted_index = collections.defaultdict(set)

dense_data = None

print('Reading data.')
num_rows = 0
# `with` guarantees the file handle is closed, even if a field fails to parse.
with open('speech_vectors.csv') as vectors_file:
    for line_num, line in enumerate(vectors_file):
        for idx, field in enumerate(line.strip().split(',')):
            # Parse each field once; only strictly positive entries are stored.
            freq = float(field)
            if freq > 0.0:
                sparse_indices.append(idx)
                sparse_data.append(freq)
                inverted_index[idx].add(line_num)
        # Close the current row: everything appended so far belongs to rows <= line_num.
        sparse_indptr.append(len(sparse_indices))
        num_rows = line_num + 1

        if line_num % 150 == 0:
            # NUM_SPEECHES comes from the configuration cell.
            print('%s %%' % (100.0 * line_num / NUM_SPEECHES))
# Final progress is based on the number of rows actually read, so a complete
# file reports 100 % and an empty file does not hit an undefined loop variable.
print('%s %%' % (100.0 * num_rows / NUM_SPEECHES))

# The name `sparse_data` is rebound from the value list to the CSR matrix,
# which is what the later cells consume.
sparse_data = csr_matrix((sparse_data, sparse_indices, sparse_indptr), dtype=float)
print('Done reading data.')

print(sparse_data.shape)

Reading data.
0.0 %
5.47445255474 %
10.9489051095 %
16.4233576642 %
21.897810219 %
27.3722627737 %
32.8467153285 %
38.3211678832 %
43.795620438 %
49.2700729927 %
54.7445255474 %
60.2189781022 %
65.6934306569 %
71.1678832117 %
76.6423357664 %
82.1167883212 %
87.5912408759 %
93.0656934307 %
98.5401459854 %
99.9635036496 %
Done reading data.
(2740, 50000)


In [6]:
# Load the speech graph: every line of the CSV becomes one row of floats.
# `with` ensures the file handle is closed once reading finishes.
with open('speech_graph.csv') as graph_file:
    speech_graph = [
        [float(value) for value in line.strip().split(',')]
        for line in graph_file
    ]

print(len(speech_graph))

2740


## SVM Classification

In [70]:
from sklearn import svm, grid_search
from sklearn.decomposition import PCA, SparsePCA

# Row indices of the hand-labelled training speeches.
label0_speeches = [2, 13, 18, 24]   # label 0 ("Against" category)
label1_speeches = [1, 3, 27, 177]   # label 1

# Reduce the sparse speech matrix to 10 components with truncated SVD.
# A fixed random_state keeps the reduction (and therefore the predictions)
# reproducible across runs.
truncated_svd = TruncatedSVD(n_components=10, random_state=0)
reduced_data = truncated_svd.fit_transform(sparse_data)

# Form the training data: label-0 examples first, then label-1 examples.
X = reduced_data[label0_speeches + label1_speeches, :]
y = [0] * len(label0_speeches) + [1] * len(label1_speeches)

# Fit an SVM, grid-searching over kernel and C.
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)
clf.fit(X, y)

# Predict every speech in a single batched call (2-D input) instead of one
# row at a time; `clf` is left fitted so later cells can reuse it.
print(len(reduced_data))
predictions = clf.predict(reduced_data).tolist()
ones = predictions.count(1)

print(predictions)
# Fraction of speeches predicted as label 1.
print(float(ones) / len(predictions))

2740
[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
"""
Speeches 2, 13, 18, 24 are examples of speeches that belong to the category "Against" (= label 0), and speeches 1, 3, 27, 177 are the examples used for the other category (= label 1).

"""