In [31]:
import re
import time
from datetime import datetime

import MySQLdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.ndimage.interpolation import shift
from scipy.sparse import csr_matrix, hstack
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate, StratifiedKFold, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import NuSVC, SVC

In [32]:
# Load the tweet table and the USDX quote table from MySQL, keeping only rows
# whose `timestamp` is strictly greater than `start_timestamp`.
tStart = time.time()
# Lower bound for the `timestamp` column (presumably Unix epoch seconds --
# TODO confirm). Other values that have been tried: 1433088000, 1484870400.
start_timestamp = '1259769600'
# SECURITY(review): host, user and password are hard-coded here. Move them to
# environment variables or a secrets store before sharing this notebook.
conn = MySQLdb.connect("54.65.19.253", "news", "newsnews", "news")
try:
    # Bind the bound as a query parameter instead of interpolating it into the
    # SQL text; the driver then takes care of quoting.
    query_params = (int(start_timestamp),)
    sql = "SELECT timestamp, tweet FROM trump WHERE timestamp > %s;"
    trump_data = pd.read_sql_query(sql, conn, params=query_params)
    # Later cells take row-wise differences and lags of `usdx`, so the row
    # order must be chronological; without ORDER BY the order is unspecified.
    sql = "SELECT timestamp, usdx FROM usdx WHERE timestamp > %s ORDER BY timestamp;"
    usdx_data = pd.read_sql_query(sql, conn, params=query_params)
    tEnd = time.time()
finally:
    # Release the connection even when one of the queries raises.
    conn.close()
print("Fetch data time: " +str(tEnd-tStart)+ "sec.")

Fetch data time: 36.03579783439636sec.


In [33]:
# Preview the first rows of the tweet table as loaded from the database.
trump_data.head()

Unnamed: 0,timestamp,tweet
0,1293638400,"WIshing everyone a happy, healthy and prospero..."
1,1293033600,Wishing everyone a very Happy Holiday season!
2,1292169600,"...Trump International Hotel Las Vegas, and Tr..."
3,1292169600,Those five hotels includeTrump International H...
4,1290528000,My best wishes to everyone for a Happy Thanksg...


In [34]:
# (rows, columns) of the tweet table before tweets sharing a timestamp are joined.
trump_data.shape

(20584, 2)

In [35]:
# Join all tweets that share a timestamp into one space-separated document,
# leaving one row per timestamp.
trump_data = trump_data.groupby('timestamp', as_index=False).agg(lambda tweet: ' '.join(tweet))
# `DataFrame.as_matrix` is deprecated and was removed in pandas 1.0; `.values`
# on the column yields the same 1-D array on every pandas version.
Y = usdx_data['usdx'].values
# Row-to-row rate of change of the index. np.gradient treats consecutive rows
# as equally spaced, whatever the actual gap between their timestamps.
usdx_data['gradient'] = np.gradient(Y)

In [36]:
def get_shift(usdx_data, shift_offset=2):
    """Add lagged copies of the ``gradient`` column to ``usdx_data`` in place.

    For every lag ``i`` in ``1..shift_offset`` a column ``gradient_p_<i>`` is
    written whose row ``r`` holds the ``gradient`` value of row ``r - i``; the
    first ``i`` rows, which have no predecessor that far back, are set to 0.

    The lag is done with plain array slicing. The earlier implementation went
    through ``scipy.ndimage`` ``shift``, whose default cubic-spline
    interpolation is unnecessary for whole-row lags, and through
    ``DataFrame.as_matrix``, which no longer exists in pandas >= 1.0.

    Parameters
    ----------
    usdx_data : pandas.DataFrame
        Frame with a ``gradient`` column; it is modified in place.
    shift_offset : int, default 2
        Number of lag columns to create. Values below 1 create none.

    Returns
    -------
    None
    """
    g = usdx_data['gradient'].values.reshape(-1)
    n = g.shape[0]
    for i in range(1, shift_offset+1):
        lagged = np.zeros_like(g)
        # A lag at least as long as the series leaves nothing to copy.
        if i < n:
            lagged[i:] = g[:n - i]
        usdx_data['gradient_p_'+str(i)] = lagged

In [37]:
# Add the lag columns gradient_p_1 and gradient_p_2 to usdx_data (in place),
# then preview the result.
get_shift(usdx_data, shift_offset=2)
usdx_data.head()

Unnamed: 0,timestamp,usdx,gradient,gradient_p_1,gradient_p_2
0,1259856000,100.217,0.1619,0.0,0.0
1,1260115200,100.3789,0.27475,0.1619,0.0
2,1260201600,100.7665,0.2245,0.27475,0.1619
3,1260288000,100.8279,0.03215,0.2245,0.27475
4,1260374400,100.8308,0.1382,0.03215,0.2245


In [38]:
# Inner join on `timestamp`: only timestamps present in both the tweet table
# and the USDX table survive.
data = trump_data.merge(usdx_data, on='timestamp')

In [39]:
# Preview the merged tweet / USDX frame.
data.head()

Unnamed: 0,timestamp,tweet,usdx,gradient,gradient_p_1,gradient_p_2
0,1263225600,"Celebrity Apprentice returns to NBC, Sunday, 3...",100.7315,0.20705,-0.1016,-0.3916
1,1264003200,"from Donald Trump: ""I saw Lady Gaga last night...",101.6901,0.13415,0.3388,0.4553
2,1265817600,Donald Trump appeared on the final episode of ...,103.0298,-0.0945,-0.00325,-0.09375
3,1267718400,From Donald Trump: Andrea Bocelli @ Mar-a-Lago...,102.1641,-0.08555,0.13735,-0.03815
4,1268323200,The Celebrity Apprentice has a two-hour premie...,101.5599,-0.04325,-0.1825,-0.0692


In [40]:
# (rows, columns) of the merged frame, i.e. how many timestamps had both tweets and a quote.
print(data.shape)

(1552, 6)


In [41]:
# Display names for the two classes, indexed by label as produced by binary():
# index 0 -> gradient <= 0, index 1 -> gradient > 0. Passed to
# classification_report in show_result().
target_names = ['decline','up']

In [42]:
def binary(Y):
    """Turn a 1-D numeric array into 0/1 labels and print the class counts.

    A value maps to 1 when it is strictly positive and to 0 otherwise. NaN
    compares false against 0 and therefore also maps to 0.

    The input is left untouched. The earlier implementation overwrote the
    caller's array in place before casting, which silently corrupted any
    array (or view of a larger array) passed in.

    Parameters
    ----------
    Y : array-like, 1-D
        Numeric values to threshold at zero.

    Returns
    -------
    numpy.ndarray of int64
        New array of the same length holding 0/1 labels.
    """
    labels = (np.asarray(Y) > 0).astype('int64')
    # Class balance: f[0] is the number of 0 labels, f[1] the number of 1 labels.
    f = np.bincount(labels)
    print(f)
    return labels

In [43]:
# Target: 1 when the USDX gradient at the tweet's timestamp is positive, else 0.
# `DataFrame.as_matrix` was removed in pandas 1.0, so read the column through
# `.values`; the copy keeps the array writable and detached from `data`.
Y = data['gradient'].values.copy()
Y = binary(Y)

[730 822]


In [44]:
# Lag features: the direction (0/1) of the gradient one and two rows earlier.
# `DataFrame.as_matrix` was removed in pandas 1.0. `.values.copy()` returns an
# (n_rows, 2) array that is writable, which the column assignments below need.
X_p = data[['gradient_p_1', 'gradient_p_2']].values.copy()
X_p[:,0] = binary(X_p[:,0])
X_p[:,1] = binary(X_p[:,1])

[729 823]
[743 809]


In [45]:
# TF-IDF over word 1- to 4-grams of the per-timestamp tweet text; min_df=4
# applies a minimum document-frequency cut-off to the vocabulary.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# or cross-validation split, so vocabulary and IDF weights see the held-out
# rows too -- consider fitting inside a Pipeline.
vectorizer = TfidfVectorizer(min_df=4, ngram_range=(1, 4))
X = vectorizer.fit_transform(data['tweet'].tolist())
X

<1552x16954 sparse matrix of type '<class 'numpy.float64'>'
	with 267299 stored elements in Compressed Sparse Row format>

In [46]:
# Toggle: True  -> text features (TF-IDF and its element-wise 2nd..4th powers)
#                  plus the two lag features;
#         False -> the two lag features only.
# NOTE: this cell overwrites X with a wider matrix, so re-running it without
# first re-running the TF-IDF cell stacks the features again.
with_trump = True
if with_trump:
    # Ask for CSR directly: hstack otherwise returns COO, which does not
    # support row indexing and has to be converted again for every fold.
    X = hstack([X, X.power(2), X.power(3), X.power(4), X_p], format='csr')
else:
    X = X_p

In [47]:
# 80/20 hold-out split. The fixed random_state makes the split, and anything
# computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, random_state=0)
print(X_train.shape)
print(X_test.shape)

(1241, 67818)
(311, 67818)


In [48]:
def create_model():
    """Build a fresh, unfitted NuSVC classifier.

    Returns
    -------
    sklearn.svm.NuSVC
        RBF-kernel classifier with nu=0.5 and libsvm logging switched off.
    """
    # `degree` is kept for parity with earlier experiments; scikit-learn only
    # uses it for the polynomial kernel.
    svc_options = dict(kernel='rbf', nu=0.5, degree=3, verbose=0)
    return NuSVC(**svc_options)

In [49]:
# 5-fold cross-validated precision, recall and accuracy of the model.
cv = 5
model = create_model()
# One cross_validate call fits each fold once and scores all three metrics on
# it; three separate cross_val_score calls would refit the model 15 times on
# the same folds to produce the same numbers.
cv_scores = cross_validate(model, X, Y, cv=cv, n_jobs=-1,
                           scoring=['precision', 'recall', 'accuracy'],
                           return_train_score=False)
precision = cv_scores['test_precision']
recall = cv_scores['test_recall']
print('\nprecision:',np.mean(precision))
print('recall:', np.mean(recall))
print('acc: %.4f'%(np.mean(cv_scores['test_accuracy'])))


precision: 0.723704423801
recall: 0.616592756837
acc: 0.6720


In [50]:
def show_result(X_test, y_test, clf=None):
    """Print the confusion matrix and classification report for a test set.

    Parameters
    ----------
    X_test : array-like or sparse matrix
        Feature rows to predict on.
    y_test : array-like
        True 0/1 labels for ``X_test``.
    clf : fitted estimator, optional
        Estimator to evaluate. When omitted, the notebook-level ``model``
        is used, as before.
    """
    estimator = model if clf is None else clf
    # Predict once and reuse the result for both reports.
    y_pred = estimator.predict(X_test)
    print(confusion_matrix(y_test, y_pred, labels=[0,1]))
    # Pin the label order so target_names[0] / target_names[1] always describe
    # classes 0 / 1, even if a class is missing from this particular test set.
    print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))

In [51]:
# Per-fold confusion matrix and classification report on shuffled,
# stratified folds.
# Sparse feature matrices are converted to CSR once, because row indexing by
# fold indices needs CSR; dense arrays are used as they are.
X_rows = X if isinstance(X, np.ndarray) else X.tocsr()
# Fixed seed so the shuffled folds, and the reports below, are reproducible.
skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(X_rows, Y):
    print ("Running Fold")
    # show_result() reads the notebook-level `model`, so rebind it per fold.
    model = create_model()
    model.fit(X_rows[train_idx], Y[train_idx])
    show_result(X_rows[test_idx], Y[test_idx])

Running Fold
[[ 96  50]
 [ 40 125]]
             precision    recall  f1-score   support

       down       0.71      0.66      0.68       146
         up       0.71      0.76      0.74       165

avg / total       0.71      0.71      0.71       311

Running Fold
[[ 97  49]
 [ 53 112]]
             precision    recall  f1-score   support

       down       0.65      0.66      0.66       146
         up       0.70      0.68      0.69       165

avg / total       0.67      0.67      0.67       311

Running Fold
[[ 82  64]
 [128  36]]
             precision    recall  f1-score   support

       down       0.39      0.56      0.46       146
         up       0.36      0.22      0.27       164

avg / total       0.37      0.38      0.36       310

Running Fold
[[102  44]
 [ 48 116]]
             precision    recall  f1-score   support

       down       0.68      0.70      0.69       146
         up       0.72      0.71      0.72       164

avg / total       0.70      0.70      0.70       3