In [1]:
import pandas as pd
import string
import numpy as np

from sklearn import svm
from sklearn.feature_extraction import DictVectorizer
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import bigrams
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [2]:
class Classifier(object):
    '''
    Bag-of-words classifier that predicts the hourly price direction from tweets.

    Intended call order: ``load_data`` -> optional ``trim`` / ``shift`` ->
    ``fit`` -> ``train_score`` / ``validate`` / ``predict``.

    Attributes populated by the methods:
        x   -- list of per-hour feature dicts (token -> count), set by load_data
        y   -- list of per-hour labels (1, -1 or 0), aligned with ``x``
        X   -- vectorized feature matrix built from ``x`` by fit
        clf -- the estimator trained by the most recent call to fit
    '''

    # Algorithm names accepted by fit().
    _ALGORITHMS = ('SVM', 'NN')

    def __init__(self):
        '''
        Class constructor or initialization method.

        All state starts out as None; it is filled in by load_data and fit.
        '''
        self.x = None
        self.y = None
        self.X = None
        self.clf = None

    def fit(self, algorithm='SVM', c = 1.0, kernel='rbf', random_state=None):
        '''
        Vectorize the loaded features and train an estimator on them.

        Parameters:
            algorithm    -- 'SVM' (support vector classifier trained on all
                            data) or 'NN' (multi-layer perceptron trained on a
                            random train split; a confusion matrix and a
                            classification report for the held-out split are
                            printed).
            c            -- SVM regularization parameter C (SVM only).
            kernel       -- SVM kernel name (SVM only).
            random_state -- optional seed for the train/test split and the MLP
                            initialization (NN only); None keeps the previous
                            unseeded behaviour.

        Raises:
            ValueError -- if ``algorithm`` is not one of the supported names.
        '''
        # Validate first so a typo cannot silently leave a stale model in place.
        if algorithm not in self._ALGORITHMS:
            raise ValueError(
                "Unknown algorithm {!r}; expected one of {}".format(
                    algorithm, self._ALGORITHMS))

        vectorizer = DictVectorizer()
        self.X = vectorizer.fit_transform(self.x)

        if algorithm == 'SVM':
            self.clf = svm.SVC(C=c, kernel=kernel)
            self.clf.fit(self.X, self.y)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.y, random_state=random_state)
            # Features are deliberately left unscaled (raw token counts).
            mlp = MLPClassifier(hidden_layer_sizes=(30,30,30),
                                random_state=random_state)
            mlp.fit(X_train, y_train)
            predictions = mlp.predict(X_test)
            print(confusion_matrix(y_test, predictions))
            print(classification_report(y_test, predictions))
            # Keep the trained network so train_score/validate/predict use it.
            self.clf = mlp

    def predict(self):
        '''
        Print "<prediction> <label>" for every training sample.

        Uses the estimator trained by fit() as-is; it is not refitted here.
        '''
        predictions = self.clf.predict(self.X)
        # Iterating the column vector yields one-element arrays, so each
        # prediction is printed in the "[value]" form.
        for predicted, actual in zip(predictions.reshape(-1, 1), self.y):
            print("{} {}".format(predicted, actual))

    def validate(self, splits=10, test=0.1, random_state=None):
        '''
        Return the mean cross-validation score over random shuffle splits.

        Parameters:
            splits       -- number of shuffle/split iterations.
            test         -- fraction of samples held out in each iteration.
            random_state -- optional seed for the splitter; None keeps the
                            previous unseeded behaviour.
        '''
        cv = ShuffleSplit(n_splits=splits, test_size=test,
                          random_state=random_state)
        scores = cross_val_score(self.clf, self.X, self.y, cv=cv)
        return scores.mean()

    def train_score(self):
        '''
        Return the fitted estimator's score on the full vectorized data set.
        '''
        return self.clf.score(self.X, self.y)

    def load_data(self, tweets, prices, threshold=0):
        '''
        Build per-hour token counts and price-direction labels.

        Parameters:
            tweets    -- DataFrame with a 'text' column and a
                         'rounded_dateTime' column naming the hour each tweet
                         belongs to.
            prices    -- DataFrame with 'time', 'open' and 'close' columns.
                         A 'grew' column is added to it in place.
            threshold -- relative move required to count as up/down: label 1
                         if close > open*(1+threshold), -1 if
                         close < open*(1-threshold), otherwise 0.

        Afterwards ``self.x`` and ``self.y`` are lists ordered by hour.

        Raises:
            KeyError -- if a tweet's 'rounded_dateTime' has no row in prices.
        '''
        # Check market change
        prices['grew'] = np.select(
            [prices['close'] > prices['open'] * (1 + threshold),
             prices['close'] < prices['open'] * (1 - threshold)],
            [1, -1], default=0)

        # One pass over the price rows; the first row wins for a repeated hour.
        features = {}
        labels = {}
        for hour, grew in zip(prices['time'], prices['grew']):
            if hour not in labels:
                features[hour] = defaultdict(int)
                labels[hour] = grew

        wnl = WordNetLemmatizer()
        stopset = set(stopwords.words('english'))
        ignore = str.maketrans('', '', string.punctuation + string.digits)
        for _, row in tweets.iterrows():
            counts = features[row['rounded_dateTime']]
            tokens = word_tokenize(row['text'].translate(ignore))
            tokens = [wnl.lemmatize(token.lower()) for token in tokens]
            tokens = [w for w in tokens if w not in stopset]
            # Unigrams count at most once per tweet ...
            for word in set(tokens):
                counts[word] += 1
            # ... while every bigram occurrence counts; the pair is keyed by
            # plain concatenation of its two tokens.
            for first, second in bigrams(tokens):
                counts[first + second] += 1

        ordered_hours = sorted(features)
        self.x = [features[hour] for hour in ordered_hours]
        self.y = [labels[hour] for hour in ordered_hours]

    def trim(self):
        '''
        Drop every hour that has no features (no tweets), with its label.
        '''
        kept = [(feats, label) for feats, label in zip(self.x, self.y)
                if len(feats) > 0]
        self.x = [feats for feats, _ in kept]
        self.y = [label for _, label in kept]

    def shift(self, shift):
        '''
        Pair each hour's features with the label ``shift`` positions later.

        Drops the last ``shift`` feature rows and the first ``shift`` labels.
        ``shift`` is expected to be a non-negative integer.
        '''
        self.x = self.x[:len(self.x)-shift]
        self.y = self.y[shift:]

In [9]:
# Load the cleaned tweet and hourly price tables; paths are relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files from a
# trusted source.
# Classifier.load_data reads tweets['text'] / tweets['rounded_dateTime'] and
# prices['time'] / prices['open'] / prices['close'] from these frames.
tweets_df = pd.read_pickle('Data/Cleaned/tweets_prices.pkl')
price_df = pd.read_pickle('Data/Cleaned/xrp_price_data_hourly.pkl')

In [10]:
# Build the per-hour features and labels. With threshold=0.00 any hour whose
# close differs from its open is labelled 1 (up) or -1 (down); equal -> 0.
classifier = Classifier()
classifier.load_data(tweets_df, price_df, threshold=0.00)
# Drop hours that ended up with no tokens at all.
classifier.trim()

In [11]:
# Train the MLP variant; fit() prints a confusion matrix and a classification
# report for a random held-out split (unseeded, so numbers vary between runs).
classifier.fit(algorithm='NN')

[[19  6]
 [29 13]]
             precision    recall  f1-score   support

         -1       0.40      0.76      0.52        25
          1       0.68      0.31      0.43        42

avg / total       0.58      0.48      0.46        67



In [12]:
# Train the default SVM (rbf kernel) with C=2.0 on all loaded hours, then
# print its score on the training data followed by the mean score over 20
# random shuffle splits (10% held out each time, unseeded).
classifier.fit(c=2.0)
print(classifier.train_score())
#classifier.predict()
print(classifier.validate(splits=20))

0.599250936329588
0.5462962962962963
