In [57]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
import time

In [30]:
tweets = pd.read_pickle('./Data/Cleaned/xrp_cleaned_date.pkl')

In [31]:
priceDelta = pd.read_pickle('./Data/Cleaned/xrp_delta.pkl')

In [35]:
tweets['timeId'] = tweets['rounded_dateTime'].apply(lambda t : time.mktime(t.timetuple()))

In [28]:
tweets['rounded_dateTime'].describe()

count                   25435
unique                    267
top       2018-04-16 13:00:00
freq                      221
first     2018-04-13 08:00:00
last      2018-04-24 10:00:00
Name: rounded_dateTime, dtype: object

In [36]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25435 entries, 0 to 25434
Data columns (total 15 columns):
ID                  25435 non-null float64
datetime            25435 non-null datetime64[ns]
is_reply            25435 non-null float64
is_retweet          25435 non-null float64
nbr_favorite        25435 non-null float64
nbr_reply           25435 non-null float64
nbr_retweet         25435 non-null float64
text                25435 non-null object
url                 25435 non-null object
user_id             25435 non-null object
usernameTweet       25435 non-null object
has_media           8834 non-null float64
medias              8834 non-null object
rounded_dateTime    25435 non-null datetime64[ns]
timeId              25435 non-null float64
dtypes: datetime64[ns](2), float64(8), object(5)
memory usage: 2.9+ MB


In [37]:
tweets = pd.merge(tweets, priceDelta, how='left', on='timeId')

In [39]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25435 entries, 0 to 25434
Data columns (total 17 columns):
ID                  25435 non-null float64
datetime            25435 non-null datetime64[ns]
is_reply            25435 non-null float64
is_retweet          25435 non-null float64
nbr_favorite        25435 non-null float64
nbr_reply           25435 non-null float64
nbr_retweet         25435 non-null float64
text                25435 non-null object
url                 25435 non-null object
user_id             25435 non-null object
usernameTweet       25435 non-null object
has_media           8834 non-null float64
medias              8834 non-null object
rounded_dateTime    25435 non-null datetime64[ns]
timeId              25435 non-null float64
time                25435 non-null datetime64[ns]
pDelta              25435 non-null float64
dtypes: datetime64[ns](3), float64(9), object(5)
memory usage: 3.5+ MB


In [40]:
tweets['rounded_dateTime'].describe(include='all')

count                   25435
unique                    267
top       2018-04-16 13:00:00
freq                      221
first     2018-04-13 08:00:00
last      2018-04-24 10:00:00
Name: rounded_dateTime, dtype: object

In [41]:
# URLS
tweets['text'] = tweets['text'].apply(lambda t : re.sub('https?://[A-Za-z0-9./]+',' ',t))

#Mentions
tweets['text'] = tweets['text'].apply(lambda t : re.sub(r'@[A-Za-z0-9]+',' ',t))

#Hashtags
tweets['text'] = tweets['text'].apply(lambda t : re.sub("[^a-zA-Z]", " ", t))

In [42]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [43]:
tweets['text'] = tweets['text'].apply(lambda t : tweet_cleaner(t))

In [44]:
from textblob import TextBlob
 
class TwitterClient(object):
    '''
    Generic Twitter Class for sentiment analysis.
    '''
    def __init__(self):
        '''
        Class constructor or initialization method.
        '''
 
    def clean_tweet(self, tweet):
        '''
        Utility function to clean tweet text by removing links, special characters
        using simple regex statements.
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])"\
                                    "|(\w+:\/\/\S+)", " ", tweet).split())
 
    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify sentiment of passed tweet
        using textblob's sentiment method
        '''
        # create TextBlob object of passed tweet text
        analysis = TextBlob(self.clean_tweet(tweet))
        # set sentiment
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

In [46]:
tc = TwitterClient()

In [47]:
cleaned2 = pd.DataFrame(tweets)

In [48]:
cleaned2['tc_cleaned'] = tweets['text'].apply(lambda t : tc.clean_tweet(t))

In [49]:
cleaned2['tc_sentiment'] = cleaned2['tc_cleaned'].apply(lambda t : tc.get_tweet_sentiment(t))

In [50]:
cleaned2.head(10)

Unnamed: 0,ID,datetime,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet,has_media,medias,rounded_dateTime,timeId,time,pDelta,tc_cleaned,tc_sentiment
0,9.885302e+17,2018-04-23 17:29:08,0.0,0.0,0.0,0.0,0.0,new post could ripple xrp really reach in days...,/ExcelVisibility/status/988530235493748736,815322320,ExcelVisibility,,,2018-04-23 18:00:00,1524521000.0,2018-04-23 18:00:00,0.002178,new post could ripple xrp really reach in days...,positive
1,9.847927e+17,2018-04-13 09:57:41,0.0,0.0,0.0,0.0,0.0,did you know that you can buy sell store conve...,/coindirectcom/status/984792745666580481,915453520924545025,coindirectcom,1.0,[https://t.co/h1CTJHGwPs],2018-04-13 10:00:00,1523628000.0,2018-04-13 10:00:00,-0.004874,did you know that you can buy sell store conve...,negative
2,9.8638e+17,2018-04-17 19:05:02,0.0,0.0,0.0,0.0,0.0,ripple usd ripple has changed by usd in mins l...,/RippleMarket/status/986380040517701633,848989770923601922,RippleMarket,,,2018-04-17 20:00:00,1524010000.0,2018-04-17 20:00:00,0.00733,ripple usd ripple has changed by usd in mins l...,positive
3,9.85341e+17,2018-04-14 22:16:19,0.0,0.0,0.0,0.0,1.0,batteries ito ico token tokensale invest crypt...,/nsdelpitiya/status/985341014339608576,880003387672088576,nsdelpitiya,,,2018-04-14 23:00:00,1523761000.0,2018-04-14 23:00:00,-0.003548,batteries ito ico token tokensale invest crypt...,neutral
4,9.866047e+17,2018-04-18 09:57:42,0.0,0.0,0.0,0.0,0.0,how is ripple different from all other cryptoc...,/Matthix191/status/986604686856933376,916641996823031808,Matthix191,1.0,[https://t.co/xF5l49lz8P],2018-04-18 10:00:00,1524060000.0,2018-04-18 10:00:00,-0.014478,how is ripple different from all other cryptoc...,negative
5,9.852602e+17,2018-04-14 16:55:20,0.0,0.0,0.0,0.0,0.0,walmart and moneygram partnership could pump r...,/vandecrypto/status/985260236238934018,943489871758405632,vandecrypto,1.0,[https://t.co/LxwR2chVOy],2018-04-14 17:00:00,1523740000.0,2018-04-14 17:00:00,-0.003227,walmart and moneygram partnership could pump r...,neutral
6,9.860333e+17,2018-04-16 20:07:13,1.0,0.0,0.0,1.0,0.0,this is not fake news look at our references w...,/sniperstube/status/986033300136316928,921966736303362048,sniperstube,,,2018-04-16 21:00:00,1523927000.0,2018-04-16 21:00:00,-0.004528,this is not fake news look at our references w...,positive
7,9.865902e+17,2018-04-18 09:00:00,0.0,0.0,1.0,0.0,0.0,top cryptocurrencies current prices btc bitcoi...,/CryptoGulp/status/986590166381887488,945716745192574977,CryptoGulp,,,2018-04-18 10:00:00,1524060000.0,2018-04-18 10:00:00,-0.014478,top cryptocurrencies current prices btc bitcoi...,positive
8,9.873616e+17,2018-04-20 12:05:20,0.0,0.0,1.0,0.0,0.0,ripple price alert the last ask price for xrp ...,/ripplebot_cs/status/987361585122480128,944577421008875521,ripplebot_cs,,,2018-04-20 13:00:00,1524244000.0,2018-04-20 13:00:00,0.000232,ripple price alert the last ask price for xrp ...,neutral
9,9.851089e+17,2018-04-14 06:54:01,0.0,0.0,1.0,0.0,0.0,xrp amp ripple the world s financial infrastru...,/btc_current/status/985108911312093184,952620879443320832,btc_current,1.0,[https://t.co/dKr58pbYhj],2018-04-14 07:00:00,1523704000.0,2018-04-14 07:00:00,-0.000314,xrp amp ripple the world s financial infrastru...,neutral


In [51]:
preds_df = cleaned2[['pDelta', 'tc_sentiment']]

In [53]:
preds_df.describe()

Unnamed: 0,pDelta
count,25435.0
mean,0.001191
std,0.013683
min,-0.059803
25%,-0.005436
50%,0.000677
75%,0.007194
max,0.059349


In [59]:
def deltaConvert(d):
    if d > 0.005:
        return 'positive'
    elif d < -0.005:
        return 'negative'
    else:
        return 'neutral'

In [60]:
preds_df['actual'] = preds_df['pDelta'].apply(lambda p : deltaConvert(p))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [61]:
print(classification_report(preds_df['actual'], preds_df['tc_sentiment']))

             precision    recall  f1-score   support

   negative       0.24      0.07      0.10      6616
    neutral       0.42      0.52      0.47     10619
   positive       0.33      0.42      0.37      8200

avg / total       0.35      0.37      0.34     25435

