In [1]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

In [2]:
tweets = pd.read_pickle('./Data/Cleaned/xrp_cleaned_date.pkl')

In [3]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25435 entries, 0 to 25434
Data columns (total 14 columns):
ID                  25435 non-null float64
datetime            25435 non-null datetime64[ns]
is_reply            25435 non-null float64
is_retweet          25435 non-null float64
nbr_favorite        25435 non-null float64
nbr_reply           25435 non-null float64
nbr_retweet         25435 non-null float64
text                25435 non-null object
url                 25435 non-null object
user_id             25435 non-null object
usernameTweet       25435 non-null object
has_media           8834 non-null float64
medias              8834 non-null object
rounded_dateTime    25435 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(7), object(5)
memory usage: 2.7+ MB


In [4]:
# URLS
tweets['text'] = tweets['text'].apply(lambda t : re.sub('https?://[A-Za-z0-9./]+',' ',t))

#Mentions
tweets['text'] = tweets['text'].apply(lambda t : re.sub(r'@[A-Za-z0-9]+',' ',t))

#Hashtags
tweets['text'] = tweets['text'].apply(lambda t : re.sub("[^a-zA-Z]", " ", t))

In [5]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [6]:
cleaned = tweets['text'].apply(lambda t : tweet_cleaner(t))

In [7]:
from textblob import TextBlob
 
class TwitterClient(object):
    '''
    Generic Twitter Class for sentiment analysis.
    '''
    def __init__(self):
        '''
        Class constructor or initialization method.
        '''
 
    def clean_tweet(self, tweet):
        '''
        Utility function to clean tweet text by removing links, special characters
        using simple regex statements.
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])"\
                                    "|(\w+:\/\/\S+)", " ", tweet).split())
 
    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify sentiment of passed tweet
        using textblob's sentiment method
        '''
        # create TextBlob object of passed tweet text
        analysis = TextBlob(self.clean_tweet(tweet))
        # set sentiment
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

In [8]:
tc = TwitterClient()

In [12]:
cleaned2 = pd.DataFrame(cleaned)

In [21]:
cleaned2['tc_cleaned'] = tweets['text'].apply(lambda t : tc.clean_tweet(t))

In [22]:
cleaned2['tc_sentiment'] = cleaned2['tc_cleaned'].apply(lambda t : tc.get_tweet_sentiment(t))

In [31]:
cleaned2.head(10)

Unnamed: 0,text,tc_cleaned,tc_sentiment
0,new post could ripple xrp really reach in days...,New post Could Ripple XRP Really Reach In Days...,positive
1,did you know that you can buy sell store conve...,Did you know that you can buy sell store conve...,negative
2,ripple usd ripple has changed by usd in mins l...,Ripple USD Ripple has changed by USD in mins L...,positive
3,batteries ito ico token tokensale invest crypt...,Batteries ITO ICO Token TokenSale Invest Crypt...,neutral
4,how is ripple different from all other cryptoc...,How Is Ripple Different From All Other Cryptoc...,negative
5,walmart and moneygram partnership could pump r...,Walmart And Moneygram Partnership Could Pump R...,neutral
6,this is not fake news look at our references w...,This is not fake news Look at our references W...,positive
7,top cryptocurrencies current prices btc bitcoi...,Top Cryptocurrencies Current Prices BTC Bitcoi...,positive
8,ripple price alert the last ask price for xrp ...,Ripple price alert The last ask price for xrp ...,neutral
9,xrp amp ripple the world s financial infrastru...,XRP amp Ripple The World s Financial Infrastru...,neutral


In [24]:
cleaned2.iloc[0]['text']

'new post could ripple xrp really reach in days has been published on https masscryptocurrency com p pic twitter com dlog tnqe'

In [25]:
cleaned2.iloc[1]['text']

'did you know that you can buy sell store convert and transfer xrp on http coindirect com in the uk eu countries australia kenya nigeria and south africa you can use fiat currency or convert other coins and exchange them for ripple'

In [26]:
cleaned2.iloc[3]['text']

'batteries ito ico token tokensale invest crypto cryptocurrency bitcoin ethereum ripple btc eth xrp pic twitter com wumors tcm'

In [28]:
cleaned2.iloc[2]['text']

'ripple usd ripple has changed by usd in mins live price https is gd hi oca ripple xrp cryptocurrency'

In [29]:
cleaned2.iloc[4]['text']

'how is ripple different from all other cryptocurrencies an ultimate guide reddit com r ripple comme nts d hr how is ripple different from all other ripple tron trx xrp eth btc xmr ltc litecoin etc eos neo xlm ada cardano nem iota lsk icx cryptonews redbux'

In [30]:
cleaned2.iloc[5]['text']

'walmart and moneygram partnership could pump ripple xrp https ethereumworldnews com walmart and mo neygram partnership could pump ripple xrp'