In [1]:
# utilities
import pandas as pd
import numpy as np
import re
import string
string.punctuation
# nltk
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# from nltk.stem import PorterStemmer

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read and Load the dataset

# Load the raw tweets, using the `created_at` column as a parsed datetime index.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent format inference is already the default.
tweets_df = pd.read_csv(
    'Eth_Twitter_Data.csv',
    index_col='created_at',
    parse_dates=True,
)

In [3]:
# Preview the first 10 rows of the freshly loaded frame.
tweets_df.head(10)

Unnamed: 0_level_0,tweet
created_at,Unnamed: 1_level_1
2013-01-30 17:07:29,Attention ye who have not children: Chooseth w...
2013-09-05 19:35:25,Unite knows something we dont... something bet...
2013-10-02 23:17:18,ok i have to ask... why do people favorite twe...
2013-10-13 16:37:17,M(eth)iley C(rack)yrus
2013-10-23 20:24:45,get a sled and ride it out homie
2013-10-23 20:26:11,tell me i didnt just hit methodz with some dee...
2013-12-08 17:41:48,. if you get a million ill start smoking
2014-01-12 23:53:00,give it a month and it will all go away. Shoul...
2014-01-23 12:05:09,we're about to get roasted and people might no...
2014-01-26 22:32:51,tK would be needing 1


In [4]:
# (rows, columns) of the frame as loaded.
tweets_df.shape

(86336, 1)

In [5]:
# Column dtypes of the loaded frame.
tweets_df.dtypes

tweet    object
dtype: object

In [6]:
# Count missing values per column.
tweets_df.isnull().sum()

tweet    0
dtype: int64

In [7]:
# Discard any rows that contain missing values.
tweets_df = tweets_df.dropna()

In [8]:
# Re-check the shape after dropping rows with missing values.
tweets_df.shape

(86336, 1)

In [9]:
# Remove duplicated rows, then report how many rows remain.
tweets_df = tweets_df.drop_duplicates()
len(tweets_df)

78533

In [10]:
# function to clean the tweets
def cleanTwt(twt):
    """Normalise a tweet for text analysis.

    Steps, in order:
      1. replace URLs (``http://``, ``https://`` or ``www.`` prefixed) with a space,
      2. replace ``@mentions`` (letters, digits, underscores) with a space,
      3. collapse every run of non-alphanumeric characters into a single space
         (this also covers newlines and tabs),
      4. lowercase the result.

    Parameters
    ----------
    twt : str
        Tweet text.

    Returns
    -------
    str
        The cleaned, lowercased text. Leading/trailing spaces produced by the
        substitutions are not stripped.
    """
    # URLs and mentions must be removed while ':', '/', '.' and '@' are still
    # present; once the generic non-alphanumeric pass has run they can no
    # longer be recognised.
    twt = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', twt)
    twt = re.sub(r'@[A-Za-z0-9_]+', ' ', twt)
    twt = re.sub(r'[^A-Za-z0-9]+', ' ', twt)
    return twt.lower()

In [11]:
#defining the function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [12]:
# Store the punctuation-free text.
# URLs and @mentions are stripped first: remove_punctuation deletes '@', ':',
# '/' and '.', after which they can no longer be recognised (for example
# 'https://t.co/x' would collapse to 'httpstcox' and stay in the text).
tweets_df['cleaned_tweets'] = (
    tweets_df['tweet']
    .str.replace(r'https?://\S+|www\.\S+', ' ', regex=True)
    .str.replace(r'@\w+', ' ', regex=True)
    .apply(remove_punctuation)
)

In [13]:
# Compare the raw tweets with their punctuation-stripped versions.
tweets_df.head(10)

Unnamed: 0_level_0,tweet,cleaned_tweets
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-30 17:07:29,Attention ye who have not children: Chooseth w...,Attention ye who have not children Chooseth wi...
2013-09-05 19:35:25,Unite knows something we dont... something bet...,Unite knows something we dont something better...
2013-10-02 23:17:18,ok i have to ask... why do people favorite twe...,ok i have to ask why do people favorite tweets...
2013-10-13 16:37:17,M(eth)iley C(rack)yrus,Methiley Crackyrus
2013-10-23 20:24:45,get a sled and ride it out homie,get a sled and ride it out homie
2013-10-23 20:26:11,tell me i didnt just hit methodz with some dee...,tell me i didnt just hit methodz with some dee...
2013-12-08 17:41:48,. if you get a million ill start smoking,if you get a million ill start smoking
2014-01-12 23:53:00,give it a month and it will all go away. Shoul...,give it a month and it will all go away Should...
2014-01-23 12:05:09,we're about to get roasted and people might no...,were about to get roasted and people might not...
2014-01-26 22:32:51,tK would be needing 1,tK would be needing 1


In [14]:
# Run cleanTwt over the punctuation-free text, overwriting the column in place.
punctuation_free = tweets_df['cleaned_tweets']
tweets_df['cleaned_tweets'] = punctuation_free.map(cleanTwt)

In [15]:
# Inspect the result of applying cleanTwt.
tweets_df.head(10)

Unnamed: 0_level_0,tweet,cleaned_tweets
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-30 17:07:29,Attention ye who have not children: Chooseth w...,attention ye who have not children chooseth wi...
2013-09-05 19:35:25,Unite knows something we dont... something bet...,unite knows something we dont something better...
2013-10-02 23:17:18,ok i have to ask... why do people favorite twe...,ok i have to ask why do people favorite tweets...
2013-10-13 16:37:17,M(eth)iley C(rack)yrus,methiley crackyrus
2013-10-23 20:24:45,get a sled and ride it out homie,get a sled and ride it out homie
2013-10-23 20:26:11,tell me i didnt just hit methodz with some dee...,tell me i didnt just hit methodz with some dee...
2013-12-08 17:41:48,. if you get a million ill start smoking,if you get a million ill start smoking
2014-01-12 23:53:00,give it a month and it will all go away. Shoul...,give it a month and it will all go away should...
2014-01-23 12:05:09,we're about to get roasted and people might no...,were about to get roasted and people might not...
2014-01-26 22:32:51,tK would be needing 1,tk would be needing 1


In [16]:
# Create a function to get the sentiment text
def getSentiment(score):
    """Map a numeric polarity score to a sentiment label.

    Scores below zero are 'Negative', a score equal to zero is 'Neutral',
    and every other score is 'Positive'.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

In [17]:
# Create a function to get the subjectivity
def getSubjectivity(twt):
    """Return the subjectivity score TextBlob assigns to ``twt``."""
    blob_sentiment = TextBlob(twt).sentiment
    return blob_sentiment.subjectivity

# Create a function to get the polarity
def getPolarity(twt):
    """Return the polarity score TextBlob assigns to ``twt``."""
    blob_sentiment = TextBlob(twt).sentiment
    return blob_sentiment.polarity

In [18]:
# Create two new columns called 'subjectivity' and 'polarity'.
# Each tweet is analysed once and both scores are read from that single result,
# rather than constructing a separate TextBlob per column.
tweet_sentiments = [TextBlob(twt).sentiment for twt in tweets_df['cleaned_tweets']]
tweets_df['subjectivity'] = [sentiment.subjectivity for sentiment in tweet_sentiments]
tweets_df['polarity'] = [sentiment.polarity for sentiment in tweet_sentiments]

In [19]:
# Inspect the new subjectivity and polarity columns.
tweets_df.head(10)

Unnamed: 0_level_0,tweet,cleaned_tweets,subjectivity,polarity
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-30 17:07:29,Attention ye who have not children: Chooseth w...,attention ye who have not children chooseth wi...,0.9,0.7
2013-09-05 19:35:25,Unite knows something we dont... something bet...,unite knows something we dont something better...,0.416667,0.375
2013-10-02 23:17:18,ok i have to ask... why do people favorite twe...,ok i have to ask why do people favorite tweets...,0.75,0.1875
2013-10-13 16:37:17,M(eth)iley C(rack)yrus,methiley crackyrus,0.0,0.0
2013-10-23 20:24:45,get a sled and ride it out homie,get a sled and ride it out homie,0.0,0.0
2013-10-23 20:26:11,tell me i didnt just hit methodz with some dee...,tell me i didnt just hit methodz with some dee...,0.6,-0.1
2013-12-08 17:41:48,. if you get a million ill start smoking,if you get a million ill start smoking,1.0,-0.5
2014-01-12 23:53:00,give it a month and it will all go away. Shoul...,give it a month and it will all go away should...,0.0,0.0
2014-01-23 12:05:09,we're about to get roasted and people might no...,were about to get roasted and people might not...,0.0,0.0
2014-01-26 22:32:51,tK would be needing 1,tk would be needing 1,0.0,0.0


In [20]:
# Derive the text sentiment label of each tweet from its polarity score.
tweets_df['sentiment'] = tweets_df['polarity'].map(getSentiment)

In [21]:
# Inspect the derived sentiment labels.
tweets_df.head(10)

Unnamed: 0_level_0,tweet,cleaned_tweets,subjectivity,polarity,sentiment
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-30 17:07:29,Attention ye who have not children: Chooseth w...,attention ye who have not children chooseth wi...,0.9,0.7,Positive
2013-09-05 19:35:25,Unite knows something we dont... something bet...,unite knows something we dont something better...,0.416667,0.375,Positive
2013-10-02 23:17:18,ok i have to ask... why do people favorite twe...,ok i have to ask why do people favorite tweets...,0.75,0.1875,Positive
2013-10-13 16:37:17,M(eth)iley C(rack)yrus,methiley crackyrus,0.0,0.0,Neutral
2013-10-23 20:24:45,get a sled and ride it out homie,get a sled and ride it out homie,0.0,0.0,Neutral
2013-10-23 20:26:11,tell me i didnt just hit methodz with some dee...,tell me i didnt just hit methodz with some dee...,0.6,-0.1,Negative
2013-12-08 17:41:48,. if you get a million ill start smoking,if you get a million ill start smoking,1.0,-0.5,Negative
2014-01-12 23:53:00,give it a month and it will all go away. Shoul...,give it a month and it will all go away should...,0.0,0.0,Neutral
2014-01-23 12:05:09,we're about to get roasted and people might no...,were about to get roasted and people might not...,0.0,0.0,Neutral
2014-01-26 22:32:51,tK would be needing 1,tk would be needing 1,0.0,0.0,Neutral


In [22]:
# sentiment confidence?
# bag of words?
# stemming/lemmatization on tweets?
# get dummies for 

# Easy classifier
# Random forest
# Multinomial NB (Naive Bayes) --> supervised learning that works well with text based data

# final -- create a bot for the buy/sell signal

In [23]:
# Convert the sentiments into numbers for representation
# negative = 0, neutral = 1, positive = 2

# A label's position in this list is its numeric score.
sentiment_ordering = ['Negative', 'Neutral', 'Positive']
sentiment_labels = tweets_df['sentiment']
tweets_df['sentiment_score'] = sentiment_labels.map(sentiment_ordering.index)

# Show the last 10 rows together with their encoded sentiment.
tweets_df.tail(10)

Unnamed: 0_level_0,tweet,cleaned_tweets,subjectivity,polarity,sentiment,sentiment_score
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-02-10 15:28:47,"End/ But, to me, $ETH is simply a pipeline and...",end but to me eth is simply a pipeline and lin...,0.714286,0.125,Positive,2
2021-02-10 15:30:00,applications incubator has completed the trans...,applications incubator has completed the trans...,0.0,0.0,Neutral,1
2021-02-10 15:30:52,"Someone just lost $69,261 (~39.48 ETH)! 69,260...",someone just lost 69261 3948 eth 69260502 usdt...,0.5,0.0,Neutral,1
2021-02-10 15:35:14,eth is so done,eth is so done,0.0,0.0,Neutral,1
2021-02-10 15:35:34,Another piece of a piece I’ve been waiting to ...,another piece of a piece i ve been waiting to ...,0.55,0.5,Positive,2
2021-02-10 15:35:45,Someone has a buy order right now for 800K+ $D...,someone has a buy order right now for 800k drg...,0.535714,0.285714,Positive,2
2021-02-10 15:44:25,LAST public service announcement I'll make bef...,last public service announcement ill make befo...,0.505556,-0.1875,Negative,0
2021-02-10 15:44:25,It is ~1/100th the value of $eth. I won't ment...,it is 1100th the value of eth i wont mention i...,0.533333,0.0,Neutral,1
2021-02-10 15:46:05,"Complaints about $AVAX, a 6 month(!!) old proj...",complaints about avax a 6 month old project ha...,0.53125,0.0625,Positive,2
2021-02-10 15:48:27,One of the reasons that $ORN is a magnificent ...,one of the reasons that orn is a magnificent p...,0.758333,0.616667,Positive,2


In [None]:
# cv = CountVectorizer(max_features=3000)
# X = cv.fit_transform(tweets_df['cleaned_tweets']).toarray()
# y = tweets_df[['sentiment_score']]

In [None]:
# model = MultinomialNB()

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# model.fit(X_train, y_train)

In [None]:
# y_pred = model.predict(X_test)

In [None]:
# cf = classification_report(y_test, y_pred)
# print(cf)

In [None]:
# cf_acc = accuracy_score(y_pred, y_test)
# print("Test Accuracy: {:.2f}%".format(cf_acc*100))