# Collecting Twitter Sentiment Data

### Import Libraries & Setup Working Directories

<p><font size="+2" color="red"> ! Runs for cca. 10-15 mins ! ... </font></p>

In [1]:
import pandas as pd
import os
import pathlib
from scipy import stats
from textblob import TextBlob #Textblob is built on NLTK and pattern (e.g. for the sentiment analysis)
from textblob.sentiments import NaiveBayesAnalyzer

In [2]:
# Setup the project directory
workdir = pathlib.Path().absolute()
project_dir = workdir.parent
int_data_path = f"{project_dir}\\00_Data\\04_Twitter_Data\\"
ext_data_path = f"{project_dir}\\00_Data\\"
print(int_data_path)
print(ext_data_path)

C:\Users\akosr\CAS_DAENG\Modul_02\crypto_project_ML\00_Data\04_Twitter_Data\
C:\Users\akosr\CAS_DAENG\Modul_02\crypto_project_ML\00_Data\


### Define transformer functions

In [3]:
def apply_blob(sentence):
    temp = TextBlob(sentence).sentiment[0]
    if temp == 0.0:
        return 0.0 # Neutral
    elif temp >= 0.0:
        return 1.0 # Positive
    else:
        return -1.0 # Negative

In [4]:
def col_encode(col):
    if col>0:
        return 1.0
    elif col<0:
        return -1.0
    else:
        return 0.0

In [5]:
def col_encode_v2(col):
    if col>0.1:
        return 1.0
    elif col<0:
        return -1.0
    else:
        return 0.0

## Elon Musk Tweets between 2014 - 2021

### Collect Data

In [6]:
df_elon = pd.read_excel(int_data_path+'Elon_Tweets_2014_2021.xlsx', sheet_name="bitcoin_elonuser")

In [7]:
df_elon.shape

(12264, 7)

In [8]:
df_elon.head()

Unnamed: 0,id,user_id,username,tweet,language,likes_count,Datetime
0,1406450046472622080,44196397,elonmusk,@teslaownersSV Yeah,en,13033,2021-06-20 05:13:16
1,1406073484300591104,44196397,elonmusk,@weddleandsons @Tesla Nice work,en,16615,2021-06-19 04:16:56
2,1406071203333251072,44196397,elonmusk,@RationalEtienne 🤣🤣,und,14705,2021-06-19 04:07:53
3,1406030454499594240,44196397,elonmusk,@udonandtempura @Tesla Haha pretty much,en,3714,2021-06-19 01:25:57
4,1406030294621167616,44196397,elonmusk,@Erdayastronaut This is epic. Very impressive ...,en,16149,2021-06-19 01:25:19


### Derive sentiment using TextBlob

In [9]:
df_elon['polarity_score'] = df_elon['tweet'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)

In [10]:
df_elon['subjectivity_score'] = df_elon['tweet'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)

In [11]:
df_elon['sentiment_score'] = df_elon['tweet'].apply(lambda tweet: apply_blob(tweet))

In [12]:
df_elon.tail()

Unnamed: 0,id,user_id,username,tweet,language,likes_count,Datetime,polarity_score,subjectivity_score,sentiment_score
12259,427371574451507200,44196397,elonmusk,Tesla Supercharger network now energized from ...,en,648,2014-01-26 10:24:58,0.170455,0.454545,1.0
12260,426454433916940288,44196397,elonmusk,Tesla policy is to charge the same price (+ ta...,en,532,2014-01-23 21:40:34,0.0,0.125,0.0
12261,420353814579802112,44196397,elonmusk,Rough cut of Falcon 9 Thaicom flight http://t...,en,286,2014-01-07 01:38:53,-0.1,0.4,-1.0
12262,420330200186880000,44196397,elonmusk,@westcoastbill Thanks Bill! Sure is a great st...,en,14,2014-01-07 00:05:03,0.5125,0.709722,1.0
12263,420269918005628928,44196397,elonmusk,Preparing to launch a Thaicom satellite to geo...,en,421,2014-01-06 20:05:31,0.0,0.0,0.0


In [13]:
df_elon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12264 entries, 0 to 12263
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  12264 non-null  int64         
 1   user_id             12264 non-null  int64         
 2   username            12264 non-null  object        
 3   tweet               12264 non-null  object        
 4   language            12264 non-null  object        
 5   likes_count         12264 non-null  int64         
 6   Datetime            12264 non-null  datetime64[ns]
 7   polarity_score      12264 non-null  float64       
 8   subjectivity_score  12264 non-null  float64       
 9   sentiment_score     12264 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(3), object(3)
memory usage: 958.2+ KB


In [14]:
df_elon.isna().sum()

id                    0
user_id               0
username              0
tweet                 0
language              0
likes_count           0
Datetime              0
polarity_score        0
subjectivity_score    0
sentiment_score       0
dtype: int64

### Clean the Data and prepare for merge

#### Transforming to Daily Scores, using mean

In [15]:
df_elon_daily = df_elon.loc[:, ['Datetime', 'polarity_score']]

In [16]:
df_elon_daily['Date'] = pd.to_datetime(df_elon_daily['Datetime']).dt.date

In [17]:
df_elon_daily = df_elon_daily.groupby('Date').agg({'polarity_score': 'median'})

In [18]:
# Encoding to 1-0-(-1)
df_elon_daily['sentiment_score'] = df_elon_daily['polarity_score'].apply(lambda col: col_encode(col))

In [19]:
df_elon_daily['sentiment_score'].value_counts()

 1.0    871
 0.0    814
-1.0    112
Name: sentiment_score, dtype: int64

In [20]:
df_elon_daily.reset_index(inplace=True)

In [21]:
df_elon_daily.shape

(1797, 3)

#### Transforming to daily scores, using mode

In [22]:
df_elon_daily_mod = df_elon.loc[:, ['Datetime', 'polarity_score']]

In [23]:
df_elon_daily_mod['Date'] = pd.to_datetime(df_elon_daily_mod['Datetime']).dt.date

In [24]:
df_elon_daily_mod = pd.DataFrame(df_elon_daily_mod.groupby('Date').agg(pd.Series.mode))

In [25]:
df_elon_daily_mod = df_elon_daily_mod.explode('polarity_score')

In [26]:
# Encoding to 1-0-(-1)
df_elon_daily_mod['sentiment_score'] = df_elon_daily_mod['polarity_score'].apply(lambda col: col_encode(col))

In [27]:
df_elon_daily_mod['sentiment_score'].value_counts()

 0.0    1376
 1.0    1129
-1.0     310
Name: sentiment_score, dtype: int64

In [28]:
df_elon_daily_mod.shape

(2815, 3)

### Save to file & check

In [29]:
df_elon_daily.to_csv(ext_data_path+'elon_tweet_sentiment.csv', index=False)

In [30]:
# Load back & check
df_elon_check = pd.read_csv(ext_data_path+'elon_tweet_sentiment.csv')

In [31]:
df_elon_check.head()

Unnamed: 0,Date,polarity_score,sentiment_score
0,2014-01-06,0.0,0.0
1,2014-01-07,0.20625,1.0
2,2014-01-23,0.0,0.0
3,2014-01-26,0.0,0.0
4,2014-01-30,0.0,0.0


In [32]:
df_elon_check['sentiment_score'].value_counts()

 1.0    871
 0.0    814
-1.0    112
Name: sentiment_score, dtype: int64

## Bitcoin-related Tweets from verified users between 2014 - 2021

In [33]:
df_bitcoin = pd.read_excel(int_data_path+'Bitcoin_Tweets_2014_2021.xlsx', sheet_name="bitcoin_tweets_verified")

In [34]:
df_bitcoin.head()

Unnamed: 0,id,user_id,username,tweet,language,likes_count,Datetime
0,1406638837032439808,34713362,business,Bitcoin dropped over the weekend amid a focus ...,en,15,2021-06-20 17:43:27
1,1406638325595844608,14104994,nskinsella,@SUSY_Sugra ha. right. Bitcoin fixes libertari...,en,0,2021-06-20 17:41:25
2,1406636435617161216,17673635,livemint,Bitcoin is trading at about half its record hi...,en,5,2021-06-20 17:33:54
3,1406636226610974720,955744720092835840,wazirxindia,@DINESHC68520388 ^^@WazirXCares,und,0,2021-06-20 17:33:05
4,1406636184323956736,955744720092835840,wazirxindia,@ImAswinn ^^@WazirXCares,und,0,2021-06-20 17:32:55


### Derive sentiment using TextBlob

In [35]:
df_bitcoin['polarity_score'] = df_bitcoin['tweet'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)

In [36]:
df_bitcoin['subjectivity_score'] = df_bitcoin['tweet'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)

In [37]:
df_bitcoin['sentiment_score'] = df_bitcoin['tweet'].apply(lambda tweet: apply_blob(tweet))

In [38]:
df_bitcoin.tail()

Unnamed: 0,id,user_id,username,tweet,language,likes_count,Datetime,polarity_score,subjectivity_score,sentiment_score
724813,418181736615649280,18802551,stephantual,Told you 2014 was going to be all about Bitcoi...,en,0,2014-01-01 01:47:50,0.0,0.0,0.0
724814,418179037245104128,1081,davemcclure,".@500Startups: ""The Year in GIFs: 2013 Hi-Tech...",en,5,2014-01-01 01:37:06,0.0,0.0,0.0
724815,418175394718965760,125304737,blockchain,Our CEO Nic Cary was on Let's Talk Bitcoin! ...,en,8,2014-01-01 01:22:38,0.5,1.0,1.0
724816,418174596546703360,22963273,justincaffier,"New Years #Resolutions - HEALTH: Quit using ""...",en,2,2014-01-01 01:19:27,0.043182,0.252273,1.0
724817,418172972092760064,16354208,maxberley,Bitcoin Is a High-Tech Dinosaur Soon to Be Ext...,en,0,2014-01-01 01:13:00,-0.4,0.6,-1.0


In [39]:
df_bitcoin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724818 entries, 0 to 724817
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  724818 non-null  int64         
 1   user_id             724818 non-null  int64         
 2   username            724817 non-null  object        
 3   tweet               724818 non-null  object        
 4   language            724818 non-null  object        
 5   likes_count         724818 non-null  int64         
 6   Datetime            724818 non-null  datetime64[ns]
 7   polarity_score      724818 non-null  float64       
 8   subjectivity_score  724818 non-null  float64       
 9   sentiment_score     724818 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(3), object(3)
memory usage: 55.3+ MB


In [40]:
df_bitcoin.isna().sum()

id                    0
user_id               0
username              1
tweet                 0
language              0
likes_count           0
Datetime              0
polarity_score        0
subjectivity_score    0
sentiment_score       0
dtype: int64

### Clean the Data and prepare for merge

#### Transforming to Daily Scores, using mean

In [41]:
df_bitcoin_daily = df_bitcoin.loc[:, ['Datetime', 'polarity_score']]

In [42]:
df_bitcoin_daily['Date'] = pd.to_datetime(df_bitcoin_daily['Datetime']).dt.date

In [43]:
df_bitcoin_daily = df_bitcoin_daily.groupby('Date').agg({'polarity_score': 'mean'})

In [44]:
# Recode to 1-0-(-1)
df_bitcoin_daily['sentiment_score'] = df_bitcoin_daily['polarity_score'].apply(lambda col: col_encode(col))

In [45]:
df_bitcoin_daily['sentiment_score'].value_counts()

 1.0    2715
-1.0      13
Name: sentiment_score, dtype: int64

In [46]:
df_bitcoin_daily.reset_index(inplace=True)

#### Transforming to Daily Scores, using mode

In [47]:
df_bitcoin_daily_mode = df_bitcoin.loc[:, ['Datetime', 'polarity_score']]

In [48]:
df_bitcoin_daily_mode['Date'] = pd.to_datetime(df_bitcoin_daily_mode['Datetime']).dt.date

In [49]:
df_bitcoin_daily_mode = pd.DataFrame(df_bitcoin_daily_mode.groupby('Date')['polarity_score'].agg(pd.Series.mode))

In [50]:
df_bitcoin_daily_mode = df_bitcoin_daily_mode.explode('polarity_score')

In [51]:
# Encoding to 1-0-(-1)
df_bitcoin_daily_mode['sentiment_score'] = df_bitcoin_daily_mode['polarity_score'].apply(lambda col: col_encode(col))

In [52]:
df_bitcoin_daily_mode['sentiment_score'].value_counts()

0.0    2727
1.0       1
Name: sentiment_score, dtype: int64

### Save to file & check

In [53]:
df_bitcoin_daily.to_csv(ext_data_path+'bitcoin_tweet_sentiment.csv', index=False)

In [54]:
# Load back & check
df_bitcoin_check = pd.read_csv(ext_data_path+'bitcoin_tweet_sentiment.csv')

In [55]:
df_bitcoin_check.head()

Unnamed: 0,Date,polarity_score,sentiment_score
0,2014-01-01,0.120676,1.0
1,2014-01-02,0.096087,1.0
2,2014-01-03,0.103907,1.0
3,2014-01-04,0.103798,1.0
4,2014-01-05,0.111918,1.0


In [56]:
df_bitcoin_check['sentiment_score'].value_counts()

 1.0    2715
-1.0      13
Name: sentiment_score, dtype: int64