## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler

import re

from langdetect import detect

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from PIL import Image
from numpy import asarray
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_eng = stopwords.words('english')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

import seaborn as sb

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tsak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Tsak\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# 1. Load dataset

In [2]:
df_raw = pd.read_csv('bitcoin_tweets_3.csv',sep=';',engine='c')
pd.options.display.max_colwidth = 70
df_raw

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text
0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS https://t.co/1kTKqKEBlS 6500% RO...
2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\nhtt...
...,...,...,...,...,...,...,...,...,...
20165008,1198262135580741633,JacobCanfield,Jacob Canfield,,2019-11-23 15:28:50+00,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,1198266433941233664,Vizique,Vizique,,2019-11-23 15:45:55+00,0.0,0.0,0.0,Bitcoin Suisse Certificates :) https://t.co/ndaBknoTK3
20165010,1198266441293860864,torusJKL,Gal Buki ($torusJKL),,2019-11-23 15:45:56+00,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,1198266442673733633,Adekunl95628158,Adekunle Daniel,,2019-11-23 15:45:57+00,0.0,0.0,0.0,@btc \n@btc \nDo you know that BTC Baskets isn't a bank but a gift...


# 2. Prepare dataset

In [3]:
df = df_raw.drop(columns=['id','url'])
df['timestamp'] = df.timestamp.str[:19]
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df

Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS https://t.co/1kTKqKEBlS 6500% RO...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\nhtt...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :) https://t.co/ndaBknoTK3
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,@btc \n@btc \nDo you know that BTC Baskets isn't a bank but a gift...


# 3. Preprocessing

## 3.1 Remove tweets that are missing info

In [4]:
df = df.dropna()
df

Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS https://t.co/1kTKqKEBlS 6500% RO...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\nhtt...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :) https://t.co/ndaBknoTK3
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,@btc \n@btc \nDo you know that BTC Baskets isn't a bank but a gift...


## 3.2 Remove URLs

In [5]:
df['text'] = df['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)


Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS 6500% ROI; AT&amp;T Bitcoin Bil...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\n\n\...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :)
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,@btc \n@btc \nDo you know that BTC Baskets isn't a bank but a gift...


## 3.3 Remove mentions

In [6]:
df['text'] = df['text'].replace(r'@\S+', '', regex=True)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].replace(r'@\S+', '', regex=True)


Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS 6500% ROI; AT&amp;T Bitcoin Bil...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\n\n\...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :)
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,\n \nDo you know that BTC Baskets isn't a bank but a gifting plat...


## 3.4 Remove empty text tweets

In [7]:
df['is_space'] = df['text'].str.isspace()
df = df[df.is_space == False]
df = df.drop('is_space', axis=1)
df = df[df.text != '']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_space'] = df['text'].str.isspace()


Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS 6500% ROI; AT&amp;T Bitcoin Bil...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\n\n\...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :)
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,\n \nDo you know that BTC Baskets isn't a bank but a gifting plat...


## 3.5 Remove duplicate tweets (bots)

In [8]:
df = df.drop_duplicates(subset=['text'])
df

Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS 6500% ROI; AT&amp;T Bitcoin Bil...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\n\n\...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :)
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,\n \nDo you know that BTC Baskets isn't a bank but a gifting plat...


## 3.6 Remove tweets containing spam words (bots)

In [9]:
df['spam_words'] = df['text'].str.contains('win|free|prize|100%|earn|risk free',case=False)
df = df[df['spam_words'] == False]
df = df.drop('spam_words', axis=1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['spam_words'] = df['text'].str.contains('win|free|prize|100%|earn|risk free',case=False)


Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS 6500% ROI; AT&amp;T Bitcoin Bil...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\n\n\...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :)
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,\n \nDo you know that BTC Baskets isn't a bank but a gifting plat...


## 3.7 Remove tweets that don't contain the words 'btc' or 'bitcoin'

In [18]:
df['contains_keyword'] = df['text'].str.contains('btc|bitcoin',case=False)
df = df[df['contains_keyword'] == True]
df = df.drop('contains_keyword', axis=1)
df

Unnamed: 0,user,fullname,timestamp,replies,likes,retweets,text
0,KamdemAbdiel,Abdiel kamdem,2019-05-27 11:49:14,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT...
1,bitcointe,Bitcointe,2019-05-27 11:49:18,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS 6500% ROI; AT&amp;T Bitcoin Bil...
2,3eyedbran,Bran - 3 Eyed Raven,2019-05-27 11:49:06,0.0,2.0,1.0,Another Test tweet that wasn't caught in the stream ! bitcoin
3,DetroitCrypto,J. Scardina,2019-05-27 11:49:22,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLT...
4,mmursaleen72,Muhammad Mursaleen,2019-05-27 11:49:23,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\n\n\...
...,...,...,...,...,...,...,...
20165008,JacobCanfield,Jacob Canfield,2019-11-23 15:28:50,2.0,16.0,3.0,Happy #FibonacciDay \n\nA while back I created a new set of Fibona...
20165009,Vizique,Vizique,2019-11-23 15:45:55,0.0,0.0,0.0,Bitcoin Suisse Certificates :)
20165010,torusJKL,Gal Buki ($torusJKL),2019-11-23 15:45:56,0.0,0.0,0.0,Register now for the early access of the Codugh API marketplace po...
20165011,Adekunl95628158,Adekunle Daniel,2019-11-23 15:45:57,0.0,0.0,0.0,\n \nDo you know that BTC Baskets isn't a bank but a gifting plat...


# Checkpoint 1

In [19]:
df.to_csv('tweets_btc.csv',index=False)