In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter
from wordcloud import WordCloud

%matplotlib inline

# Code to download corpora
import nltk
nltk.download('stopwords')
nltk.download('reuters')
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aafza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\aafza\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aafza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [18]:
from time import time
import pandas as pd
import numpy as np
import re
import sys
import csv

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

In [19]:
# Reading the dataset with no columns titles and with latin encoding 
df_raw = pd.read_csv('tweets.csv', delimiter=';', skiprows=0, lineterminator='\n' )
df_raw.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text\r
0,1.132977e+18,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0,0,0,È appena uscito un nuovo video! LES CRYPTOMONN...
1,1.132977e+18,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0,0,0,Cardano: Digitize Currencies; EOS https://t.co...
2,1.132977e+18,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0,2,1,Another Test tweet that wasn't caught in the s...
3,1.132977e+18,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0,0,0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,1.132977e+18,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0,0,0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [20]:
# Rename columns
df_raw.columns = ["id", "user", "fullname", "url", "timestamp", "replies","likes","retweets","text"]
df_raw.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text
0,1.132977e+18,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0,0,0,È appena uscito un nuovo video! LES CRYPTOMONN...
1,1.132977e+18,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0,0,0,Cardano: Digitize Currencies; EOS https://t.co...
2,1.132977e+18,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0,2,1,Another Test tweet that wasn't caught in the s...
3,1.132977e+18,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0,0,0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,1.132977e+18,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0,0,0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [21]:
# Ommiting every column except for the text and the date, as we won't need any of the other information
df = df_raw[['timestamp','text']]
df.sample(5)

Unnamed: 0,timestamp,text
11115259,2019-08-22 08:35:04+00,#orionixtoken #blockchain #ethereum #bitcoin #...
7215003,2019-06-30 20:01:02+00,btc rate chart - Binance https://t.co/Jo2LSNJ3...
334401,2019-05-13 19:29:59+00,Bitcoin just broke above $8k\r
1399598,2019-05-27 07:32:08+00,Claims of ‘51% Attack’ on Bitcoin Cash Sparks ...
1889571,2015-03-03 19:45:05+00,"1 #bitcoin 692.99 TL, 275.989 $, 241.200 €, 18..."


In [22]:
df = df [~df['text'].str.contains(r'[^\x00-\x7F^a-zA-Z]+')]
print(df)

                       timestamp  \
2         2019-05-27 11:49:06+00   
3         2019-05-27 11:49:22+00   
4         2019-05-27 11:49:23+00   
6         2019-05-27 11:49:25+00   
9         2019-05-27 11:49:32+00   
...                          ...   
16889760  2019-11-23 15:28:50+00   
16889761  2019-11-23 15:45:55+00   
16889762  2019-11-23 15:45:56+00   
16889763  2019-11-23 15:45:57+00   
16889764  2019-11-23 15:45:06+00   

                                                       text  
2         Another Test tweet that wasn't caught in the s...  
3         Current Crypto Prices! \n\nBTC: $8721.99 USD\n...  
4         Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...  
6         @nwoodfine We have been building on the real #...  
9         CHANGE IS COMING...GET READY!!! Boom, Another ...  
...                                                     ...  
16889760  Happy #FibonacciDay \n\nA while back I created...  
16889761  Bitcoin Suisse Certificates :) https://t.co/nd...  
16889762 

In [23]:
df['date'] = pd.to_datetime(df['timestamp'],format= '%Y-%m-%d').dt.date
df['date']

2           2019-05-27
3           2019-05-27
4           2019-05-27
6           2019-05-27
9           2019-05-27
               ...    
16889760    2019-11-23
16889761    2019-11-23
16889762    2019-11-23
16889763    2019-11-23
16889764    2019-11-23
Name: date, Length: 9061471, dtype: object

In [24]:
df = df.sort_values(by='date')
df.head()

Unnamed: 0,timestamp,text,date
11291156,2007-04-19 07:14:38+00,is happily mugging at BTC where she will hook ...,2007-04-19
21,2009-01-11 03:33:52+00,Running bitcoin\r,2009-01-11
5164467,2009-01-21 17:29:40+00,Looking at ways to add more anonymity to bitco...,2009-01-21
5210398,2009-01-27 20:14:10+00,Thinking about how to reduce CO2 emissions fro...,2009-01-27
5269667,2009-01-29 13:37:53+00,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,2009-01-29


In [25]:
df = df.set_index('date')
df.head()

Unnamed: 0_level_0,timestamp,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2007-04-19,2007-04-19 07:14:38+00,is happily mugging at BTC where she will hook ...
2009-01-11,2009-01-11 03:33:52+00,Running bitcoin\r
2009-01-21,2009-01-21 17:29:40+00,Looking at ways to add more anonymity to bitco...
2009-01-27,2009-01-27 20:14:10+00,Thinking about how to reduce CO2 emissions fro...
2009-01-29,2009-01-29 13:37:53+00,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...


In [26]:
# df.date.year > '2017'
startdate = pd.to_datetime("2017-01-01").date()
df = df.loc[startdate:]
df.head()

Unnamed: 0_level_0,timestamp,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,2017-01-01 10:00:08+00,1 KOBO = 0.00000199 BTC \n = 0.0019 USD \n = 0...
2017-01-01,2017-01-01 17:30:05+00,One Bitcoin now worth $995.13@bitstamp. High $...
2017-01-01,2017-01-01 18:15:02+00,$994.54 at 19:15 UTC [24h Range: $956.63 - $99...
2017-01-01,2017-01-01 18:45:03+00,$994.77 at 19:45 UTC [24h Range: $956.63 - $99...
2017-01-01,2017-01-01 20:15:10+00,$1000.00 at 21:15 UTC [24h Range: $960.53 - $1...


In [27]:
df.isnull().sum()

timestamp    0
text         0
dtype: int64

In [28]:
df.dropna(how='any', inplace=True)

In [29]:
df.sample(5)

Unnamed: 0_level_0,timestamp,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-07-17,2019-07-17 23:44:33+00,amazing closing notes at todays #libra hearing...
2019-11-09,2019-11-09 00:29:32+00,Nice find! And even more reason to be a BTC Ma...
2019-11-01,2019-11-01 22:07:02+00,This #ai is better at #starcraft #ii than you'...
2019-08-10,2019-08-10 19:16:23+00,@ltcoinwhisperer This reasoning is what the pe...
2019-11-16,2019-11-16 04:35:09+00,"@playmatestweets Hello everyone, I bring you a..."


In [30]:
df.to_csv('bitcoin_tweets_4.csv')