# Split the clean.csv file into multiple files. Compute VADER sentiment and score

In [76]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm

# Define the currency
#CURRENCY = "zilliqa"
#CURRENCY_SYMBOL = "ZIL"
#CURRENCY = "nexo"
#CURRENCY_SYMBOL = "NEXO"
CURRENCY = "bitcoin"
CURRENCY_SYMBOL = "BTC"
tweets_clean_file = f'data/twitter/{CURRENCY_SYMBOL}/{CURRENCY}_tweets_clean.csv'
path = f'data/twitter/{CURRENCY_SYMBOL}' #/{CURRENCY}_tweets_clean.csv'


## Read the cleaned file

In [77]:
df_clean = pd.read_csv(tweets_clean_file)
print(df_clean.shape)
df_clean.head(5)

(4003, 7)


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt
0,1359455535255285769,RT : Don't trust banks. Always buy Bitcoin.,Raul,74,2,0,Wed Feb 10 10:53:51 +0000 2021
1,1359455532055199744,$aave is not done yet...the supply is so low w...,Cryptopassion,35,0,0,Wed Feb 10 10:53:50 +0000 2021
2,1359455528238387200,RT : 🔥TOTAL CRYPTO MARKETCAP BREAK OUT🔥🔷 This ...,Chad,14,105,0,Wed Feb 10 10:53:50 +0000 2021
3,1359455523704217601,My ETH 0x65862B695E11058884058a17C33bFCCFc6b1b...,Soni,32,0,0,Wed Feb 10 10:53:48 +0000 2021
4,1359455523385524224,RT : SNAKE Token Airdrop Live!!! Airdrop 2nd ...,Malibu_Mo,9,4716,0,Wed Feb 10 10:53:48 +0000 2021


In [78]:
df_clean = df_clean.sort_values(by='ID') # the bigger the ID, the most recent the tweet 

## Sentiment analysis with Vader

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media.

VADER takes into account 
- negations and contractions (not good, wasn’t good)
- Ponctuation (good!!!), CAPS, emotes :), emojis 
- Intensificators (very, kind of), acronyms ‘lol’
- Scores between -1.0 (negative) and 1.0 (positive)

We will use this sentiment analysis of the tweets to calculate a score that will represent the importance of each tweet.

In [79]:
analyzer = SentimentIntensityAnalyzer()
compound = []
for i,s in enumerate(tqdm(df_clean['Text'])):
    vs = analyzer.polarity_scores(s)
    compound.append(vs["compound"])
df_clean["compound"] = compound
df_clean.head(100)

100%|██████████| 4003/4003 [00:00<00:00, 8481.78it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound
997,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0000
2998,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0000
3999,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0000
1997,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0000
996,1359452756755402754,Ideaology has listed on Uniswap! $IDEA /wcEisv...,Rakib Shakot,546,0,0,Wed Feb 10 10:42:49 +0000 2021,0.2714
...,...,...,...,...,...,...,...,...
3976,1359452819523203081,Now CT shilling $OMG LOL We are holding since ...,ｃｒｙｐ_ｔｏ_ｍａｒｓ| 🅴🆃🅷 💚,815,0,0,Wed Feb 10 10:43:04 +0000 2021,0.5473
973,1359452819858800643,Tesla Invests $1.5 Billion in Bitcoin: Price R...,PaulCrypto.com,4811,0,0,Wed Feb 10 10:43:04 +0000 2021,0.0000
1973,1359452819858800643,Tesla Invests $1.5 Billion in Bitcoin: Price R...,PaulCrypto.com,4811,0,0,Wed Feb 10 10:43:04 +0000 2021,0.0000
2974,1359452819858800643,Tesla Invests $1.5 Billion in Bitcoin: Price R...,PaulCrypto.com,4811,0,0,Wed Feb 10 10:43:04 +0000 2021,0.0000


In [80]:
df_clean[df_clean['compound']<0]

Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound
2984,1359452798103023617,RT : Early investors in Bitcoin (a few miners ...,Tether Ponzi Awareness (TPA),813,3,0,Wed Feb 10 10:42:59 +0000 2021,-0.2732
3985,1359452798103023617,RT : Early investors in Bitcoin (a few miners ...,Tether Ponzi Awareness (TPA),813,3,0,Wed Feb 10 10:42:59 +0000 2021,-0.2732
983,1359452798103023617,RT : Early investors in Bitcoin (a few miners ...,Tether Ponzi Awareness (TPA),813,3,0,Wed Feb 10 10:42:59 +0000 2021,-0.2732
1983,1359452798103023617,RT : Early investors in Bitcoin (a few miners ...,Tether Ponzi Awareness (TPA),813,3,0,Wed Feb 10 10:42:59 +0000 2021,-0.2732
982,1359452798564327424,RT : In the bottoms of Bitcoin bear markets th...,Bilal Sharif,10,13,0,Wed Feb 10 10:42:59 +0000 2021,-0.2960
...,...,...,...,...,...,...,...,...
1,1359455532055199744,$aave is not done yet...the supply is so low w...,Cryptopassion,35,0,0,Wed Feb 10 10:53:50 +0000 2021,-0.5009
0,1359455535255285769,RT : Don't trust banks. Always buy Bitcoin.,Raul,74,2,0,Wed Feb 10 10:53:51 +0000 2021,-0.4023
2001,1359455535255285769,RT : Don't trust banks. Always buy Bitcoin.,Raul,74,2,0,Wed Feb 10 10:53:51 +0000 2021,-0.4023
1000,1359455535255285769,RT : Don't trust banks. Always buy Bitcoin.,Raul,74,2,0,Wed Feb 10 10:53:51 +0000 2021,-0.4023


## Calculate a score for each tweet

To calculate the score for each tweet, we use different variables to which we had a weight based on its importance.

The compound column represents the sentiment of the tweets and its value is between -1 and 1.

We also use the number of retweets, the number of likes, and the number of users that follow the tweet's author.

In [81]:
scores = []
for i, s in tqdm(df_clean.iterrows(), total=df_clean.shape[0]):
    
    if(s["UserFollowerCount"]== 'UserFollowerCount'):
        scores.append(0)
    else:
        scores.append(s["compound"] * ((int(s["UserFollowerCount"])+1)) * ((int(s["Likes"])+1)))
df_clean["score"] = scores
df_clean.head(6)

100%|██████████| 4003/4003 [00:00<00:00, 8475.70it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound,score
997,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0,0.0
2998,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0,0.0
3999,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0,0.0
1997,1359452740607332354,RT : We continued to work with global industry...,jejes🐰🐰,83,942,0,Wed Feb 10 10:42:45 +0000 2021,0.0,0.0
996,1359452756755402754,Ideaology has listed on Uniswap! $IDEA /wcEisv...,Rakib Shakot,546,0,0,Wed Feb 10 10:42:49 +0000 2021,0.2714,148.4558
2997,1359452756755402754,Ideaology has listed on Uniswap! $IDEA /wcEisv...,Rakib Shakot,546,0,0,Wed Feb 10 10:42:49 +0000 2021,0.2714,148.4558


## Split dataframe and save it into multiple files

In [82]:
from datetime import datetime

In [90]:
df_clean = df_clean[df_clean['CreatedAt']!='CreatedAt']

In [91]:
n = 20000  #chunk row size
chunks_df = [df_clean[i:i+n] for i in range(0,df_clean.shape[0],n)]

sep_char = '~'
for chunk_df in chunks_df:
    date_from =0
    date_to = 0
    chunk_min = chunk_df['ID'].min()
    chunk_max = chunk_df['ID'].max()
    
    date_from = (datetime.strptime(chunk_df.iloc[0]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    date_to = (datetime.strptime(chunk_df.iloc[-1]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    print(date_from, date_to)

    # Write into csv
    chunk_df.to_csv(f"{path}/{date_from}{sep_char}{date_to}.csv", header=True, index=False)
    


2021-02-10 10-42-45 2021-02-10 10-53-51


## Update var.csv

In [92]:
import glob
import numpy as np

ENVS = ['CRYPTO', 'LINE_COUNT', 'MOST_RECENT_FILE', 'MOST_RECENT_ID'] # Stored in var.csv

def get_var(key, crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',',
                         dtype={'LINE_COUNT': np.int32})
    return df_var[key].loc[df_var['CRYPTO'] == crypto].values[0]

def update_var(key, value, crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',',
                         dtype={'LINE_COUNT': np.int32})
    df_var[key].loc[df_var['CRYPTO'] == crypto] = str(value)
    df_var.to_csv("var.csv", index=False)
    
def add_new_crypto(crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',',
                         dtype={'LINE_COUNT': np.int32})
    if df_var[ENVS[0]].loc[df_var['CRYPTO'] == crypto].empty:
        new_line = pd.DataFrame([[crypto,-1,"",0]], columns=ENVS)
        df_var = df_var.append(new_line)
        df_var.to_csv("data/twitter/var.csv", index=False)

In [93]:
files = glob.glob(f"{path}/*~*.csv")
files = sorted(files)
last_file = files[-1]
print(files)
last_df = pd.read_csv(last_file)
last_elem = last_df.tail(1)
print(last_elem['ID'])
print(last_df.shape)

add_new_crypto(CURRENCY_SYMBOL)
update_var(ENVS[1], last_df.shape[0], CURRENCY_SYMBOL)
update_var(ENVS[2], last_file, CURRENCY_SYMBOL)
update_var(ENVS[3], last_elem, CURRENCY_SYMBOL)

['data/twitter/BTC/2021-02-10 10-42-45~2021-02-10 10-53-51.csv']
3991    1359455535255285769
Name: ID, dtype: int64
(3992, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [94]:
var_df = pd.read_csv('data/twitter/var.csv')
var_df

Unnamed: 0,CRYPTO,LINE_COUNT,MOST_RECENT_FILE,MOST_RECENT_ID
0,BTC,556,data/twitter/BTC/2018-05-29 12-20-53~2018-05-2...,1001439557504692224
1,NEXO,2576,data/twitter/NEXO/2018-05-19 14-16-32~2018-05-...,1001426024406634497
2,ZIL,14917,data/twitter/ZIL/2018-05-19 13-38-38~2018-05-2...,1001428255268917250
