In [146]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import yfinance as yf
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler


In [147]:
# Raw tweet export; later cells read its `text` and `date` columns.
tweets_csv_path = 'twitter_BTC_Users_3Months_2021.csv'
data = pd.read_csv(tweets_csv_path)

## Formatando as mensagens

In [148]:
import re
import emoji
import nltk 
# English vocabulary from the NLTK `words` corpus, stored as a set so the
# per-token membership test in `cleaner` is a constant-time lookup.
english_corpus = nltk.corpus.words
words = set(english_corpus.words())

def _strip_emoji(text):
    """Return `text` with emoji removed, across `emoji` package versions."""
    if hasattr(emoji, "replace_emoji"):
        # emoji >= 2.0 removed UNICODE_EMOJI; replace_emoji is the supported API.
        return emoji.replace_emoji(text, replace="")
    table = getattr(emoji, "UNICODE_EMOJI", {})
    # emoji 1.x nests the table under language codes ('en', 'es', ...), so a
    # plain `c in emoji.UNICODE_EMOJI` test never matches a character.
    table = table.get("en", table)
    return "".join(c for c in text if c not in table)


def cleaner(text):
    """Normalise a tweet body before sentiment scoring.

    Steps, in order: strip @mentions and links, collapse whitespace, remove
    emoji, drop '#' (keeping the tag text) and turn '_' into spaces, then keep
    only tokens that are either non-alphabetic (numbers, punctuation) or
    present in the module-level `words` vocabulary.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. NaN from an empty CSV cell)
        are treated as an empty tweet instead of raising inside `re.sub`.

    Returns
    -------
    str
        The cleaned text, tokens separated by single spaces.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub("@[A-Za-z0-9]+", "", text)  # Remove @mentions
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)  # Remove links
    text = " ".join(text.split())
    text = _strip_emoji(text)
    text = text.replace("#", "").replace("_", " ")  # Keep hashtag text, drop the sign
    kept_tokens = (
        token
        for token in nltk.wordpunct_tokenize(text)
        if token.lower() in words or not token.isalpha()
    )
    return " ".join(kept_tokens)
# Clean every tweet body, then reduce each timestamp to its calendar date.
data['text'] = data['text'].map(cleaner)
parsed_timestamps = pd.to_datetime(data['date'])
data['date'] = parsed_timestamps.dt.date

## Análise VADER

In [149]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# VADER result key -> column name used by the rest of the notebook.
vader_columns = {
    "compound": "Compound",
    "pos": "Positive",
    "neg": "Negative",
    "neu": "Neutral",
}

# Score each tweet exactly once; polarity_scores returns all four values in a
# single call, so there is no need to re-run it per field.
scores = []
for tweet_text in data['text']:
    polarity = analyzer.polarity_scores(tweet_text)
    scores.append({column: polarity[key] for key, column in vader_columns.items()})

# Reuse data's own index so the join below lines up row-for-row even when
# `data` does not carry a default 0..n-1 index.
sentiments_score = pd.DataFrame(
    scores,
    index=data.index,
    columns=list(vader_columns.values()),
)
df_sentiment = data.join(sentiments_score)

## Removendo colunas desnecessárias e mudando outras

In [150]:
# Columns not needed for the daily aggregation.
unused_columns = ['username', 'followers', 'Unnamed: 0', 'id']

# Build a new frame instead of mutating in place; errors='ignore' keeps the
# cell re-runnable after the columns have already been dropped once.
df_sentiment = (
    df_sentiment
    .reset_index(drop=True)
    .drop(columns=unused_columns, errors='ignore')
)
df_sentiment.columns.name = 'Id'

# Aggregate numeric columns only: taking the mean of the string `text` column
# raises TypeError on pandas >= 2.0 (older versions silently dropped it).
numeric_columns = [
    column
    for column in df_sentiment.select_dtypes(include='number').columns
    if column != 'date'
]
data = df_sentiment.groupby('date')[numeric_columns].agg(['mean', 'count'])

# Flatten the (column, statistic) MultiIndex into names like "Compound mean".
data.columns = [' '.join(str(level) for level in col) for col in data.columns]
data = data.reset_index()

# Drop the last date group.
# NOTE(review): presumably the final day is incomplete — confirm.
data = data.iloc[:-1]
data

Unnamed: 0,date,favorites mean,favorites count,retweets mean,retweets count,Compound mean,Compound count,Positive mean,Positive count,Negative mean,Negative count,Neutral mean,Neutral count
0,2021-07-01,233.714286,21,35.333333,21,-0.001395,21,0.023952,21,0.026571,21,0.949429,21
1,2021-07-02,163.846154,13,21.153846,13,0.178592,13,0.060692,13,0.013308,13,0.926000,13
2,2021-07-03,207.900000,10,29.500000,10,0.071900,10,0.083300,10,0.010400,10,0.906300,10
3,2021-07-04,273.750000,4,41.250000,4,0.079550,4,0.019000,4,0.000000,4,0.981000,4
4,2021-07-05,255.650000,20,48.100000,20,0.191020,20,0.085850,20,0.044100,20,0.870100,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,2021-10-25,127.000000,54,20.481481,54,0.064531,54,0.038833,54,0.027481,54,0.933630,54
117,2021-10-26,169.714286,42,21.452381,42,-0.031840,42,0.034548,42,0.032833,42,0.932667,42
118,2021-10-27,263.517241,29,33.068966,29,-0.036252,29,0.021552,29,0.034483,29,0.944034,29
119,2021-10-28,313.937500,48,46.312500,48,0.030342,48,0.031021,48,0.025354,48,0.943625,48


## Acessando BTC

In [151]:
# Ticker list handed to yfinance.
moeda = ['BTC-USD']
btc = yf.download(moeda, start='2021-07-01', end='2021-10-30')

# Discard the first downloaded row and replace the index with 0..n-1.
# NOTE(review): presumably the one-row offset pairs each day's tweets with the
# following day's prices — confirm this lag is intended.
btc = btc.iloc[1:]
btc.index = range(len(btc))
btc.columns.name = 'Id'

btc

[*********************100%***********************]  1 of 1 completed


Id,Open,High,Low,Close,Adj Close,Volume
0,33549.601562,33939.589844,32770.679688,33897.046875,33897.046875,38728974942
1,33854.421875,34909.261719,33402.695312,34668.546875,34668.546875,24383958643
2,34665.566406,35937.566406,34396.476562,35287.781250,35287.781250,24924307911
3,35284.343750,35284.343750,33213.660156,33746.003906,33746.003906,26721554282
4,33723.507812,35038.535156,33599.917969,34235.195312,34235.195312,26501259870
...,...,...,...,...,...,...
116,63032.761719,63229.027344,59991.160156,60363.792969,60363.792969,34878965587
117,60352.000000,61435.183594,58208.187500,58482.386719,58482.386719,43657076893
118,58470.730469,62128.632812,58206.917969,60622.136719,60622.136719,45257083247
119,60624.871094,62927.609375,60329.964844,62227.964844,62227.964844,36856881767


## Juntando btc com twitter

In [152]:
# `join` aligns on the row index; both frames were renumbered 0..n-1 above, so
# rows are paired by position rather than by date.
df_btc_tweets = data.join(btc).drop(columns=['date'])

In [153]:
# Display the joined tweet-sentiment / BTC price frame.
df_btc_tweets

Unnamed: 0,favorites mean,favorites count,retweets mean,retweets count,Compound mean,Compound count,Positive mean,Positive count,Negative mean,Negative count,Neutral mean,Neutral count,Open,High,Low,Close,Adj Close,Volume
0,233.714286,21,35.333333,21,-0.001395,21,0.023952,21,0.026571,21,0.949429,21,33549.601562,33939.589844,32770.679688,33897.046875,33897.046875,38728974942
1,163.846154,13,21.153846,13,0.178592,13,0.060692,13,0.013308,13,0.926000,13,33854.421875,34909.261719,33402.695312,34668.546875,34668.546875,24383958643
2,207.900000,10,29.500000,10,0.071900,10,0.083300,10,0.010400,10,0.906300,10,34665.566406,35937.566406,34396.476562,35287.781250,35287.781250,24924307911
3,273.750000,4,41.250000,4,0.079550,4,0.019000,4,0.000000,4,0.981000,4,35284.343750,35284.343750,33213.660156,33746.003906,33746.003906,26721554282
4,255.650000,20,48.100000,20,0.191020,20,0.085850,20,0.044100,20,0.870100,20,33723.507812,35038.535156,33599.917969,34235.195312,34235.195312,26501259870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,127.000000,54,20.481481,54,0.064531,54,0.038833,54,0.027481,54,0.933630,54,63032.761719,63229.027344,59991.160156,60363.792969,60363.792969,34878965587
117,169.714286,42,21.452381,42,-0.031840,42,0.034548,42,0.032833,42,0.932667,42,60352.000000,61435.183594,58208.187500,58482.386719,58482.386719,43657076893
118,263.517241,29,33.068966,29,-0.036252,29,0.021552,29,0.034483,29,0.944034,29,58470.730469,62128.632812,58206.917969,60622.136719,60622.136719,45257083247
119,313.937500,48,46.312500,48,0.030342,48,0.031021,48,0.025354,48,0.943625,48,60624.871094,62927.609375,60329.964844,62227.964844,62227.964844,36856881767


## Criando regressor

In [154]:
# Settings shared by both hidden layers.
hidden_layer_options = dict(
    kernel_initializer='random_uniform',
    activation='sigmoid',
    use_bias=False,
)

# Two 8-unit sigmoid hidden layers followed by a single linear output unit.
# NOTE(review): input_dim=1 expects one feature per sample, while
# df_btc_tweets has 18 columns — confirm which input is intended.
regressor = Sequential([
    Dense(8, input_dim=1, **hidden_layer_options),
    Dense(8, **hidden_layer_options),
    Dense(1, kernel_initializer='random_uniform',
          activation='linear', use_bias=False),
])

regressor.compile(optimizer='adam', loss='mean_squared_error')

# regressor.summary()  # uncomment to inspect the layer shapes

## Escalando dados

In [155]:
# Standardise every column of the joined frame in one step; the fitted scaler
# stays available in `s_scaler`.
s_scaler = StandardScaler()
data_escalada = s_scaler.fit_transform(df_btc_tweets)

### Próximo passo: descobrir como associar estes dados com o exemplo "Loading the numerical and categorical data" do site https://pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/