# Prevendo o preço de amanhã do Bitcoin

In [None]:
!pip install mwclient transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mwclient
  Downloading mwclient-0.10.1-py2.py3-none-any.whl (27 kB)
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 7.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 46.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 446 kB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: 

Importando o necessário e abrindo a página do Bitcoin na Wikipédia, de onde serão obtidas todas as revisões/comentários

In [None]:
import mwclient
import time

# Open a client for English Wikipedia and select the "Bitcoin" article;
# its revision history is fetched from this page object.
site = mwclient.Site("en.wikipedia.org")
page = site.pages["Bitcoin"]

Trazendo a lista de revisões/edições que vêm sempre com algum tipo de comentário

In [None]:
# Materialise the page's revisions into a list so they can be indexed and sorted.
revs = [revision for revision in page.revisions()]

In [None]:
# Inspect the first item of the returned list (the most recent edit; dated 2022-10-24 in the recorded output).
revs[0]

OrderedDict([('revid', 1117934027),
             ('parentid', 1117846981),
             ('user', 'Vgbyp'),
             ('timestamp',
              time.struct_time(tm_year=2022, tm_mon=10, tm_mday=24, tm_hour=9, tm_min=25, tm_sec=26, tm_wday=0, tm_yday=297, tm_isdst=-1)),
             ('comment', '-language, +url-access')])

In [None]:
# Order the revisions chronologically, oldest first (the first edit dates from 2009),
# then show the earliest one.
revs = sorted(revs, key=lambda revision: revision["timestamp"])
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [None]:
# Hugging Face Transformers provides the text sentiment-analysis pipeline.
from transformers import pipeline

# Name the checkpoint explicitly. It is the one the library fell back to (with a
# "No model was supplied" warning) when none was given, so scores stay the same
# while the notebook no longer depends on the library's default choice.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

def find_sentiment(text):
    """Return a signed sentiment score for ``text``.

    Only the first 250 characters of ``text`` are handed to the sentiment
    pipeline. The pipeline's score is returned unchanged, except that it is
    negated when the predicted label is "NEGATIVE", e.g.
    {"label": "NEGATIVE", "score": 0.9984} -> -0.9984.
    """
    prediction = sentiment_pipeline([text[:250]])[0]
    confidence = prediction["score"]
    return -confidence if prediction["label"] == "NEGATIVE" else confidence

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
# Per-day edit statistics, keyed by a "YYYY-MM-DD" date string.
edits = dict()

# For every calendar day, collect the sentiment score of each revision comment
# and count the edits made that day, e.g.
# {'2022-10-21': {'sentiments': [0.9800, -0.1222, 0.89, 0.78], 'edit_count': 4}}
for rev in revs:
    date = time.strftime("%Y-%m-%d", rev["timestamp"])

    # Create the day's entry on first sight, then work on it directly.
    day_stats = edits.setdefault(date, dict(sentiments=list(), edit_count=0))
    day_stats["edit_count"] += 1

    # A revision without a "comment" entry still counts as an edit but adds no sentiment.
    if "comment" not in rev:
        continue

    comment = rev["comment"]
    day_stats["sentiments"].append(find_sentiment(comment))

In [None]:
from statistics import mean

# Collapse each day's raw sentiment list into two summary values:
#   mean_sentiment - average signed score of that day's comments
#   neg_sentiment  - number of negative-scored comments divided by the day's edit_count
# Days without any scored comment get 0 for both.
for date in edits:
    day_stats = edits[date]

    if "sentiments" not in day_stats:
        # Already summarised by an earlier run of this cell. Skipping keeps the
        # cell re-runnable instead of raising KeyError on the removed list.
        continue

    # The raw scores are dropped from the entry once they have been summarised.
    sentiments = day_stats.pop("sentiments")

    if sentiments:
        day_stats["mean_sentiment"] = mean(sentiments)
        day_stats["neg_sentiment"] = sum(1 for s in sentiments if s < 0) / day_stats["edit_count"]
    else:
        day_stats["mean_sentiment"] = 0
        day_stats["neg_sentiment"] = 0

In [None]:
# Preview only the first few days: echoing the whole dict prints one entry per
# day that has edits and floods the notebook output.
dict(list(edits.items())[:5])

{'2009-03-08': {'edit_count': 4,
  'mean_sentiment': -0.5505250096321106,
  'neg_sentiment': 0.75},
 '2009-08-05': {'edit_count': 1,
  'mean_sentiment': 0.7481208443641663,
  'neg_sentiment': 0.0},
 '2009-08-06': {'edit_count': 2,
  'mean_sentiment': 0.995745837688446,
  'neg_sentiment': 0.0},
 '2009-08-14': {'edit_count': 1,
  'mean_sentiment': 0.930020809173584,
  'neg_sentiment': 0.0},
 '2009-10-13': {'edit_count': 2,
  'mean_sentiment': -0.2275007963180542,
  'neg_sentiment': 0.5},
 '2009-11-18': {'edit_count': 1,
  'mean_sentiment': 0.8839504718780518,
  'neg_sentiment': 0.0},
 '2009-12-08': {'edit_count': 1,
  'mean_sentiment': -0.9869275689125061,
  'neg_sentiment': 1.0},
 '2009-12-17': {'edit_count': 1,
  'mean_sentiment': -0.9975171089172363,
  'neg_sentiment': 1.0},
 '2010-02-23': {'edit_count': 1,
  'mean_sentiment': -0.9994946718215942,
  'neg_sentiment': 1.0},
 '2010-03-18': {'edit_count': 1,
  'mean_sentiment': 0.8758771419525146,
  'neg_sentiment': 0.0},
 '2010-04-13': {

In [None]:
import pandas as pd

# One row per date key of `edits`, one column per value stored for that day.
edits_df = pd.DataFrame.from_dict(data=edits, orient="index")
edits_df.head(5)

Unnamed: 0,edit_count,mean_sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-08-05,1,0.748121,0.0
2009-08-06,2,0.995746,0.0
2009-08-14,1,0.930021,0.0
2009-10-13,2,-0.227501,0.5


In [None]:
# Parse the year-month-day index labels into datetime objects, which are easier to manipulate.
parsed_index = pd.to_datetime(edits_df.index)
edits_df.index = parsed_index

In [None]:
from datetime import datetime 

# Daily calendar from the first day that has an edit up to today. The start is
# read from the data instead of being hard-coded, so it stays correct if the
# earliest revision of the page is not 2009-03-08.
dates = pd.date_range(start=edits_df.index.min(), end=datetime.today())
dates

DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2022-10-15', '2022-10-16', '2022-10-17', '2022-10-18',
               '2022-10-19', '2022-10-20', '2022-10-21', '2022-10-22',
               '2022-10-23', '2022-10-24'],
              dtype='datetime64[ns]', length=4979, freq='D')

In [None]:
# Align the frame to the full daily calendar; days without edits become all-zero rows.
edits_df = edits_df.reindex(index=dates, fill_value=0)
edits_df

Unnamed: 0,edit_count,mean_sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2022-10-20,0,0.000000,0.00
2022-10-21,0,0.000000,0.00
2022-10-22,0,0.000000,0.00
2022-10-23,3,0.911091,0.00


In [None]:
# Rolling average of every column over 30 consecutive rows (one row per day).
# Rows that do not yet have a full window come out as NaN.
window_days = 30
rolling_edits = edits_df.rolling(window=window_days).mean()
rolling_edits


Unnamed: 0,edit_count,mean_sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2022-10-20,0.866667,-0.095021,0.224444
2022-10-21,0.700000,-0.097511,0.211111
2022-10-22,0.600000,-0.064256,0.177778
2022-10-23,0.700000,-0.033886,0.177778


In [None]:
# Discard the rows that contain NaN (the leading rows with an incomplete rolling window).
rolling_edits = rolling_edits.dropna(axis=0, how="any")

In [None]:
# Persist the rolling averages as a CSV file in the working directory.
output_path = "wikipedia_rolling_mean_edits.csv"
rolling_edits.to_csv(output_path)

In [None]:
# Display the NaN-free rolling averages that were written to the CSV file.
rolling_edits

Unnamed: 0,edit_count,mean_sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2022-10-20,0.866667,-0.095021,0.224444
2022-10-21,0.700000,-0.097511,0.211111
2022-10-22,0.600000,-0.064256,0.177778
2022-10-23,0.700000,-0.033886,0.177778
