# Taking Data From Wikipedia

In [None]:
pip install mwclient

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import mwclient
import time

# Open a client for English Wikipedia and get a handle on the "Bitcoin" page,
# whose revision history is processed in the following cells.
site = mwclient.Site("en.wikipedia.org")
page = site.pages["Bitcoin"]

In [None]:
# Materialise every revision of the page into a list so it can be sorted and re-read.
revs = list(page.revisions())

In [None]:
# Order the revisions by timestamp, oldest first.
revs = sorted(revs, key=lambda revision: revision["timestamp"])

In [None]:
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

# Using a Pre-trained Sentiment Analysis Model

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import pipeline
# Pin the checkpoint and revision explicitly instead of relying on the library
# default, so the scores stay reproducible if that default changes. These are
# the values the library reported falling back to when no model was supplied.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
def find_senti(text, max_chars=250):
  """Return a signed sentiment score for ``text``.

  The first ``max_chars`` characters of ``text`` are passed to the
  module-level ``sentiment_pipeline``. The ``score`` of its first result is
  returned unchanged, except that it is negated when the label is
  ``"NEGATIVE"``.

  Args:
    text: The string to score (for example an edit comment). May be empty
      or ``None``.
    max_chars: Number of leading characters of ``text`` sent to the model.
      Defaults to 250.

  Returns:
    The signed score, or ``0.0`` when ``text`` is ``None``, empty or only
    whitespace.
  """
  # There is nothing to classify in an empty comment; report it as neutral
  # instead of letting the model assign an arbitrary score to "".
  if text is None or not text.strip():
    return 0.0

  sent = sentiment_pipeline([text[:max_chars]])[0]
  score = sent["score"]

  if sent["label"] == "NEGATIVE":
    score = -score

  return score

# Check Sentiments

In [None]:
find_senti("i hate u")

-0.999397873878479

In [None]:
find_senti("i love u")

0.9998582601547241

# Building a Dictionary That Maps Each Date to Its Edit Count and Edit Sentiments

In [None]:
# Maps "YYYY-MM-DD" -> {"sentiments": [score, ...], "edit_count": n}.
edits = {}

for revision in revs:
    day = time.strftime("%Y-%m-%d", revision["timestamp"])

    # Create the per-day record the first time the day is seen, then reuse it.
    day_stats = edits.setdefault(day, {"sentiments": [], "edit_count": 0})
    day_stats["edit_count"] += 1

    # A revision without a "comment" key is scored as an empty string.
    day_stats["sentiments"].append(find_senti(revision.get("comment", "")))

In [None]:
edits

{'2009-03-08': {'sentiments': [-0.9905919432640076,
   0.748120903968811,
   -0.9907428622245789,
   -0.9688861966133118],
  'edit_count': 4},
 '2009-08-05': {'sentiments': [0.748120903968811], 'edit_count': 1},
 '2009-08-06': {'sentiments': [0.9957457184791565, 0.9957457184791565],
  'edit_count': 2},
 '2009-08-14': {'sentiments': [0.9300208687782288], 'edit_count': 1},
 '2009-10-13': {'sentiments': [0.5404348969459534, -0.9954361319541931],
  'edit_count': 2},
 '2009-11-18': {'sentiments': [0.8839504718780518], 'edit_count': 1},
 '2009-12-08': {'sentiments': [-0.9869275689125061], 'edit_count': 1},
 '2009-12-17': {'sentiments': [-0.9975171089172363], 'edit_count': 1},
 '2010-02-23': {'sentiments': [-0.9994946718215942], 'edit_count': 1},
 '2010-03-18': {'sentiments': [0.8758770227432251], 'edit_count': 1},
 '2010-04-13': {'sentiments': [0.9300208687782288,
   0.815800666809082,
   0.815800666809082,
   0.815800666809082],
  'edit_count': 4},
 '2010-04-15': {'sentiments': [0.930020868

# For Each Day, Take the Average of the Sentiments

In [None]:
from statistics import mean

# Collapse each day's list of scores into two summary values:
#   "sentiment"     - mean of the day's scores
#   "neg_sentiment" - fraction of the day's scores that are below zero
# The raw "sentiments" list is removed from each record afterwards.
for key in edits:
  day = edits[key]

  # This cell rewrites `edits` in place. A day without "sentiments" was
  # already aggregated by an earlier run, so skip it instead of raising KeyError.
  if "sentiments" not in day:
    continue

  scores = day.pop("sentiments")

  if scores:
    day["sentiment"] = mean(scores)
    day["neg_sentiment"] = sum(1 for s in scores if s < 0) / len(scores)
  else:
    # No scores for this day: record it as neutral.
    day["sentiment"] = 0
    day["neg_sentiment"] = 0

In [None]:
edits

{'2009-03-08': {'edit_count': 4,
  'sentiment': -0.5505250245332718,
  'neg_sentiment': 0.75},
 '2009-08-05': {'edit_count': 1,
  'sentiment': 0.748120903968811,
  'neg_sentiment': 0.0},
 '2009-08-06': {'edit_count': 2,
  'sentiment': 0.9957457184791565,
  'neg_sentiment': 0.0},
 '2009-08-14': {'edit_count': 1,
  'sentiment': 0.9300208687782288,
  'neg_sentiment': 0.0},
 '2009-10-13': {'edit_count': 2,
  'sentiment': -0.22750061750411987,
  'neg_sentiment': 0.5},
 '2009-11-18': {'edit_count': 1,
  'sentiment': 0.8839504718780518,
  'neg_sentiment': 0.0},
 '2009-12-08': {'edit_count': 1,
  'sentiment': -0.9869275689125061,
  'neg_sentiment': 1.0},
 '2009-12-17': {'edit_count': 1,
  'sentiment': -0.9975171089172363,
  'neg_sentiment': 1.0},
 '2010-02-23': {'edit_count': 1,
  'sentiment': -0.9994946718215942,
  'neg_sentiment': 1.0},
 '2010-03-18': {'edit_count': 1,
  'sentiment': 0.8758770227432251,
  'neg_sentiment': 0.0},
 '2010-04-13': {'edit_count': 4,
  'sentiment': 0.84435571730136

# The edit dates are not continuous, so we add the missing dates with zero edits and zero sentiment

In [None]:
import pandas as pd

# One row per date key of `edits`; the inner dict keys become the columns.
edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [None]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-08-05,1,0.748121,0.00
2009-08-06,2,0.995746,0.00
2009-08-14,1,0.930021,0.00
2009-10-13,2,-0.227501,0.50
...,...,...,...
2023-03-10,1,-0.999369,1.00
2023-03-12,1,-0.999464,1.00
2023-03-21,1,-0.994745,1.00
2023-03-22,1,0.994364,0.00


In [None]:
# Convert the string date labels into a DatetimeIndex.
edits_df.index = pd.to_datetime(edits_df.index)

In [None]:
from datetime import datetime

In [None]:
# Daily range from the earliest edit date in the data up to today. Taking the
# start from the (already datetime-indexed) frame instead of a hard-coded
# literal keeps the range correct if a different page is analysed.
dates = pd.date_range(start=edits_df.index.min(), end=datetime.today())

In [None]:
dates

DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20', '2023-03-21', '2023-03-22',
               '2023-03-23', '2023-03-24'],
              dtype='datetime64[ns]', length=5130, freq='D')

In [None]:
# Align the frame to the full daily range; days without any edit get 0 in every column.
edits_df = edits_df.reindex(index=dates, fill_value=0)

In [None]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2023-03-20,0,0.000000,0.00
2023-03-21,1,-0.994745,1.00
2023-03-22,1,0.994364,0.00
2023-03-23,3,-0.986778,1.00


In [None]:
# Mean over a trailing window of 30 rows (one row per day after the reindex);
# rows that do not yet have a full window come out as NaN.
rolling_edits = edits_df.rolling(window=30).mean()

In [None]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2023-03-20,0.700000,-0.274278,0.350000
2023-03-21,0.733333,-0.307436,0.383333
2023-03-22,0.766667,-0.274291,0.383333
2023-03-23,0.800000,-0.307177,0.400000


In [None]:
# Discard every row that contains a NaN (the leading rows without a full window).
rolling_edits = rolling_edits.dropna(axis=0, how="any")

In [None]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2023-03-20,0.700000,-0.274278,0.350000
2023-03-21,0.733333,-0.307436,0.383333
2023-03-22,0.766667,-0.274291,0.383333
2023-03-23,0.800000,-0.307177,0.400000


# Save the Bitcoin Sentiment Data as a .csv File

In [None]:
from google.colab import files
import pandas as pd

# Write the rolling averages (date index included) to a CSV file in the
# Colab runtime's working directory.
csv_path = 'rolling_editz.csv'
rolling_edits.to_csv(csv_path, index=True)

# Ask Colab to download that CSV file to the local machine.
files.download(csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>