In [1]:
import mwclient
import time

site = mwclient.Site("en.wikipedia.org")
page = site.pages["Bitcoin"]

In [2]:
revs = list(page.revisions())

In [3]:
revs[0]

OrderedDict([('revid', 1220063318),
             ('parentid', 1220030909),
             ('user', 'Swinub'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=4, tm_mday=21, tm_hour=16, tm_min=24, tm_sec=20, tm_wday=6, tm_yday=112, tm_isdst=-1)),
             ('comment',
              '/* Economics and usage */ Unnecessary extra spacing')])

In [4]:
revs = sorted(revs, key=lambda rev: rev["timestamp"])

In [5]:
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [6]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

  from .autonotebook import tqdm as notebook_tqdm
  import sre_constants
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [7]:
def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

In [8]:
edits = {}

for rev in revs:
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
                         
    if date not in edits:
        edits[date] = {"sentiments": [], "edit_count": 0}
                         
    edits[date]["edit_count"] += 1
                         
    # Check if the 'comment' key exists in the current revision
    if "comment" in rev:
        comment = rev["comment"]
        edits[date]["sentiments"].append(find_sentiment(comment))
    else:
        # If the 'comment' key is missing, handle it accordingly (e.g., skip or log)
        print("Warning: 'comment' key not found in revision:", rev)



In [9]:
from statistics import mean

In [10]:
for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
    
    del edits[key]["sentiments"]

In [11]:
edits

{'2009-03-08': {'edit_count': 4,
  'sentiment': -0.5505250543355942,
  'neg_sentiment': 0.75},
 '2009-08-05': {'edit_count': 1,
  'sentiment': 0.748120903968811,
  'neg_sentiment': 0.0},
 '2009-08-06': {'edit_count': 2,
  'sentiment': 0.995745837688446,
  'neg_sentiment': 0.0},
 '2009-08-14': {'edit_count': 1,
  'sentiment': 0.930020809173584,
  'neg_sentiment': 0.0},
 '2009-10-13': {'edit_count': 2,
  'sentiment': -0.22750115394592285,
  'neg_sentiment': 0.5},
 '2009-11-18': {'edit_count': 1,
  'sentiment': 0.8839502334594727,
  'neg_sentiment': 0.0},
 '2009-12-08': {'edit_count': 1,
  'sentiment': -0.9869275689125061,
  'neg_sentiment': 1.0},
 '2009-12-17': {'edit_count': 1,
  'sentiment': -0.9975171089172363,
  'neg_sentiment': 1.0},
 '2010-02-23': {'edit_count': 1,
  'sentiment': -0.9994946718215942,
  'neg_sentiment': 1.0},
 '2010-03-18': {'edit_count': 1,
  'sentiment': 0.8758779168128967,
  'neg_sentiment': 0.0},
 '2010-04-13': {'edit_count': 4,
  'sentiment': 0.8443556129932404

In [12]:
import pandas as pd

In [13]:
# create a dataframe for the sentiments
edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [14]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-08-05,1,0.748121,0.00
2009-08-06,2,0.995746,0.00
2009-08-14,1,0.930021,0.00
2009-10-13,2,-0.227501,0.50
...,...,...,...
2024-04-03,2,-0.001680,0.50
2024-04-04,1,-0.908142,1.00
2024-04-11,2,-0.824999,1.00
2024-04-20,1,-0.933548,1.00


In [15]:
edits_df.index = pd.to_datetime(edits_df.index)

In [16]:
from datetime import datetime

dates = pd.date_range(start="2009-03-08", end=datetime.today())

In [17]:
dates

DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2024-04-14', '2024-04-15', '2024-04-16', '2024-04-17',
               '2024-04-18', '2024-04-19', '2024-04-20', '2024-04-21',
               '2024-04-22', '2024-04-23'],
              dtype='datetime64[ns]', length=5526, freq='D')

In [18]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [19]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2024-04-19,0,0.000000,0.00
2024-04-20,1,-0.933548,1.00
2024-04-21,2,-0.997940,1.00
2024-04-22,0,0.000000,0.00


In [20]:
rolling_edits = edits_df.rolling(30).mean()

In [21]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2024-04-19,0.200000,-0.091111,0.116667
2024-04-20,0.233333,-0.122229,0.150000
2024-04-21,0.300000,-0.155494,0.183333
2024-04-22,0.300000,-0.155494,0.183333


In [22]:
rolling_edits = rolling_edits.dropna()

In [23]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2024-04-19,0.200000,-0.091111,0.116667
2024-04-20,0.233333,-0.122229,0.150000
2024-04-21,0.300000,-0.155494,0.183333
2024-04-22,0.300000,-0.155494,0.183333


In [24]:
rolling_edits.to_csv("wikipedia_edits_bitcoin.csv")