In [11]:
import mwclient
import time

site = mwclient.Site("en.wikipedia.org")
page = site.pages['Ripple']

In [12]:
revs = list(page.revisions())

In [13]:
revs[0]

OrderedDict([('revid', 1187735781),
             ('parentid', 1184109175),
             ('user', 'Sdf'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=12, tm_mday=1, tm_hour=2, tm_min=46, tm_sec=16, tm_wday=4, tm_yday=335, tm_isdst=-1)),
             ('comment', '/* Arts and entertainment */')])

In [14]:
revs = sorted(revs, key=lambda rev: rev["timestamp"])

In [15]:
revs[0]

OrderedDict([('revid', 38507906),
             ('parentid', 0),
             ('user', 'SRipple'),
             ('timestamp',
              time.struct_time(tm_year=2006, tm_mon=2, tm_mday=6, tm_hour=20, tm_min=43, tm_sec=17, tm_wday=0, tm_yday=37, tm_isdst=-1)),
             ('comment', '')])

In [16]:
from transformers import pipeline
sentament_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentament_pipeline([text[: 250]]) [0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [17]:
edits = {}

for rev in revs:
    # Extract the date from the timestamp
    date = time.strftime("%Y-%m-%d", rev["timestamp"])

    # Initialize a new entry in the edits dictionary if the date is not already present
    if date not in edits:
        edits[date] = {"sentiments": [], "edit_count": 0}
        
    # Increment the edit count for the current date
    edits[date]["edit_count"] += 1

    # Extract the comment from the revision and analyze its sentiment
    comment = rev["comment"]
    edits[date]["sentiments"].append(find_sentiment(comment))


In [18]:
from statistics import mean

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0

    del edits[key]["sentiments"]

In [20]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient='index')


In [21]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-02-06,3,0.748121,0.0
2006-02-07,2,0.869017,0.0
2006-02-26,2,-0.053855,0.5
2006-05-05,2,-0.996903,1.0
2006-05-11,1,0.748121,0.0
...,...,...,...
2021-10-30,2,-0.111816,0.5
2023-01-05,5,-0.299251,0.6
2023-09-06,3,-0.699478,1.0
2023-11-08,1,0.986014,0.0


In [24]:
edits_df.index = pd.to_datetime(edits_df.index)

In [25]:
from datetime import datetime

dates = pd.date_range(start='2006-02-06', end=datetime.today())

In [26]:
dates

DatetimeIndex(['2006-02-06', '2006-02-07', '2006-02-08', '2006-02-09',
               '2006-02-10', '2006-02-11', '2006-02-12', '2006-02-13',
               '2006-02-14', '2006-02-15',
               ...
               '2024-03-11', '2024-03-12', '2024-03-13', '2024-03-14',
               '2024-03-15', '2024-03-16', '2024-03-17', '2024-03-18',
               '2024-03-19', '2024-03-20'],
              dtype='datetime64[ns]', length=6618, freq='D')

In [27]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [28]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-02-06,3,0.748121,0.0
2006-02-07,2,0.869017,0.0
2006-02-08,0,0.000000,0.0
2006-02-09,0,0.000000,0.0
2006-02-10,0,0.000000,0.0
...,...,...,...
2024-03-16,0,0.000000,0.0
2024-03-17,0,0.000000,0.0
2024-03-18,0,0.000000,0.0
2024-03-19,0,0.000000,0.0


In [29]:
rolling_edits = edits_df.rolling(30).mean()

In [30]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-02-06,,,
2006-02-07,,,
2006-02-08,,,
2006-02-09,,,
2006-02-10,,,
...,...,...,...
2024-03-16,0.0,0.0,0.0
2024-03-17,0.0,0.0,0.0
2024-03-18,0.0,0.0,0.0
2024-03-19,0.0,0.0,0.0


In [31]:
rolling_edits = rolling_edits.dropna()

In [32]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-03-07,0.233333,0.052109,0.016667
2006-03-08,0.133333,0.027172,0.016667
2006-03-09,0.066667,-0.001795,0.016667
2006-03-10,0.066667,-0.001795,0.016667
2006-03-11,0.066667,-0.001795,0.016667
...,...,...,...
2024-03-16,0.000000,0.000000,0.000000
2024-03-17,0.000000,0.000000,0.000000
2024-03-18,0.000000,0.000000,0.000000
2024-03-19,0.000000,0.000000,0.000000


In [33]:
rolling_edits.to_csv("wikipedia_edits.csv")