In [1]:
import mwclient
import time

site = mwclient.Site("en.wikipedia.org")
page = site.pages['Ripple']

Reviewing "Ripple" Wikipedia page

In [2]:
revs = list(page.revisions())

In [3]:
revs[0]

OrderedDict([('revid', 1187735781),
             ('parentid', 1184109175),
             ('user', 'Sdf'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=12, tm_mday=1, tm_hour=2, tm_min=46, tm_sec=16, tm_wday=4, tm_yday=335, tm_isdst=-1)),
             ('comment', '/* Arts and entertainment */')])

In [4]:
revs = sorted(revs, key=lambda rev: rev["timestamp"])

In [5]:
revs[0]

OrderedDict([('revid', 38507906),
             ('parentid', 0),
             ('user', 'SRipple'),
             ('timestamp',
              time.struct_time(tm_year=2006, tm_mon=2, tm_mday=6, tm_hour=20, tm_min=43, tm_sec=17, tm_wday=0, tm_yday=37, tm_isdst=-1)),
             ('comment', '')])

This function analyzes the sentiment of a given text using the sentiment analysis pipeline

In [8]:
from transformers import pipeline
sentament_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentament_pipeline([text[: 250]]) [0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





Iterating over the revisions of the "Ripple" Wikipedia page, extracting relevant information, and then analyzing the sentiment of each revision's comment using the find_sentiment function. The sentiment scores are then stored in a dictionary named edits, along with the edit count for each date.

In [9]:
edits = {}

for rev in revs:
    # Extract the date from the timestamp
    date = time.strftime("%Y-%m-%d", rev["timestamp"])

    # Initialize a new entry in the edits dictionary if the date is not already present
    if date not in edits:
        edits[date] = {"sentiments": [], "edit_count": 0}
        
    # Increment the edit count for the current date
    edits[date]["edit_count"] += 1

    # Extract the comment from the revision and analyze its sentiment
    comment = rev["comment"]
    edits[date]["sentiments"].append(find_sentiment(comment))


Calculate the mean sentiment score and the proportion of negative sentiment for each date in the edits dictionary. It iterates through each date in the dictionary, calculates the mean sentiment score using the mean function from the statistics module, and calculates the proportion of negative sentiment by dividing the count of negative sentiment scores by the total number of sentiment scores.

In [10]:
from statistics import mean

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0

    del edits[key]["sentiments"]

Converts the edits dictionary into a pandas DataFrame named edits_df.

In [11]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient='index')


In [12]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-02-06,3,0.748121,0.0
2006-02-07,2,0.869017,0.0
2006-02-26,2,-0.053855,0.5
2006-05-05,2,-0.996903,1.0
2006-05-11,1,0.748121,0.0
...,...,...,...
2021-10-30,2,-0.111816,0.5
2023-01-05,5,-0.299251,0.6
2023-09-06,3,-0.699478,1.0
2023-11-08,1,0.986014,0.0


Converting the index of the DataFrame edits_df to a datetime data type.

In [13]:
edits_df.index = pd.to_datetime(edits_df.index)

In [16]:
from datetime import datetime

dates = pd.date_range(start='2013-09-28', end=datetime.today())

In [22]:
dates

DatetimeIndex(['2013-09-28', '2013-09-29', '2013-09-30', '2013-10-01',
               '2013-10-02', '2013-10-03', '2013-10-04', '2013-10-05',
               '2013-10-06', '2013-10-07',
               ...
               '2024-03-18', '2024-03-19', '2024-03-20', '2024-03-21',
               '2024-03-22', '2024-03-23', '2024-03-24', '2024-03-25',
               '2024-03-26', '2024-03-27'],
              dtype='datetime64[ns]', length=3834, freq='D')

Aligning the index of the DataFrame edits_df with the dates date range

In [23]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [24]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2013-09-28,0,0.0,0.0
2013-09-29,0,0.0,0.0
2013-09-30,0,0.0,0.0
2013-10-01,0,0.0,0.0
2013-10-02,0,0.0,0.0
...,...,...,...
2024-03-23,0,0.0,0.0
2024-03-24,0,0.0,0.0
2024-03-25,0,0.0,0.0
2024-03-26,0,0.0,0.0


In [25]:
rolling_edits = edits_df.rolling(30).mean()

In [26]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2013-09-28,,,
2013-09-29,,,
2013-09-30,,,
2013-10-01,,,
2013-10-02,,,
...,...,...,...
2024-03-23,0.0,0.0,0.0
2024-03-24,0.0,0.0,0.0
2024-03-25,0.0,0.0,0.0
2024-03-26,0.0,0.0,0.0


Remove missing values (NaN)

In [27]:
rolling_edits = rolling_edits.dropna()

In [28]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2013-10-27,0.066667,0.054988,0.000000
2013-10-28,0.100000,0.021748,0.033333
2013-10-29,0.100000,0.021748,0.033333
2013-10-30,0.100000,0.021748,0.033333
2013-10-31,0.100000,0.021748,0.033333
...,...,...,...
2024-03-23,0.000000,0.000000,0.000000
2024-03-24,0.000000,0.000000,0.000000
2024-03-25,0.000000,0.000000,0.000000
2024-03-26,0.000000,0.000000,0.000000


In [29]:
rolling_edits.to_csv("wikipedia_edits.csv")