In [1]:
import pandas as pd
from mwviews.api import PageviewsClient
import numpy as np

def get_direct_link(shared_link):
    """Turn a Google Drive sharing URL into a direct-download URL.

    The file id is taken to be the second-to-last '/'-separated segment of
    ``shared_link`` (the part right before the trailing ``view?...`` piece).
    """
    file_id = shared_link.split('/')[-2]
    return f'https://drive.google.com/uc?export=download&id={file_id}'

# Google Drive sharing URL of the players CSV; converted below into a
# direct-download URL that pandas can read over HTTP.
link = "https://drive.google.com/file/d/17Q2QEqe33LTyVMDTXcZj2K9IHU-kqkFY/view?usp=drive_link"
direct_link = get_direct_link(link)
# Later cells read the 'original_names' and 'wiki_title' columns of this frame.
players_df = pd.read_csv(direct_link)

# Last day requested from the page-view API (passed as `end` in get_player_data).
# NOTE(review): looks like YYYYMMDD, matching the '20150104' start date — confirm.
end_date = '20230716'

In [2]:
def get_player_data(players, pageviews_client=None, start='20150104', end=None):
    """Fetch daily English-Wikipedia page views for each player article.

    Parameters
    ----------
    players : iterable of str
        Article titles to query. A title that was already fetched
        successfully is skipped, so repeated titles cost no extra request.
    pageviews_client : object, optional
        Any object exposing ``article_views(project, article,
        granularity=..., start=..., end=...)``. When omitted, the
        notebook-level ``client`` is used.
    start : str, optional
        First day to request; forwarded unchanged to ``article_views``.
    end : str, optional
        Last day to request. When omitted, the notebook-level ``end_date``
        is used.

    Returns
    -------
    pandas.DataFrame
        One row per (date, player) with columns ``Date``, ``wiki`` (views),
        ``Player``, ``year``, ``month`` and ``day``. Missing values are
        replaced by 0. Each player's rows keep their own 0..n-1 index. If
        nothing could be fetched, an empty frame with only the columns
        ``Date``, ``wiki`` and ``Player`` is returned.
    """
    # Resolve the notebook globals lazily so existing calls keep working.
    api = client if pageviews_client is None else pageviews_client
    last_day = end_date if end is None else end

    frames = []
    fetched = set()
    for player in players:
        # Skip repeats before hitting the network instead of filtering after.
        if player in fetched:
            continue
        try:
            data = api.article_views('en.wikipedia', player,
                                     granularity='daily',
                                     start=start,
                                     end=last_day)
        except Exception as exc:
            # Best effort: report which article failed and carry on.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            print(f"Could not fetch page views for {player!r}: {exc}")
            continue

        data_df = pd.DataFrame.from_dict(data, orient='index')
        if data_df.empty:
            # Nothing to add; also avoids IndexError on columns[0] below.
            continue

        # The first column holds the view counts for the requested article.
        views_column = data_df.columns[0]
        player_df = (
            data_df.reset_index()
            .rename(columns={'index': 'Date', views_column: 'wiki'})
            .assign(Player=player)
            .fillna(0)
            .assign(year=lambda d: d['Date'].dt.year,
                    month=lambda d: d['Date'].dt.month,
                    day=lambda d: d['Date'].dt.day)
        )
        frames.append(player_df)
        fetched.add(player)

    if not frames:
        return pd.DataFrame(columns=['Date', 'wiki', 'Player'])
    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames)

In [3]:
def combine_duplicate_players(df):
    """Collapse rows that differ only in the spelling of the player's name.

    For every ``wiki_title`` the distinct ``original_names`` are gathered into
    a sorted tuple; that tuple replaces the per-row ``original_names`` value
    and rows that become exact duplicates are dropped. ``df`` itself is left
    untouched.
    """
    names_by_title = df.groupby('wiki_title')['original_names'].apply(np.unique).reset_index()
    # Tuples are hashable, which drop_duplicates needs further down.
    names_by_title['original_names'] = names_by_title['original_names'].apply(tuple)

    # Both sides carry 'original_names', so the merge suffixes them _x / _y;
    # keep only the grouped (_y) version under the original column name.
    return (
        df.merge(names_by_title, on='wiki_title', how='left')
        .drop(columns='original_names_x')
        .rename(columns={'original_names_y': 'original_names'})
        .drop_duplicates()
    )


In [4]:
# Distinct, non-missing article titles to query.
players = list(players_df['wiki_title'].dropna().unique())

client = PageviewsClient(user_agent="<jg19184@bristol.ac.uk> Buzz Factor Research")

results = get_player_data(players)

# Attach the original player names to every page-view row.
merged = players_df[['original_names', 'wiki_title']].merge(
    results, left_on='wiki_title', right_on='Player', how='left')
merged = merged.drop(columns=['Player'])

# Titles for which no page-view rows came back.
missing_views = merged['wiki'].isnull()
print('There are this many null values:', missing_views.values.sum())
print('These are the null values:', merged.loc[missing_views, 'wiki_title'].values)

merged = merged[~missing_views]

# Drop dates on which the summed views across all players is zero.
nulls = merged.groupby('Date')['wiki'].sum() == 0
null_dates = nulls[nulls].index.tolist()
print('Dates with no views:', nulls.loc[nulls].index)
merged = merged[~merged['Date'].isin(null_dates)]

merged = combine_duplicate_players(merged)

merged = merged.sort_values(['wiki_title', 'Date'])
merged.to_csv("mens_tennis_wikipedia.csv", encoding='utf8', index=False)

There are this many null values: 0
These are the null values: []
Dates with no views: DatetimeIndex(['2015-01-04', '2015-01-05', '2015-01-06', '2015-01-07',
               '2015-01-08', '2015-01-09', '2015-01-10', '2015-01-11',
               '2015-01-12', '2015-01-13',
               ...
               '2015-06-21', '2015-06-22', '2015-06-23', '2015-06-24',
               '2015-06-25', '2015-06-26', '2015-06-27', '2015-06-28',
               '2015-06-29', '2015-06-30'],
              dtype='datetime64[ns]', name='Date', length=178, freq=None)
