In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#
# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests

In [14]:
import pandas as pd
movies = pd.read_csv('/content/drive/MyDrive/AUT 2023/DATA 512/thank_the_academy.AUG.2023.csv')
movie_names = list(movies['name'])
movie_names.sort()

First step is to acquire desktop page views data, with start date 2015 July up till 2023 September

In [21]:
#########
#
#    CONSTANTS
#

# The REST API 'pageviews' URL - this is the common URL/endpoint for all 'pageviews' API requests
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# This is a parameterized string that specifies what kind of pageviews request we are going to make
# In this case it will be a 'per-article' based request. The string is a format string so that we can
# replace each parameter with an appropriate value before making the request
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# The Pageviews API asks that we not exceed 100 requests per second, we add a small delay to each request
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making a request to the Wikimedia API they ask that you include your email address which will allow them
# to contact you if something happens - such as - your code exceeding rate limits - or some other error
REQUEST_HEADERS = {
    'User-Agent': 'andixit@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = movie_names

# This template is used to map parameter values into the API_REQUST_PER_ARTICLE_PARAMS portion of an API request. The dictionary has a
# field/key for each of the required parameters. In the example, below, we only vary the article name, so the majority of the fields
# can stay constant for each request. Of course, these values *could* be changed if necessary.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "desktop",      # this should be changed for the different access types
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015070100",   # start and end dates need to be set
    "end":         "2023093000"    # this is likely the wrong end date
}

In [47]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageviews_per_article(article_title = None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT,
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS,
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['article'] = article_title

    if not request_template['article']:
        raise Exception("Must supply an article title to make a pageviews request.")

    # Titles are supposed to have spaces replaced with "_" and be URL encoded
    article_title_encoded = urllib.parse.quote(request_template['article'].replace(' ','_'),safe='')
    request_template['article'] = article_title_encoded

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

Iterate through movie titles to get all pageviews data for desktop access.

Remove 'access' parameter

In [25]:
total_views = []
for article in ARTICLE_TITLES:
  print(article)
  views = request_pageviews_per_article(article)
  for month in views['items']:
    del month['access']
    total_views.append(month)

12 Years a Slave (film)
1917 (2019 film)
20 Feet from Stardom
20,000 Leagues Under the Sea (1954 film)
2001: A Space Odyssey (film)
49th Parallel (film)
7 Faces of Dr. Lao
7th Heaven (1927 film)
8 Mile (film)
8½
A Beautiful Mind (film)
A Boy and His Dog (1946 film)
A Chance to Live
A Christmas Carol (1971 film)
A Close Shave
A Damsel in Distress (1937 film)
A Double Life (1947 film)
A Fantastic Woman
A Farewell to Arms (1932 film)
A Fish Called Wanda
A Free Soul
A Funny Thing Happened on the Way to the Forum (film)
A Girl in the River: The Price of Forgiveness
A Greek Tragedy
A Herb Alpert and the Tijuana Brass Double Feature
A Hole in the Head
A Letter to Three Wives
A Little Night Music (film)
A Little Romance
A Man and a Woman
A Man for All Seasons (1966 film)
A Midsummer Night's Dream (1935 film)
A Note of Triumph: The Golden Age of Norman Corwin
A Passage to India (film)
A Patch of Blue
A Place in the Sun (1951 film)
A Place to Stand (film)
A River Runs Through It (film)
A Room wi

Dump into a JSON file

In [37]:
monthly_views_desktop = open('/content/drive/MyDrive/AUT 2023/DATA 512/academy_monthly_desktop_201507-202309.json', "w")
json.dump(total_views, monthly_views_desktop, indent = 4)
monthly_views_desktop.close()

For mobile pageviews, first, we get mobile app views data

In [27]:
##MOBILE APP

ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "mobile-app",      # this should be changed for the different access types
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015070100",   # start and end dates need to be set
    "end":         "2023093000"    # this is likely the wrong end date
}

Get data for all movies, remove access param

In [31]:
total_views_mobile_app = []
for article in ARTICLE_TITLES:
  print(article)
  views = request_pageviews_per_article(article)
  for month in views['items']:
    del month['access']
    total_views_mobile_app.append(month)

12 Years a Slave (film)
1917 (2019 film)
20 Feet from Stardom
20,000 Leagues Under the Sea (1954 film)
2001: A Space Odyssey (film)
49th Parallel (film)
7 Faces of Dr. Lao
7th Heaven (1927 film)
8 Mile (film)
8½
A Beautiful Mind (film)
A Boy and His Dog (1946 film)
A Chance to Live
A Christmas Carol (1971 film)
A Close Shave
A Damsel in Distress (1937 film)
A Double Life (1947 film)
A Fantastic Woman
A Farewell to Arms (1932 film)
A Fish Called Wanda
A Free Soul
A Funny Thing Happened on the Way to the Forum (film)
A Girl in the River: The Price of Forgiveness
A Greek Tragedy
A Herb Alpert and the Tijuana Brass Double Feature
A Hole in the Head
A Letter to Three Wives
A Little Night Music (film)
A Little Romance
A Man and a Woman
A Man for All Seasons (1966 film)
A Midsummer Night's Dream (1935 film)
A Note of Triumph: The Golden Age of Norman Corwin
A Passage to India (film)
A Patch of Blue
A Place in the Sun (1951 film)
A Place to Stand (film)
A River Runs Through It (film)
A Room wi

Then, get the mobile website pageviews data

In [33]:
##MOBILE WEB

ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "mobile-web",      # this should be changed for the different access types
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015070100",   # start and end dates need to be set
    "end":         "2023093000"    # this is likely the wrong end date
}

In [35]:
total_views_mobile_web = []
for article in ARTICLE_TITLES:
  print(article)
  views = request_pageviews_per_article(article)
  for month in views['items']:
    del month['access']
    total_views_mobile_web.append(month)

12 Years a Slave (film)
1917 (2019 film)
20 Feet from Stardom
20,000 Leagues Under the Sea (1954 film)
2001: A Space Odyssey (film)
49th Parallel (film)
7 Faces of Dr. Lao
7th Heaven (1927 film)
8 Mile (film)
8½
A Beautiful Mind (film)
A Boy and His Dog (1946 film)
A Chance to Live
A Christmas Carol (1971 film)
A Close Shave
A Damsel in Distress (1937 film)
A Double Life (1947 film)
A Fantastic Woman
A Farewell to Arms (1932 film)
A Fish Called Wanda
A Free Soul
A Funny Thing Happened on the Way to the Forum (film)
A Girl in the River: The Price of Forgiveness
A Greek Tragedy
A Herb Alpert and the Tijuana Brass Double Feature
A Hole in the Head
A Letter to Three Wives
A Little Night Music (film)
A Little Romance
A Man and a Woman
A Man for All Seasons (1966 film)
A Midsummer Night's Dream (1935 film)
A Note of Triumph: The Golden Age of Norman Corwin
A Passage to India (film)
A Patch of Blue
A Place in the Sun (1951 film)
A Place to Stand (film)
A River Runs Through It (film)
A Room wi

Dump both into separate JSON files

In [38]:
monthly_views_mobile_app = open('/content/drive/MyDrive/AUT 2023/DATA 512/academy_monthly_mobile_app_201507-202309.json', "w")
json.dump(total_views_mobile_app, monthly_views_mobile_app, indent = 4)
monthly_views_mobile_app.close()

In [39]:
monthly_views_mobile_web = open('/content/drive/MyDrive/AUT 2023/DATA 512/academy_monthly_mobile_web_201507-202309.json', "w")
json.dump(total_views_mobile_web, monthly_views_mobile_web, indent = 4)
monthly_views_mobile_web.close()

Reading into Pandas dataframes for ease of merging

In [40]:
df_mob_app = pd.read_json('/content/drive/MyDrive/AUT 2023/DATA 512/academy_monthly_mobile_app_201507-202309.json')
df_mob_web = pd.read_json('/content/drive/MyDrive/AUT 2023/DATA 512/academy_monthly_mobile_web_201507-202309.json')

In [41]:
df_mob_app.head()

Unnamed: 0,project,article,granularity,timestamp,agent,views
0,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 13:41:40,user,2575
1,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 16:28:20,user,2435
2,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 19:15:00,user,1921
3,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 22:01:40,user,2045
4,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-09 00:48:20,user,2097


In [42]:
df_mob_web.head()

Unnamed: 0,project,article,granularity,timestamp,agent,views
0,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 13:41:40,user,72883
1,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 16:28:20,user,61656
2,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 19:15:00,user,51995
3,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 22:01:40,user,59643
4,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-09 00:48:20,user,59574


Add pageviews for mobile-web and mobile-app into the total mobile pageviews

In [43]:
df_mob = df_mob_web[['project','article','granularity','timestamp','agent']]
df_mob['views'] = df_mob_web['views'] + df_mob_app['views']
df_mob.head()

Unnamed: 0,project,article,granularity,timestamp,agent,views
0,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 13:41:40,user,75458
1,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 16:28:20,user,64091
2,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 19:15:00,user,53916
3,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-08 22:01:40,user,61688
4,en.wikipedia,12_Years_a_Slave_(film),monthly,2033-11-09 00:48:20,user,61671


Save to JSON file

In [45]:
df_mob.to_json('/content/drive/MyDrive/AUT 2023/DATA 512/academy_monthly_mobile_201507-202309.json',indent=4,orient='records')

For cumulative data, use all-access param

In [46]:
##CUMULATIVE ACCESS

ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "all-access",      # this should be changed for the different access types
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015070100",   # start and end dates need to be set
    "end":         "2023093000"    # this is likely the wrong end date
}

In [48]:
total_views_all_access = []
for article in ARTICLE_TITLES:
  print(article)
  views = request_pageviews_per_article(article)
  for month in views['items']:
    del month['access']
    total_views_all_access.append(month)

12 Years a Slave (film)
1917 (2019 film)
20 Feet from Stardom
20,000 Leagues Under the Sea (1954 film)
2001: A Space Odyssey (film)
49th Parallel (film)
7 Faces of Dr. Lao
7th Heaven (1927 film)
8 Mile (film)
8½
A Beautiful Mind (film)
A Boy and His Dog (1946 film)
A Chance to Live
A Christmas Carol (1971 film)
A Close Shave
A Damsel in Distress (1937 film)
A Double Life (1947 film)
A Fantastic Woman
A Farewell to Arms (1932 film)
A Fish Called Wanda
A Free Soul
A Funny Thing Happened on the Way to the Forum (film)
A Girl in the River: The Price of Forgiveness
A Greek Tragedy
A Herb Alpert and the Tijuana Brass Double Feature
A Hole in the Head
A Letter to Three Wives
A Little Night Music (film)
A Little Romance
A Man and a Woman
A Man for All Seasons (1966 film)
A Midsummer Night's Dream (1935 film)
A Note of Triumph: The Golden Age of Norman Corwin
A Passage to India (film)
A Patch of Blue
A Place in the Sun (1951 film)
A Place to Stand (film)
A River Runs Through It (film)
A Room wi

Dump into JSON file

In [49]:
monthly_views_cumulative = open('/content/drive/MyDrive/AUT 2023/DATA 512/academy_monthly_cumulative_201507-202309.json', "w")
json.dump(total_views_all_access, monthly_views_cumulative, indent = 4)
monthly_views_cumulative.close()