In [1]:
import requests
from requests import get
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

In [2]:
def flatten_dict(df, column_name):
    df1 = df.copy()
    flatten_col = json_normalize(df1[column_name].tolist())
    df1 = pd.concat([df1,flatten_col], sort = False, axis = 1)
    df1 = df1.drop(column_name, axis = 1)
    return df1

Read the API here...

https://www.mediawiki.org/wiki/API:Revisions

In [3]:
BASE_URL = "http://en.wikipedia.org/w/api.php"
TITLE = 'California Polytechnic State University'

parameters = { 'action': 'query',
           'format': 'json',
           'continue': '',
           'titles': TITLE,
           'prop': 'revisions',
           'rvprop': 'ids|userid|user|timestamp|comment|content|tags',
           'rvlimit': 'max'}

wp_call = requests.get(BASE_URL, params=parameters)
response = wp_call.json()

In [4]:
total_revisions = 0
pages = []
counter = 0
num_calls = 10
while True:
    wp_call = requests.get(BASE_URL, params=parameters)
    response = wp_call.json()
    
    val = str(json_normalize(response,["query","pages"])[0][0])
    df = json_normalize(response,["query","pages",val,"revisions"])
    pages.append(df)

    for page_id in response['query']['pages']:
        total_revisions += len(response['query']['pages'][page_id]['revisions'])

    if 'continue' in response:
        parameters['continue'] = response['continue']['continue']
        parameters['rvcontinue'] = response['continue']['rvcontinue']
        counter += 1
    else:
        break
    if counter == num_calls:
        break

In [5]:
revisions = pd.concat(pages, sort = False)

In [6]:
def ipInfo(addr=''):
    from urllib.request import urlopen
    from json import load
    if addr == '':
        url = 'https://ipinfo.io/json'
    else:
        url = 'https://ipinfo.io/' + addr + '/json'
    res = urlopen(url)
    #response from url(if res==None then check connection)
    data = load(res)
    #will load the json response into data
    dat = pd.DataFrame(data,index = range(len(data)))
    return dat

In [7]:
location_info = revisions[revisions["userid"] == 0]["user"].apply(ipInfo)

In [8]:
loc_df = pd.concat(location_info.values, sort = False, ignore_index = True)

In [9]:
def get_fips(loc):
    loc = loc.split(",")
    lat = loc[0]
    long = loc[1]
    url = ("https://geo.fcc.gov/api/census/block/find?latitude={0}&longitude={1}&format=json".
           format(lat,long))
    res = requests.get(url)
    data = res.json()
    return data["County"]["FIPS"]

In [10]:
unique_loc = loc_df.drop_duplicates()

unique_loc["FIPS"] = unique_loc["loc"].apply(get_fips)

loc_fips = loc_df.merge(unique_loc[["city","FIPS"]].drop_duplicates(),
                         on ="city")
loc_fips = loc_fips[(loc_fips["FIPS"] != "None") & ~(loc_fips["FIPS"].isna())]
loc_fips["FIPS"] = loc_fips["FIPS"].astype("int64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
import plotly.plotly as py
import plotly.figure_factory as ff

values = loc_fips["FIPS"].value_counts().values
fips = loc_fips["FIPS"].value_counts().index

endpts = list(np.mgrid[min(values):max(values):4j])
colorscale = ["#030512","#1d1d3b","#323268","#3d4b94","#3e6ab0",
              "#4989bc","#60a7c7","#85c5d3","#b7e0e4","#eafcfd"]

fig = ff.create_choropleth(
    fips=fips, values=values, show_state_data=True,
    colorscale=colorscale, binning_endpoints=endpts, round_legend_values=True,
    plot_bgcolor='rgb(229,229,229)',
    paper_bgcolor='rgb(229,229,229)',
    legend_title='Edits by County',
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},
    exponent_format=True,
    title = 'Map of Anonymous Revisions for Cal Poly Wiki in US',
)
py.iplot(fig, filename='choropleth_california')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Consider using IPython.display.IFrame instead



In [12]:
import difflib

In [13]:
def _unidiff_output(expected, actual):
    expected=expected.splitlines(1)
    actual=actual.splitlines(1)
    diff=difflib.unified_diff(expected, actual)
    return ''.join(diff)


In [14]:
print(_unidiff_output(revisions["*"].iloc[0],revisions["*"].iloc[22]))

--- 
+++ 
@@ -17,7 +17,7 @@
 | postgrad = 881 (Fall 2016)<ref name=Enrollment/>
 | city = [[San Luis Obispo, California|San Luis Obispo]]
 | state = [[California]]
-| country = United States
+| country = U.S.
 | campus = Suburban, {{convert|9678|acre|ha}} Total; {{convert|1321|acre|ha}} for the Main Campus; {{convert|155|acre|ha}} for the Campus Core<ref name=quickfacts2012>{{cite web|url=http://www.calpoly.edu/aboutcp/our_campus.html |title=Our Campus - About Cal Poly - Cal Poly, San Luis Obispo |publisher=www.calpoly.edu |accessdate=July 17, 2016}}</ref>
 | colors = Green and Gold<ref>{{cite web|url=https://universitymarketing.calpoly.edu/resources/graphic-standards/primary-colors/|title=Cal Poly Marketing -   Primary Colors|website=universitymarketing.calpoly.edu|accessdate=16 May 2018}}</ref><br/>{{color box|#035642}}&nbsp;{{color box|#B4A76C}}
 | mascot = Musty the Mustang
@@ -28,7 +28,7 @@
 | logo = [[File:Cal Poly San Luis Obispo wordmark.svg|150px]]
 }}
 
-'''California Polytec

In [15]:
revisions["timestamp"]

0     2019-04-25T07:41:33Z
1     2019-04-25T07:07:00Z
2     2019-04-25T07:06:19Z
3     2019-04-25T07:05:55Z
4     2019-04-25T06:58:12Z
5     2019-04-20T23:15:23Z
6     2019-04-19T20:52:23Z
7     2019-04-19T20:52:02Z
8     2019-04-03T11:12:50Z
9     2019-04-03T11:12:24Z
10    2019-04-03T04:50:17Z
11    2019-04-03T04:45:08Z
12    2019-03-13T13:10:54Z
13    2019-03-13T08:54:08Z
14    2019-03-13T08:50:27Z
15    2019-03-13T08:49:37Z
16    2019-03-13T08:47:40Z
17    2019-03-13T08:46:59Z
18    2019-03-13T08:44:58Z
19    2019-03-13T08:43:37Z
20    2019-03-13T08:43:24Z
21    2019-03-13T08:40:34Z
22    2019-03-05T19:18:41Z
23    2019-02-20T06:20:19Z
24    2019-01-25T14:18:13Z
25    2019-01-24T15:17:09Z
26    2019-01-24T15:16:45Z
27    2019-01-24T02:36:19Z
28    2019-01-17T23:34:12Z
29    2019-01-17T23:34:06Z
              ...         
20    2014-12-05T21:03:25Z
21    2014-12-03T19:18:23Z
22    2014-11-28T08:11:24Z
23    2014-11-21T22:27:10Z
24    2014-11-21T22:27:01Z
25    2014-11-21T16:33:49Z
2