# Objective: Script that crawls Wikipedia, finds revision history pages, and then looks for the IP addresses on those revision history pages.


In [2]:
# Test call to freegeoip.net
import json
from urllib2 import urlopen

def getCountry(ipAddress):
    """Ask freegeoip.net about ``ipAddress`` and return its country code.

    The address is appended to the service's JSON endpoint, the reply is
    decoded as UTF-8 and parsed as JSON. The value stored under
    ``"country_code"`` is returned, or ``None`` when the reply has no such key.
    """
    lookupUrl = "http://freegeoip.net/json/" + ipAddress
    rawReply = urlopen(lookupUrl).read()
    geoInfo = json.loads(rawReply.decode('utf-8'))
    return geoInfo.get("country_code")

# Smoke test: look up one sample address and print the country code that comes back.
print(getCountry("50.78.253.58"))

US


In [3]:
# Test call to understand JSON return calls
import json

# Sample payload: two top-level keys, each holding an array of single-key objects.
jsonString = '{"arrayOfNums":[{"number":0}, \
                              {"number":1}, \
                              {"number":2}], \
              "arrayOfFruits":[{"fruit":"apple"}, \
                               {"fruit":"banana"}, \
                               {"fruit":"pear"}]}'
jsonObj = json.loads(jsonString)
numberEntries = jsonObj["arrayOfNums"]
fruitEntries = jsonObj["arrayOfFruits"]

# A JSON array decodes to a list of dictionary objects
print(numberEntries)

# Each element of that list is a dictionary object
print(numberEntries[1])

# The numbers decode to integers, so they can be added (1 + 2)
print(numberEntries[1]["number"] + numberEntries[2]["number"])

# The fruit names decode to strings
print(fruitEntries[2]["fruit"])

[{u'number': 0}, {u'number': 1}, {u'number': 2}]
{u'number': 1}
3
pear


In [4]:
'''
Script that crawls Wikipedia, finds revision history pages, and then looks for the IP addresses on those revision
history pages.

Steps followed:

1. Retrieves the histories of all Wikipedia articles linked to by the starting page -> getLinks()
2. It then selects a new page randomly and retrieves all revision history pages of articles linked to by that page ->
getHistoryIPs()
3. Maps the IP addresses to the countries they are found in using freegeoip API -> getCountry()
4. It will continue until it hits a page with no links.

'''
from urllib2 import urlopen
from urllib2 import HTTPError
from bs4 import BeautifulSoup
import datetime
import json
import random
import re

# Seed the random module from the current timestamp; the crawl loop below uses it to pick the next page.
# NOTE(review): newer Python 3 releases reject a datetime object as a seed (TypeError) -- confirm before porting.
random.seed(datetime.datetime.now())

# Find all internal article links in the body of the Wikipedia page at the given path.
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    Parameters:
        articleUrl: site-relative path of the page (for example
            "/wiki/Python_(programming_language)"); it is appended to
            "http://en.wikipedia.org".

    Returns:
        The result of ``findAll`` on the page's ``<div id="bodyContent">``:
        the ``<a>`` tags whose ``href`` starts with ``/wiki/`` and contains no
        colon. An empty list is returned when the page has no such div, so a
        caller looping "while there are links" stops cleanly instead of
        failing with AttributeError.

    Raises:
        Whatever ``urlopen`` raises for the request (not handled here).
    """
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    try:
        bsObj = BeautifulSoup(html, "html.parser")
    finally:
        # The markup has been handed to the parser; release the connection.
        html.close()

    bodyContent = bsObj.find("div", {"id": "bodyContent"})
    if bodyContent is None:
        return []

    # Keep only hrefs that start with /wiki/ and contain no ":" anywhere
    # (presumably to leave out namespaced pages such as File: or Talk:).
    return bodyContent.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))


# 
def getHistoryIPs(pageUrl):
    """Collect the anonymous-editor labels from an article's revision history page.

    Parameters:
        pageUrl: article path as found in a link href, e.g. "/wiki/Some_Title".

    Returns:
        A set with the text of every ``<a class="mw-anonuserlink">`` element
        on the history page (these links show an IP address in place of a
        username). Duplicates collapse because the result is a set.
    """
    # Revision history pages follow the pattern
    # http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    articleTitle = pageUrl.replace("/wiki/", "")
    historyUrl = "".join(["http://en.wikipedia.org/w/index.php?title=", articleTitle, "&action=history"])

    print("history url is: " + historyUrl)

    historySoup = BeautifulSoup(urlopen(historyUrl), "html.parser")

    # Anonymous edits are rendered as links carrying the "mw-anonuserlink" class
    anonymousEditors = historySoup.findAll("a", {"class":"mw-anonuserlink"})
    return set(editor.get_text() for editor in anonymousEditors)


def getCountry(ipAddress):
    """Look up the country code for an IP address via the freegeoip.net JSON API.

    Parameters:
        ipAddress: the address text taken from an anonymous edit.

    Returns:
        The value stored under "country_code" in the service's JSON reply, or
        None when the request fails with an HTTPError, when the reply is not
        valid JSON, when it is not a JSON object, or when it has no
        "country_code" key.
    """
    try:
        # Use ipAddress obtained from user that made edit and feed to freegeoip.net API
        response = urlopen("http://freegeoip.net/json/" + ipAddress)
        try:
            body = response.read().decode('utf-8')
        finally:
            # Always release the connection, even if reading or decoding fails.
            response.close()
    except HTTPError:
        return None

    # Parse the reply; a body that is not JSON is treated like a failed
    # lookup so that one bad reply does not abort the caller's crawl.
    try:
        responseJson = json.loads(body)
    except ValueError:
        return None

    if not isinstance(responseJson, dict):
        return None

    # Get country code from responseJson object.
    return responseJson.get("country_code")
    

# Starting link
links = getLinks("/wiki/Python_(programming_language)")


# Crawl until a page yields no article links: report the anonymous-edit IPs
# for every article linked from the current page, then hop to one of those
# articles chosen at random.
while links:
    for link in links:
        print("-------------------")

        # Get ipAddresses from this article's revision history page
        historyIPs = getHistoryIPs(link.attrs["href"])

        for historyIP in historyIPs:
            country = getCountry(historyIP)
            # None means no country code was obtained for this address; skip it
            if country is not None:
                print(historyIP + " is from " + country)

    # Pick the next page uniformly at random from the current page's links
    newLink = random.choice(links).attrs["href"]
    links = getLinks(newLink)

-------------------
history url is: http://en.wikipedia.org/w/index.php?title=Python_(genus)&action=history
66.61.83.123 is from US
76.1.125.190 is from US
69.193.219.186 is from US
148.88.244.103 is from GB
168.212.252.44 is from US
206.176.81.232 is from US
96.10.134.242 is from US
108.192.108.85 is from US
197.161.61.73 is from EG
-------------------
history url is: http://en.wikipedia.org/w/index.php?title=Python_(disambiguation)&action=history
-------------------
history url is: http://en.wikipedia.org/w/index.php?title=Programming_paradigm&action=history
216.186.131.29 is from US
43.252.233.5 is from MY
2607:fb90:5223:c275:0:17:a6be:ca01 is from US
162.247.124.52 is from CA
68.8.169.121 is from US
213.30.118.68 is from PT
196.47.102.43 is from NG
97.32.131.205 is from US
66.87.64.75 is from US
-------------------
history url is: http://en.wikipedia.org/w/index.php?title=Multi-paradigm_programming_language&action=history
-------------------
history url is: http://en.wikipedia.org/

KeyboardInterrupt: 