In [77]:
# load library to make headless http calls
import requests
from bs4 import BeautifulSoup
import json
import csv
import re
import sqlite3

# TODO
# author location - done
# get package information - done
# get API calls

In [2]:
def return_subset(kernels):
    dataset = []
    
    for kernel in kernels:
    # get only the data we need
        subset = {
            'displayName': kernel['author']['displayName'],
            'userId': kernel['author']['userId'],
            'profileUrl': kernel['author']['profileUrl'], # will be used later to get location data
            'id': kernel['id'],
            'title': kernel['title'],
            'totalVotes': kernel['totalVotes'], # used for rating sort
            'languageName' : kernel['languageName'],
            'scriptUrl' : kernel['scriptUrl'],
            'scriptVersionDateCreated' : kernel['scriptVersionDateCreated']
        }
        
        # append the subset to the dataset
        dataset.append(subset)
    
    return dataset

# total number of entries to request
request_size = 4500

# size of each http request
page_size = 1000

def get_data():
    
    # create empty list to store data
    master_dataset = []
    
    
    for n in range(0,request_size,page_size):
        url = 'https://www.kaggle.com/kernels.json?sortBy=hotness&group=everyone&pageSize={0}&language=all&outputType=all&isMixedPrivacyLayout=true&startRow={1}'.format(page_size,n)
        print(url)
        page = requests.get(url) 
        kernels = json.loads(page.text)

        # print size of request return
        print(len(kernels))

        master_dataset = master_dataset + return_subset(kernels)

        # write dictionary to JSON
        with open('kernels.json', 'w') as fp:
            json.dump(master_dataset, fp)

        # write dictionary to CSV
        with open('kernels.csv', 'w') as f:  # Just use 'w' mode in 3.x
            w = csv.DictWriter(f, master_dataset[0].keys())
            w.writeheader()
            w.writerows(master_dataset)


    




In [3]:
# get location information from author profile page
def get_author_location(profile_url):
    url = 'https://www.kaggle.com{0}'.format(profile_url)
    
    # make http request for page
    page = requests.get(url) 
    
    # find location key value pairs in source using regex
    match_object = re.findall( r'\"(country|region|city)\":\"(.*?)\"', page.text, re.I|re.M)
    
    # turn match object into dictionary to be merged into kernel object
    location_data = { match[0]:match[1] for match in match_object }
    
    #example: {'country': 'United Kingdom', 'region': 'England', 'city': 'London'}
    
    return location_data

get_author_location('/pmarcelino')



{'city': 'Lisbon', 'country': 'PT'}

In [4]:
# get package list from source code of kernel
# does not look for combined import statements

def get_package_list(code_url):
    url = 'https://www.kaggle.com{0}/code'.format(code_url)
    
    # make http request for page
    page = requests.get(url) 
    
    # find import references using regex
    match_object = re.findall( r'import (\w*)', page.text, re.I|re.M)
    
    # get unique entries
    package_list = list(set(match_object))
    
    # filter out non-package names
    filter_words = ['the']
    package_list = list(set(package_list) - set(filter_words))
    
    return package_list
    
get_package_list('/pmarcelino/comprehensive-data-exploration-with-python')

['stats',
 'norm',
 'StandardScaler',
 'seaborn',
 'matplotlib',
 'numpy',
 'pandas']

In [96]:
# get a count of all packages

conn=sqlite3.connect("Kernels_db.db")

conn.row_factory = sqlite3.Row

c = conn.cursor()

c.execute("SELECT * FROM Kernels")
results=c.fetchall()

all_packages = []

for row in results:
    packages = dict(row)['packages'].split(",")
    all_packages = all_packages + packages

    
packages_count = collections.Counter(all_packages)

packages_count.most_common(10)

[('numpy', 1219),
 ('pandas', 1211),
 ('matplotlib', 794),
 ('seaborn', 526),
 ('ggplot2', 319),
 ('check_output', 302),
 ('dplyr', 260),
 ('xgboost', 256),
 ('train_test_split', 235),
 ('', 206)]

In [44]:
# only get python code
# look for only top 5 libraries
# methods (api) calls two levels deep
# deficiencies: possibility of inaccurate class/method counts due to name overlaps between libraries

from urllib.request import urlopen
import nbformat
import io
import tokenize
from nbconvert import PythonExporter

def strip_comments(code):
    code = str(code)
    return re.sub(r'(?m)^ *#.*\n?', '', code)
   
def get_source_code(url):
    
    # read notebook from url
    response = urlopen(url).read().decode()

    # read http response into nbconvert object
    notebook = nbformat.reads(response, as_version=4)
    
    # instantiate PythonExporter  
    py_exporter = PythonExporter()

    # convert notebook into python source (ignoring markup, etc)
    script, resources = py_exporter.from_notebook_node(notebook)
    
    # remove all comments from source
    raw_script = stripComments(script)
    
    return raw_script

url = "https://kagglesds.blob.core.windows.net/script-versions/1766385/notebook/__notebook__.ipynb?sv=2015-12-11&sr=b&sig=ytykV0edLA8D93%2Fn5eNntNI5%2Fd8q9aI%2F9%2F2Rmxx0aQw%3D&se=2017-11-18T03%3A41%3A04Z&sp=r"

code = get_source_code(url)



In [75]:
import seaborn
import numpy
import pandas


import collections

#read file for testing
py_source_test = open("test.py",'r').read()

# get class names for a library
def get_library_classes(name):
    names = dir(eval(name))    
    class_names = list(filter(lambda x: x[0].isupper(), names))
    return class_names

# get method names for a library or class
def get_library_methods(name):
    names = dir(eval(name))    
    class_names = list(filter(lambda x: x[0].islower(), names))
    return class_names

# def search_for_methods(method_names):

names = get_library_methods("seaborn")


match_object = re.findall( r'.(pairplot|lmplot|regplot)\(', py_source_test, re.I|re.M)


match_object

collections.Counter(match_object)

Counter({'lmplot': 1, 'pairplot': 1, 'regplot': 3})