In [1]:
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen, urlretrieve
from selenium import webdriver
from tqdm import tqdm
import os, re, time, random
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing
import pandas as pd
from functools import reduce
import inspect
from selenium.webdriver.common.keys import Keys

# Analyze Pandas usage in Kaggle Kernels

## 1. Get urls of top kernels by vote

In [89]:
browser = webdriver.Chrome(os.getcwd() + '/dependencies/chromedriver')

browser.get("https://www.kaggle.com/kernels?sortBy=votes&group=everyone&pageSize=20&language=Python")
time.sleep(1)

elem = browser.find_element_by_tag_name("body")

no_of_pagedowns = 150

while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(1)
    no_of_pagedowns-=1

soup = BeautifulSoup(browser.page_source, "html.parser")

In [125]:
kernel_links = ['https://www.kaggle.com' + a['href'] + '/code' for a in soup.find_all("a", class_="block-link__anchor") if ('teaching-notebook-for-total-imaging-newbies' not in a['href'] and 'mentions-of-kaggle-on-hacker-news' not in a['href'])]

In [131]:
len(kernel_links)

1858

## 2. Get url of code files from kernels page

In [171]:
browser = webdriver.Chrome('/Users/adithya/Documents/university/research/scrapingKaggle/chromedriver')

In [172]:
def get_code_from_kernel(kernel_link):
    if ('stkbailey' in kernel_link or 
        'mentions-of-kaggle-on-hacker-news' in kernel_link or 
        'keras-rcnn-based-overview-wip' in kernel_link or 
        'why-employees-leave-and-how-to-predict-it' in kernel_link or 
        'boosting-stacking-and-bayes-searching' in kernel_link or
        'resnet50-features-xgboost' in kernel_link or 
        'github-commit-messages' in kernel_link):
        return None
    try:
        browser.get(kernel_link)
        innerHTML = browser.execute_script("return document.body.innerHTML")
        soup = BeautifulSoup(innerHTML, 'html.parser')

        link = soup.find_all("a", class_="script-code-pane__download")[0]['href']

        return link
    except Exception as e:
        print("E", e)
        return None

In [173]:
def pull_partition(start, end, partition):
    code_links_write_part = []
    for kl in tqdm(kernel_links[start:end]):
        code_links_write_part.append(get_code_from_kernel(kl))
        time.sleep(1)

    code_file_write_part = open('links/code_links_part' + str(partition) + '.txt', 'w')
    for code_link in code_links_write_part:
        code_file_write_part.write("%s\n" % code_link)
        
    print("Finished partition %d" % partition)

In [None]:
STEP = 100
partitions = [(STEP*i, STEP*(i+1), i) for i in range(len(kernel_links)//STEP + 1)]
for p in partitions[17:]:
    pull_partition(p[0], p[1], p[2])

In [None]:
code_links_read = []
for i in range(len(partitions)):
    with open('links/code_links_part' + str(i) + '.txt', 'r') as code_file_read:
        code_links_read += code_file_read.read().splitlines()

In [None]:
code_links = ['https://www.kaggle.com' + cl for cl in code_links_read]
code_links[:5]

## 3. Pull raw code file to local dir

In [None]:
for code_link in tqdm(code_links):
    !wget --content-disposition $code_link -P data

## 4. Clean non-Python files from downloaded

In [None]:
for filename in os.listdir("data"):
    if filename == ".DS_Store":
        pass
    
    name_parts = filename.split(".")
    if len(name_parts) == 2:
        name_parts.append("0")
        
    if name_parts[1] in ['ipynb', 'py']:
        new_name = name_parts[0] + name_parts[2] + "." + name_parts[1]
        new_name = 'r' + new_name if name_parts[1] == 'py' else new_name
        os.rename('data/' + filename, 'data/' + new_name)

    else:
        os.remove('data/' + filename)

## Tokenize Notebooks and Scripts

In [None]:
# Analyze python script
pd_search_tokens = ["." + p[0] for p in inspect.getmembers(pd) if p[1].__class__.__name__ == 'function'] + ['.DataFrame']
df_search_tokens = ["." + p[0] for p in inspect.getmembers(pd.DataFrame) if p[1].__class__.__name__ == 'function']

def parse_script(script_name):
    with open(script_name) as script_file:
        code = script_file.read().splitlines()

    ps = {}
    for line in code:
        for token in pd_search_tokens:
            ct = line.count(token)
            if ct > 0 and token not in ps:
                ps['pd' + token] = ct
            elif ct > 0:
                ps['pd' + token] += ct
    
        for token in df_search_tokens:
            ct = line.count(token)
            if ct > 0 and token not in ps:
                ps['df' + token] = ct
            elif ct > 0:
                ps['df' + token] += ct
    
    return ps

In [None]:
all_methods = []
for filename in os.listdir("data"):
    if filename.split(".")[1] == 'py':
        all_methods.append(parse_script('data/' + filename))
    else:
        pfilename = 'data/' + filename
        cfilename = filename.split(".")[0] + '.py'
        if cfilename not in os.listdir("data"):
            !jupyter nbconvert --to=python $pfilename
            time.sleep(1)
        try:
            all_methods.append(parse_script('data/' + cfilename))
        except:
            pass

In [86]:
methods_write = open('results/methods.txt', 'w')
for method in all_methods:
    methods_write.write("%s\n" % method)

In [87]:
print(len(all_methods))

932


In [84]:
base = {}
for dict_method in all_methods:
    for k in dict_method:
        if k not in base:
            base[k] = 0 
        base[k] += dict_method[k]

In [88]:
m, c = [], []
for method, count in base.items():
    m.append(method)
    c.append(count)
    
mFreq = pd.DataFrame(data = {'Method' : m, 'Count': c}).set_index('Method').sort_values('Count', ascending=False)
mFreq.to_csv('results/results.csv')
mFreq.head(10)

Unnamed: 0_level_0,Count
Method,Unnamed: 1_level_1
pd.read_csv,756
pd.DataFrame,469
df.head,438
df.append,436
df.mean,429
df.drop,422
df.sum,414
df.ge,401
df.to_csv,378
df.get,376
