# Top Kaggle Libraries
Which libraries are used in Kaggle submissions?

In [140]:
from src.data.utils import *
from glob import glob
from tqdm.notebook import tqdm
from collections import Counter
import dis

import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

In [6]:
kaggle_competitions = glob("/projects/bdata/datasets/kaggle-competitions/raw/*/")

In [34]:
def get_slug_imports(versions):
    slug_imports = set()
    for version in versions:
        for cell in version["cells"]:
            if cell["cell_type"] == "code":
                try:
                    instructions = dis.get_instructions(cell["source"])
                except (SyntaxError, TypeError):
                    continue
                imports = [__.argval for __ in instructions if 'IMPORT_NAME' in __.opname]
                slug_imports = slug_imports.union(set(imports))
    return slug_imports

In [37]:
library_counts = Counter()
for competition in tqdm(kaggle_competitions):
    comp_reader = CompetitionReader(competition)
    slug_imports = comp_reader.apply_to_slugs(get_slug_imports)
    [library_counts.update(x) for x in slug_imports]

HBox(children=(FloatProgress(value=0.0, max=404.0), HTML(value='')))




In [41]:
library_counts.most_common(10)

[('numpy', 12442),
 ('pandas', 12439),
 ('os', 9171),
 ('matplotlib.pyplot', 7092),
 ('sklearn.model_selection', 6130),
 ('sklearn.metrics', 4561),
 ('seaborn', 4103),
 ('sklearn.preprocessing', 3255),
 ('sklearn', 2570),
 ('sklearn.ensemble', 2527)]

In [42]:
base_lib_counts = Counter()
for k, v in library_counts.items():
    base_lib_counts.update([k.split(".")[0]] * v)

In [51]:
base_lib_counts = pd.DataFrame(base_lib_counts.most_common(50), columns = ["library","slug_count"])

In [52]:
base_lib_counts

Unnamed: 0,library,slug_count
0,sklearn,29947
1,keras,15364
2,pandas,12696
3,numpy,12556
4,matplotlib,9414
5,os,9349
6,tensorflow,5009
7,seaborn,4103
8,torch,3667
9,scipy,2501


In [149]:
base_lib_counts.to_json("/homes/gws/mikeam/RobustDataScience/data/processed/kaggle_most_common_libraries.jsonl",
                        orient = "records", lines = True)

In [55]:
!head -n5 /homes/gws/mikeam/RobustDataScience/data/processed/kaggle_most_common_libraries.jsonl

{"library":"sklearn","slug_count":29947}
{"library":"keras","slug_count":15364}
{"library":"pandas","slug_count":12696}
{"library":"numpy","slug_count":12556}
{"library":"matplotlib","slug_count":9414}


In [86]:
from treelib import Tree, Node
from enum import Enum
import inspect
import pkgutil
import sys

In [143]:
class PackageNode(Enum):
    MODULE = 1
    CLASS = 2
    FUNCTION = 3
    PARAM = 4

In [73]:
PackageNode.MODULE

<PackageNode.MODULE: 1>

In [147]:
def add_param_nodes(fcn,parent,tree):
#     print(fcn,parent)
    try:
        params = inspect.signature(fcn).parameters.values()
    except ValueError:
        return
        
    for param in inspect.signature(fcn).parameters.values():
        if param.kind == param.POSITIONAL_OR_KEYWORD:
            param_id = ".".join([parent,param.name])
            tree.create_node(param.name, param_id, parent=parent, data = PackageNode.PARAM)

package=pd
package_tree = Tree()
package_name = str(pd.__name__)
package_tree.create_node(package_name, package_name,data = PackageNode.MODULE)

for importer, modname, ispkg in pkgutil.walk_packages(path=package.__path__,
                                                       prefix=package.__name__+'.',
                                                        onerror=lambda x: None):
    pkg_name = modname.split(".")[-1]
    pkg_parent = ".".join(modname.split(".")[:-1])
    package_tree.create_node(pkg_name,modname, data = PackageNode.MODULE, parent = pkg_parent)

    #Crawl through the package
    try:
        pkg = sys.modules[modname]
    except KeyError:
        continue
        
    print(modname)
    for name, obj in inspect.getmembers(pkg, lambda x: inspect.isclass(x) or inspect.isfunction(x)):
        pkg_key = ".".join([modname,name])
        
        if inspect.isfunction(obj):

            package_tree.create_node(name, pkg_key, data = PackageNode.FUNCTION, parent = modname)
            add_param_nodes(obj, pkg_key, package_tree)
            
        elif inspect.isclass(obj):
            package_tree.create_node(name, pkg_key , data = PackageNode.CLASS, parent = modname)
            for attr_name in dir(obj):
                attr = getattr(obj, attr_name)

                if callable(attr):
                    if attr_name == "__init__":
                        add_param_nodes(attr, pkg_key, package_tree)
                    elif re.match(r"__.*__",attr_name):
                        continue
                    else:
                        attr_key = ".".join([pkg_key,attr_name])
                        package_tree.create_node(attr_name, attr_key, data = PackageNode.FUNCTION, parent = pkg_key)
                        add_param_nodes(attr,attr_key,package_tree)
                        

pandas._config
pandas._config.config
pandas._config.dates
pandas._config.display
pandas._config.localization
pandas._libs
pandas._libs.algos
pandas._libs.groupby
pandas._libs.hashing
pandas._libs.hashtable
pandas._libs.index
pandas._libs.indexing
pandas._libs.internals
pandas._libs.interval
pandas._libs.join
pandas._libs.json
pandas._libs.lib
pandas._libs.missing
pandas._libs.ops
pandas._libs.ops_dispatch
pandas._libs.parsers
pandas._libs.properties
pandas._libs.reduction
pandas._libs.reshape
pandas._libs.sparse
pandas._libs.testing
pandas._libs.tslib
pandas._libs.tslibs
pandas._libs.tslibs.c_timestamp
pandas._libs.tslibs.ccalendar
pandas._libs.tslibs.conversion
pandas._libs.tslibs.fields
pandas._libs.tslibs.frequencies
pandas._libs.tslibs.nattype
pandas._libs.tslibs.np_datetime
pandas._libs.tslibs.offsets
pandas._libs.tslibs.parsing
pandas._libs.tslibs.period
pandas._libs.tslibs.resolution
pandas._libs.tslibs.strptime
pandas._libs.tslibs.timedeltas
pandas._libs.tslibs.timestamps
panda

DuplicatedNodeIdError: Can't create node with ID 'pandas._testing.DataFrame.copy'

<treelib.tree.Tree at 0x7f60460c0be0>

In [132]:
package_tree.depth()

6

In [141]:
re.match(r"__.*__","__init__")

<re.Match object; span=(0, 8), match='__init__'>