In [2]:
import pandas as pd
import json
import numpy as np

import dask
import dask.bag as db
import dask.dataframe as dd
# from dask.dot import dot_graph
import re
from dask.diagnostics import ProgressBar
import html

import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

import scipy.sparse as sp

In [3]:
from dask.distributed import Client, LocalCluster
# cluster = LocalCluster()
# client = Client(cluster,threads_per_worker = 10)

In [4]:
client = Client()

In [5]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:29269  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 12  Cores: 96  Memory: 3.25 TB


In [6]:

def extract_column_value(line, col_name, cast_type=str):
    pattern_tpl = r'{col}="([^"]*)"'
    pattern = pattern_tpl.format(col=col_name)
    match = re.search(pattern, line)

    if cast_type == int:
        null_value = 0
    else:
        null_value = None

    return cast_type(match[1]) if match is not None else null_value


def extract_comments_columns(line):
    text = extract_column_value(line, 'Text', str)
    row = {
        'id': extract_column_value(line, 'Id', int),
        'post_id':extract_column_value(line, 'PostId', str),
        'text': text ,
        'code_snippets': extract_code(text)
    }
    return row

def explode(df,col):
    return df.explode(col)


In [7]:
def extract_code(text):
    CODE_SNIPPETS_REGEX = r"(?<=<code>)(.*?)(?=<\/code>)"
    return re.findall(CODE_SNIPPETS_REGEX,text)

def extract_posts_columns(line):
    body = html.unescape(extract_column_value(line, 'Body', str))
    row = {
        'id': extract_column_value(line, 'Id', int),
        'post_type': extract_column_value(line, "PostTypeId", int),
        'parent_id' : extract_column_value(line, "ParentId", int),
        'answer_count': extract_column_value(line, 'AnswerCount', int),
        'tags': extract_column_value(line, 'Tags', str),
        'body': body,
        'code_snippets': extract_code(body)
    }
    return row


def is_fruitful_question(line):
    return (line.get("post_type") == 1) and  (line.get("answer_count") > 0) 

def is_response(line):
    return line.get("post_type") == 2

def is_python(line):
    return "python" in line["tags"]

def filter_post_line(line):
    return is_fruitful_question(line) or is_response(line)

In [8]:
posts = db.read_text('/projects/bdata/stackoverflow/stackoverflow/Posts.xml', encoding = 'utf-8',
                           blocksize=10000000)\
                        .filter(lambda line: line.find('<row') >= 0)\
                        .map(extract_posts_columns)\
                        .filter(filter_post_line)\
                        .to_dataframe()\
                        .map_partitions(lambda x: explode(x,"code_snippets"))\
                        .compute()



In [9]:
python_questions = posts[(posts["post_type"] == 1) & (posts["tags"].str.contains("python"))]["id"].drop_duplicates()

In [10]:
python_posts = posts.merge(python_questions , how="right", 
                           left_on = (posts["post_type"] == 1)*(posts["id"]) + (posts["post_type"] != 1)*(posts["parent_id"]),
                           right_on = "id")
python_posts["question_key"] = (python_posts["post_type"] == 1)*(python_posts["id"]) + \
                               (python_posts["post_type"] != 1)*(python_posts["parent_id"])

python_posts = python_posts[["question_key","id","answer_count","body","code_snippets"]]
python_posts = python_posts.dropna(subset = ["code_snippets"])

I don't just want to do the edit distance like we had with Kaggle. Maybe there's something more interesting we can do with co-occurence.

In [285]:
def tokenize_code_snippet(code):
    try:
        no_chars = re.sub('[^a-zA-Z\n]+', ' ', code)
    except TypeError:
        print(code)
    tokens = split_func_name(no_chars)
    return tokens
    
def split_func_name(func):
    """
    split function names
    eg. sklearn.metrics.pairwise.cosine_similarity -> [sklearn, metrics, pairwise, cosine, similarity]
    """
    new_str = ''
    for i, l in enumerate(func):
#         if i > 0 and l.isupper() and func[i - 1].islower():
#             new_str += '.'
        if i > 0 and i < len(func) - 1 and l.isupper() and func[i - 1].isupper() and func[i + 1].islower():
            new_str += '.'
        elif i > 0 and l.isdigit() and func[i - 1].isalpha():
            new_str += '.'
        elif i < len(func) - 1 and l.isalpha() and func[i - 1].isdigit():
            new_str += '.'
        else:
            pass
        new_str += l
    return re.split('\.|_|\s', new_str.lower())

In [294]:
split_func_name("sklearn.OneHotEncoder")

['sklearn', 'onehotencoder']

In [303]:
def cooccurence(X,max_features = int(1e4),diag_norm = False):
    vectorizer = CountVectorizer(max_features=max_features, tokenizer=tokenize_code_snippet)
    vectors = (vectorizer.fit_transform(X) > 0).astype(int)
    res = np.dot(vectors.T,vectors)
    if diag_norm:
        g = sp.diags(1./res.diagonal())
        res = g * res
    return res, vectorizer.vocabulary_

In [288]:
question_code = python_posts.groupby("question_key")['code_snippets'].transform(' '.join).drop_duplicates()

In [289]:
so_code_cooccurence = cooccurence(question_code) 

In [290]:
def display_top_cooccurences(token,matrix,vocab,top_n = 5):
    query_index = vocab.get(token)
    if query_index is None:
        raise KeyError("Not in Vocab")
    else:
        reverse_index = {v: k for k, v in vocab.items()}
        row = matrix[query_index].toarray().flatten()
        top_occ = (-row).argsort()[:top_n]
        
        keys = [reverse_index.get(x) for x in top_occ ] 
        counts = row[top_occ]
        print(counts)
        print(keys)

In [296]:
display_top_cooccurences("onehotencoder",*so_code_cooccurence)

[158  48  45  44  36]
['onehotencoder', 'labelencoder', 'sklearn', '', 'dummies']


In [297]:
display_top_cooccurences("cosine",*so_code_cooccurence,top_n=20)

[140  52  52  49  37  28  26  24  20  19  17  16  16  16  14  14  13  13
  12  12]
['cosine', 'distance', 'similarity', '', 'scipy', 'spatial', 'np', 'a', 'x', 'sklearn', 'pairwise', 'metrics', 'matrix', 'b', 'i', 'd', 'n', 'numpy', 'dot', 'y']


In [299]:
display_top_cooccurences("ols",*so_code_cooccurence)

[105  43  42  39  28]
['ols', 'statsmodels', 'x', 'y', '']


In [301]:
display_top_cooccurences("svm",*so_code_cooccurence,top_n = 20)

[172  67  64  55  35  34  30  27  25  24  18  17  17  15  15  14  14  13
  12  12]
['svm', '', 'sklearn', 'svc', 'predict', 'x', 'fit', 'y', 'clf', 'train', 'model', 'c', 'kernel', 'linearsvc', 'linear', 'scikit', 'libsvm', 'n', 'python', 'learn']


# Top Libraries
Which libraries are most commonly referenced on StackOverflow?

In [20]:
library_regex = re.compile("^\s*(?:from|import)\s+(\w+(?:\s*,\s*\w+)*)")
top_libraries = python_posts["code_snippets"].str.extract(library_regex)

In [24]:
top_libraries[0].value_counts().sort_values(ascending = False)[:30]

__future__    1230
numpy          752
foo            657
sys            565
django         555
module         551
os             532
tkinter        501
matplotlib     383
pandas         303
datetime       302
package        283
math           275
x              255
a              241
random         239
tensorflow     238
app            234
time           230
re             226
pdb            217
cv2            216
scipy          202
Tkinter        174
sklearn        171
pygame         170
PIL            159
json           152
urllib         150
this           148
Name: 0, dtype: int64