This notebook explores the gender diversity of the different Apache projects and documents the process used to do so.

In [229]:
import json
import logging
import os
import time
from copy import copy

import meetup.api
from pyspark import *
from pyspark.sql import *
from pyspark.sql.functions import explode
from pyspark.sql.types import *


API key configuration

In [230]:
# Read the Meetup API key from the environment instead of hardcoding it in the notebook.
meetup_key = os.environ.get("MEETUP_APIKEY")

In [2]:
# Build (or reuse) the Spark session for this notebook and keep a handle on its SparkContext.
session = (
    SparkSession.builder
    .appName("whatCanWeLearnFromTheSixties")
    .getOrCreate()
)
sc = session.sparkContext

The first thing we want to get is the committers and PMC members. This information is stored in LDAP but is also available as JSON. Eventually we will want to enrich this with mailing list information.

In [105]:
def loadFlatJsonFile(path, explodeKey, schema=None):
    """Load a flat multi-line JSON file into Spark and explode one of its fields.

    Args:
        path: Location of the JSON file(s), passed to ``sc.wholeTextFiles``.
        explodeKey: Name of the column to pass to ``explode``.
        schema: Optional schema for the JSON document. When omitted the
            reader is left to infer the schema.

    Returns:
        A DataFrame holding the exploded contents of ``explodeKey``.
    """
    # wholeTextFiles yields (filename, content) pairs; only the content is kept,
    # so each file is handed to the JSON reader as one string.
    rdd = sc.wholeTextFiles(path).values()
    reader = session.read
    if schema is not None:
        # DataFrameReader.schema() rejects None, so the advertised default used
        # to raise; only apply a schema when the caller actually supplied one.
        reader = reader.schema(schema)
    df = reader.json(rdd)
    return df.select(explode(explodeKey))

In [106]:
# The people dump carries a creation timestamp plus a "people" map whose values
# are themselves string-to-string maps.
apache_people_schema = StructType([
    StructField("lastCreateTimestamp", StringType()),
    StructField("people", MapType(StringType(), MapType(StringType(), StringType()))),
])
# Local copy of http://people.apache.org/public/public_ldap_people.json
apache_people_df = loadFlatJsonFile(
    "http_data_sources/public_ldap_people.json",
    "people",
    schema=apache_people_schema,
)

In [213]:
def project_on_github(project):
    """Report whether https://github.com/apache/<project> answers with HTTP 200."""
    import urllib3
    url = "https://github.com/apache/{0}".format(project)
    response = urllib3.PoolManager().request('GET', url)
    return response.status == 200
# Expose the GitHub check to Spark SQL under the name "on_github".
session.catalog.registerFunction("on_github", project_on_github, BooleanType())
# Except I'm a bad person so....
# Also build a column-level UDF object and register it directly on the JVM session.
# NOTE(review): this reaches into private PySpark attributes (_jsparkSession, _judf)
# and may break between Spark versions.
from pyspark.sql.catalog import UserDefinedFunction
project_on_github_udf = UserDefinedFunction(project_on_github, BooleanType(), "on_github")
# Register the UDF object created on the previous line (the original referenced
# an undefined name `udf`, which raises NameError on a fresh kernel).
session.catalog._jsparkSession.udf().registerPython("on_github", project_on_github_udf._judf)

In [216]:
# Same layout as the people dump, with the nested map stored under "committees".
apache_committees_schema = StructType([
    StructField("lastCreateTimestamp", StringType()),
    StructField("committees", MapType(StringType(), MapType(StringType(), StringType()))),
])
# Local copy of the committees dump.
# NOTE(review): presumably http://people.apache.org/public/public_ldap_committees.json —
# the earlier comment pointed at the people file; confirm the source URL.
apache_committees_df = loadFlatJsonFile(
    "http_data_sources/public_ldap_committees.json",
    "committees",
    schema=apache_committees_schema,
)
# Keep only committees whose key passes the on_github check.
apache_committees_on_github_df = apache_committees_df.filter(
    project_on_github_udf(apache_committees_df.key)
)
committee_names_df = apache_committees_on_github_df.select(
    apache_committees_df.key.alias("project")
)
# Cache before counting: every row evaluated costs one HTTP request to GitHub.
committee_names_df.cache()
committee_names_df.count()

[Row(project='abdera'),
 Row(project='accumulo'),
 Row(project='ace'),
 Row(project='activemq'),
 Row(project='airavata'),
 Row(project='allura'),
 Row(project='ambari'),
 Row(project='ant'),
 Row(project='any23'),
 Row(project='apr'),
 Row(project='archiva'),
 Row(project='aries'),
 Row(project='arrow'),
 Row(project='asterixdb'),
 Row(project='aurora'),
 Row(project='avro'),
 Row(project='bahir'),
 Row(project='beam'),
 Row(project='bigtop'),
 Row(project='bloodhound'),
 Row(project='bookkeeper'),
 Row(project='brooklyn'),
 Row(project='buildr'),
 Row(project='bval'),
 Row(project='calcite'),
 Row(project='camel'),
 Row(project='cassandra'),
 Row(project='cayenne'),
 Row(project='celix'),
 Row(project='chemistry'),
 Row(project='chukwa'),
 Row(project='clerezza'),
 Row(project='climate'),
 Row(project='cloudstack'),
 Row(project='cocoon'),
 Row(project='couchdb'),
 Row(project='crunch'),
 Row(project='ctakes'),
 Row(project='curator'),
 Row(project='cxf'),
 Row(project='deltacloud'),

Attempt to fetch relevant past & present meetups for each project - idea based on the listing at https://www.apache.org/events/meetups.html but different code

In [200]:
# Only surface warnings and above from the root logger.
# `logging` comes from the notebook's import cell.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
def lookup_relevant_meetup(project_name):
    """Look up upcoming and past Meetup events mentioning an Apache project.

    Events are fetched page by page through ``client.GetOpenEvents`` for each
    status; paging stops as soon as a page comes back with fewer than 200
    entries or a page cannot be fetched or read.

    Args:
        project_name: Project name; the search text is "apache <project_name>".

    Returns:
        A ``(project_name, results)`` tuple. ``results`` is a dict with the
        keys "upcoming" and "past"; each value is a list holding one entry per
        fetched page (that page's ``response.results``).
    """
    import logging
    import time
    from copy import copy
    import meetup.api
    logger = logging.getLogger()
    meetup_delay = 30  # pause between successful page fetches, in seconds
    meetup_reset_delay = 1800 # 30 minutes
    max_retries = 10
    page_size = 200
    standard_keys = {"text_format": "plain", "trending": "desc=true", "and_text": "true", "city": "san francisco", "country": "usa", "text": "apache " + project_name, "radius": 10000}
    results = {"upcoming": [], "past": []}
    # One client per call is enough; it only depends on the API key.
    client = meetup.api.Client(meetup_key)
    for status in ["upcoming", "past"]:
        keys = copy(standard_keys)
        keys["status"] = status
        count = page_size
        base = 0
        while count == page_size:
            logger.debug("Fetch {0} meetups for {1} on base {2}".format(status, project_name, base))
            if base > 0:
                # NOTE(review): `page` may be the page-size parameter rather than an
                # offset in the Meetup API - confirm; kept exactly as in the original.
                keys["page"] = base
            # Manually sleep for meetup_reset_delay on failure, the meetup-api package retry logic sometimes breaks :(
            response = None
            retry_count = 0
            while response is None and retry_count < max_retries:
                try:
                    response = client.GetOpenEvents(**keys)
                except Exception:
                    # `except Exception` (not a bare except) so that interrupting
                    # the kernel still stops the lookup.
                    retry_count += 1
                    logger.warning("Meetup request for {0} ({1}) failed, attempt {2} of {3}".format(
                        project_name, status, retry_count, max_retries), exc_info=True)
                    if retry_count < max_retries:
                        time.sleep(meetup_reset_delay)
            if response is None:
                # Out of retries: give up on this status and keep what we have.
                break
            try:
                count = response.meta['count']
                base = base + count
                results[status].append(response.results)
            except Exception:
                # Unexpected response shape: stop paging this status, as before.
                logger.warning("Unexpected Meetup response for {0} ({1})".format(
                    project_name, status), exc_info=True)
                break
            time.sleep(meetup_delay)
    return (project_name, results)

# NOTE: despite the `_df` suffix this is an RDD (the result of rdd.map), not a DataFrame.
project_meetups_df = (
    committee_names_df.rdd
    .map(lambda row: row.project)
    .map(lookup_relevant_meetup)
)

In [201]:
# Mark the RDD for caching, then pull a single element as a quick check.
project_meetups_df.cache().take(1)

KeyboardInterrupt: 

For the provided projects, attempt to look up their GitHub repository.

In [203]:
def lookup_project_git(project):
    """Build the git.apache.org URL for a project.

    Assumes the project is git hosted; no check is made that the URL exists.
    """
    url_template = "git://git.apache.org/{0}"
    return url_template.format(project)
    

In [221]:
def fetch_project_github_data(project):
    """Return, as a list, everything the Perceval GitHub backend fetches for apache/<project>."""
    from perceval.backends.core.github import GitHub as perceval_github
    backend = perceval_github(owner="apache", repository=project)
    return [item for item in backend.fetch()]

In [222]:
def fetch_project_git_data(project, gitpath=None):
    """Return, as a list, everything the Perceval Git backend fetches for a project.

    Args:
        project: Project name; its repository URL comes from ``lookup_project_git``.
        gitpath: Optional local path where Perceval keeps its clone. Defaults to
            ``<system temp dir>/perceval-git/<project>``.

    Returns:
        A list of the items yielded by the backend's ``fetch()``.
    """
    import os
    import tempfile
    from perceval.backends.core.git import Git as perceval_git
    project_git = lookup_project_git(project)
    if gitpath is None:
        clone_root = os.path.join(tempfile.gettempdir(), "perceval-git")
        # Create only the parent directory; the clone directory itself is left
        # for Perceval to create.
        os.makedirs(clone_root, exist_ok=True)
        gitpath = os.path.join(clone_root, project)
    # The Git backend is addressed by repository URI plus a local clone path,
    # not by the owner/repository pair the GitHub backend uses.
    git_backend = perceval_git(uri=project_git, gitpath=gitpath)
    # The original returned `gh_backend.fetch()`, a name never defined in this function.
    return list(git_backend.fetch())

In [274]:
def lookup_crunchbase_gender(name, projects, enabled=False):
    """Look a person up on Crunchbase, keeping only pages that mention their projects.

    The lookup is switched off by default and returns ``[]`` without touching
    the network, matching the previous unconditional early return.
    NOTE(review): the output captured in this notebook shows Crunchbase serving
    a "distil" blocking page, presumably the reason the lookup was disabled.

    Gender extraction is not implemented yet: when enabled and the page
    mentions one of ``projects``, the parsed page itself is returned.

    Args:
        name: Person's name; spaces are replaced with "-" to build the URL.
        projects: Iterable of project names; at least one must appear
            (case-insensitively) in the page text.
        enabled: Set to True to actually perform the HTTP request.

    Returns:
        ``[]`` when disabled, when the response status is not 200, or when no
        project is mentioned; otherwise a ``BeautifulSoup`` document.
    """
    if not enabled:
        return []
    from bs4 import BeautifulSoup
    import requests
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
               'Accept-Language': 'en-US,en;q=0.8', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    url = "https://www.crunchbase.com/person/{0}".format(name.replace(" ", "-"))
    # A timeout keeps a stalled connection from hanging the notebook.
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code != 200:
        return []
    lower_text = r.text.lower()
    if not any(project.lower() in lower_text for project in projects):
        # Previously this path fell through and returned None.
        return []
    # Name the parser explicitly: silences the bs4 warning and keeps the result
    # independent of which optional parsers happen to be installed.
    return BeautifulSoup(r.text, "html.parser")

In [272]:
# Try the Crunchbase lookup on a single name/project pair.
result = lookup_crunchbase_gender("holden karau", ["spark"])

{'reason': 'OK', '_content': b'<!DOCTYPE html>\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n<head>\n<META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW">\n<meta http-equiv="cache-control" content="max-age=0" />\n<meta http-equiv="cache-control" content="no-cache" />\n<meta http-equiv="expires" content="0" />\n<meta http-equiv="expires" content="Tue, 01 Jan 1980 1:00:00 GMT" />\n<meta http-equiv="pragma" content="no-cache" />\n<meta http-equiv="refresh" content="10; url=/distil_r_blocked.html?Ref=/person/holden-karau&amp;distil_RID=6BD83046-0052-11E7-BD07-BFFF6FBEFFD5&amp;distil_TID=20170303204605" />\n<script type="text/javascript">\n\t(function(window){\n\t\ttry {\n\t\t\tif (typeof sessionStorage !== \'undefined\'){\n\t\t\t\tsessionStorage.setItem(\'distil_referrer\', document.referrer);\n\t\t\t}\n\t\t} catch (e){}\n\t})(window);\n</script>\n<script type="text/javascript" id="d__inj" class="d__inj_delayed" src="/cb-scfwqaqwuewwr.js" defer></script><style type="text/css">#d__fFH{position:



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [273]:
# Display whatever the lookup returned.
result

<!DOCTYPE html>
<html><head>
<meta content="NOINDEX, NOFOLLOW" name="ROBOTS"/>
<meta content="max-age=0" http-equiv="cache-control"/>
<meta content="no-cache" http-equiv="cache-control"/>
<meta content="0" http-equiv="expires"/>
<meta content="Tue, 01 Jan 1980 1:00:00 GMT" http-equiv="expires"/>
<meta content="no-cache" http-equiv="pragma"/>
<meta content="10; url=/distil_r_blocked.html?Ref=/person/holden-karau&amp;distil_RID=6BD83046-0052-11E7-BD07-BFFF6FBEFFD5&amp;distil_TID=20170303204605" http-equiv="refresh"/>
<script type="text/javascript">
	(function(window){
		try {
			if (typeof sessionStorage !== 'undefined'){
				sessionStorage.setItem('distil_referrer', document.referrer);
			}
		} catch (e){}
	})(window);
</script>
<script class="d__inj_delayed" defer="" id="d__inj" src="/cb-scfwqaqwuewwr.js" type="text/javascript"></script><style type="text/css">#d__fFH{position:absolute;top:-5000px;left:-5000px}#d__fF{font-family:serif;font-size:200px;visibility:hidden}#yedartyuwcvd{disp

In [267]:
# List the <meta> tags of the returned page.
# NOTE(review): this needs a BeautifulSoup result; when the lookup returns [] the
# call fails because a list has no find_all.
result.find_all('meta')

[<meta content="NOINDEX, NOFOLLOW" name="ROBOTS"/>,
 <meta content="max-age=0" http-equiv="cache-control"/>,
 <meta content="no-cache" http-equiv="cache-control"/>,
 <meta content="0" http-equiv="expires"/>,
 <meta content="Tue, 01 Jan 1980 1:00:00 GMT" http-equiv="expires"/>,
 <meta content="no-cache" http-equiv="pragma"/>,
 <meta content="10; url=/distil_r_blocked.html?Ref=/person/holden-karau&amp;distil_RID=44D14A96-0052-11E7-9CC3-CDECCBF192E4&amp;distil_TID=20170303204459" http-equiv="refresh"/>]