The purpose of this notebook is to 
1. collect the profile of every affiliate of RC from the directory
2. determine the date of their most recent stint
3. filter for people with data science interests
4. store this dataframe representing these people in a pickle

This pickle will be used as the database of the recurser suggester script

In [269]:
import requests
import json
from urllib.parse import urlencode
from time import sleep
import pandas as pd
import time
import re
import os
"""
this represents how nice the scriptrunner is (that's you!)
The higher, the better!
It is also used to space out requests made to the RC directory service.
Do a 10 minute meditation when you run collect_full_results()
"""
SCRIPTRUNNER_NICENESS_VALUE_IN_SECONDS = 1.5

In [143]:
def collect_full_results():
    total_num_profiles = 1757
    url_base = "https://www.recurse.com/api/v1/profiles?"
    auth_token = os.env['RECURSE_AUTH_TOKEN']
    headers = {'Authorization': 'Bearer ' + auth_token}
    offset = 0
    limit = 15
    full_results = []
    while len(full_results) < total_num_profiles:
        sleep(SCRIPTRUNNER_NICENESS_VALUE_IN_SECONDS)
        query_string = urlencode(dict(offset=offset, limit=limit))
        url = url_base + query_string
        response = requests.get(url, headers=headers)
        full_results.extend(response.json())
        offset += limit
    return full_results

In [144]:
full_results = collect_full_results()

In [310]:
df = pd.DataFrame(full_results)

In [312]:
def most_recent_attend(row):
    stints = row['stints']
    if not stints:
        return None
    most_recent_start_date = stints[0]['start_date']
    if len(stints) == 1:
        return most_recent_start_date
    for stint in stints[1:]:
        curr_start_date = stint['start_date']
        most_recent_start_date = max(most_recent_start_date, curr_start_date)
    return most_recent_start_date

In [311]:
df['most_recent_date'] = pd.to_datetime(
    df.apply(lambda row: most_recent_attend(row), axis=1)
)

In [357]:
data_sci_prog = re.compile(
    r'(data scien)|((machine|deep) learning)|(nlp)|(computer vision)|(natural language processing)|(ml)',
    flags=re.IGNORECASE
)
def filter_for_keywords(*values):
    return any(
        bool(data_sci_prog.search(value))
        for value in values
    )

In [358]:
keyword_filtered_df = df[
    df[['before_rc_rendered', 'interests_rendered', 'bio_rendered', 'during_rc_rendered']]
    .apply(
        lambda values: filter_for_keywords(*values), axis=1
    )
]

In [359]:
small_df = keyword_filtered_df[['name', 'slug', 'most_recent_date']]
small_df['rating'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [361]:
small_df.to_pickle('ratings.pickle')