In [3]:
import requests
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import json
from time import sleep, perf_counter as pf
import re

def raze_list(l):
    """Flatten arbitrarily nested lists into one flat list.

    Only objects whose type is exactly ``list`` are descended into; every
    other element (tuples, strings, list subclasses, ...) is kept as a
    single item, in its original order.

    Parameters
    ----------
    l : iterable
        The (possibly nested) collection to flatten.

    Returns
    -------
    list
        A new flat list; the input is not modified.
    """
    flat = []
    for item in l:
        # Recurse into nested lists, wrap everything else so extend() adds it as one element.
        flat.extend(raze_list(item) if type(item) == list else [item])
    return flat

We start from the most recent weekly "entering/transitioning" discussion thread on Reddit's [r/datascience](https://old.reddit.com/r/datascience) subreddit.  We look for keywords mentioned in these discussions as an indication of the languages and technologies that data scientists know and use.  We know that Reddit's [demographic](https://old.reddit.com/r/dataisbeautiful/comments/5700sj/octhe_results_of_the_reddit_demographics_survey/) skews towards North American males under 30, so it should be somewhat representative of the kinds of recruits ISED is targeting (with the exception of the male bias).

In [4]:
# Entry point for the crawl: a weekly "entering/transitioning" thread on old.reddit.com/r/datascience.
start_link = 'https://old.reddit.com/r/datascience/comments/9meyte/weekly_entering_transitioning_thread_questions/'

In [5]:
#criteria functions for finding elements in the webpages
def post(tag):
    """Criteria function: True for a <div> whose class list contains 'expando'.

    Intended to be passed to BeautifulSoup's find()/find_all().
    NOTE(review): presumably this is the container of the thread's post body
    on old.reddit.com -- confirm against the page markup.
    """
    if tag.name != 'div':
        return False
    if not tag.has_attr('class'):
        return False
    return 'expando' in tag['class']
def link_to_follow(tag):
    """Criteria function: True for an <a> tag that carries an href attribute."""
    if tag.name != 'a':
        return False
    return tag.has_attr('href')

def is_comment(tag):
    """Criteria function: True for a <div> with a class name containing 'entry'.

    The match is a substring test against each class name, so classes such
    as 'entry' or 'entry-unvoted' both qualify.

    Uses any() rather than max() over a list of booleans: max() raises
    ValueError on an empty sequence, which happens when a div has a class
    attribute with no class names in it.
    """
    return (
        tag.name == 'div'
        and tag.has_attr('class')
        and any('entry' in class_name for class_name in tag['class'])
    )
def comment_text(tag):
    """Criteria function: True for a <p> tag that has no class attribute."""
    if tag.name != 'p':
        return False
    return not tag.has_attr('class')

In [6]:
#getting all the text from all the reddit discussions
# Browser-like User-Agent sent with every request.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
MAX_PAGES = 50        # upper bound on the number of threads visited
MIN_DELAY = 2         # minimum seconds between receiving one page and requesting the next
REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned

i = 0
link = start_link
text = ''
while link is not None and i < MAX_PAGES:
    r = requests.get(link, headers = header, timeout = REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors (rate limiting, server errors): parsing an
    # error page finds no link to follow and would silently end the crawl
    # with only part of the data.
    r.raise_for_status()
    t1 = pf()
    page = BeautifulSoup(r.text, 'html.parser')
    # All class-less <p> elements found inside the comment "entry" divs.
    p = raze_list([list(x.find_all(comment_text)) for x in page.find_all(is_comment)])
    # The trailing newline keeps the last word of this page from fusing with
    # the first word of the next page, which would break the \b-anchored
    # keyword patterns applied to `text` later on.
    text += '\n'.join(raze_list([list(x.find_all(text=True)) for x in p])) + '\n'
    # Follow the first link inside the first 'expando' div, presumably the
    # pointer to the previous week's thread -- TODO confirm against the markup.
    next_link = page.find(post)
    if next_link is not None:
        next_link = next_link.find(link_to_follow)
    # No post body or no link in it -> stop crawling.
    link = None if next_link is None else next_link['href'].replace('www', 'old')
    i += 1
    # Throttle: wait out whatever remains of MIN_DELAY after parsing.
    sleep(max(MIN_DELAY - (pf() - t1), 0))

Across the 50 weekly discussions parsed, the results below show how often each language/technology keyword appears, together with the regular expression used to match it.

In [8]:
# Keyword -> compiled regular expression for each language/technology.
# Patterns are lower case because the corpus is lower-cased before matching.
# NOTE(review): patterns without \b anchors also match inside longer words
# (e.g. 'java' in 'javascript', 'perl' in 'properly'), so those counts may
# include false positives.
lang_patterns = {
    'python': re.compile(r'python'),
    'r': re.compile(r'\br\b'),
    'sql': re.compile(r'sql'),
    'sas': re.compile(r'\bsas\b|statistical analysis system'),
    'excel': re.compile(r'\bexcel\b'),
    'spss': re.compile(r'spss|statistical package for (the ){0,1}social science'),
    'hadoop': re.compile(r'hadoop'),
    'kibana': re.compile(r'kibana'),
    'tableau': re.compile(r'tableau'),
    'pig': re.compile(r'\bpig\b'),
    'stata': re.compile(r'\bstata\b'),
    'powerbi': re.compile(r'power bi'),
    'java': re.compile(r'java'),
    'c/c++': re.compile(r'\bc(\+\+){0,1}\b'),
    'hive': re.compile(r'\bhive\b'),
    'matlab': re.compile(r'matlab'),
    'ruby': re.compile(r'ruby'),
    'perl': re.compile(r'perl'),
    'hbase': re.compile(r'hbase'),
    'spark': re.compile(r'spark'),
    'php': re.compile(r'\bphp\b'),
    'scala': re.compile(r'\bscala\b'),
    'tensorflow': re.compile(r'tensor( ){0,1}flow'),
    'pytorch': re.compile(r'py( ){0,1}torch'),
}

text = text.lower()

# keyword -> (compiled pattern, number of non-overlapping matches in the corpus)
lang = {
    keyword: (pattern, len(pattern.findall(text)))
    for keyword, pattern in lang_patterns.items()
}

lang

{'python': (re.compile(r'python', re.UNICODE), 613),
 'r': (re.compile(r'\br\b', re.UNICODE), 466),
 'sql': (re.compile(r'sql', re.UNICODE), 327),
 'sas': (re.compile(r'\bsas\b|statistical analysis system', re.UNICODE), 36),
 'excel': (re.compile(r'\bexcel\b', re.UNICODE), 130),
 'spss': (re.compile(r'spss|statistical package for (the ){0,1}social science',
  re.UNICODE),
  10),
 'hadoop': (re.compile(r'hadoop', re.UNICODE), 21),
 'kibana': (re.compile(r'kibana', re.UNICODE), 0),
 'tableau': (re.compile(r'tableau', re.UNICODE), 63),
 'pig': (re.compile(r'\bpig\b', re.UNICODE), 1),
 'stata': (re.compile(r'\bstata\b', re.UNICODE), 7),
 'powerbi': (re.compile(r'power bi', re.UNICODE), 11),
 'java': (re.compile(r'java', re.UNICODE), 71),
 'c/c++': (re.compile(r'\bc(\+\+){0,1}\b', re.UNICODE), 72),
 'hive': (re.compile(r'\bhive\b', re.UNICODE), 5),
 'matlab': (re.compile(r'matlab', re.UNICODE), 32),
 'ruby': (re.compile(r'ruby', re.UNICODE), 1),
 'perl': (re.compile(r'perl', re.UNICODE), 20

Across the 50 weekly discussions parsed, the results below show how often each social media and employment website name appears, together with the regular expression used to match it.  It is interesting to note that [Indeed](https://www.indeed.ca/) seems to be the most frequently mentioned employment site, although the `indeed` pattern also matches the ordinary English word "indeed", so its count may be inflated.

In [12]:
# Site name -> compiled regular expression for each social media / job site.
# Patterns are lower case because the corpus is lower-cased before matching.
# NOTE(review): 'indeed' and 'monster' are also ordinary English words, so
# their counts may include matches that do not refer to the websites.
social_media_patterns = {
    'linkedin': re.compile('linked( ){0,1}in'),
    'facebook': re.compile('facebook'),
    'indeed': re.compile('indeed'),
    'glassdoor': re.compile('glass( ){0,1}door'),
    'monster': re.compile('monster'),
    'workopolis': re.compile('workopolis'),
    'kijiji': re.compile('kijiji'),
    'craigslist': re.compile("craig'{0,1}s {0,1}list"),
}

text = text.lower()

# site -> (compiled pattern, number of non-overlapping matches in the corpus)
social_media = {
    site: (pattern, len(pattern.findall(text)))
    for site, pattern in social_media_patterns.items()
}

social_media

{'linkedin': (re.compile(r'linked( ){0,1}in', re.UNICODE), 45),
 'facebook': (re.compile(r'facebook', re.UNICODE), 11),
 'indeed': (re.compile(r'indeed', re.UNICODE), 12),
 'glassdoor': (re.compile(r'glass( ){0,1}door', re.UNICODE), 6),
 'monster': (re.compile(r'monster', re.UNICODE), 1),
 'workopolis': (re.compile(r'workopolis', re.UNICODE), 0),
 'kijiji': (re.compile(r'kijiji', re.UNICODE), 0),
 'craigslist': (re.compile(r"craig'{0,1}s {0,1}list", re.UNICODE), 0)}

Save the keyword and site frequencies to CSV files.

In [45]:
# Reduce each {name: (pattern, count)} mapping to a list of (name, count) pairs.
out_lang = [(keyword, entry[1]) for keyword, entry in lang.items()]
out_sm = [(site, entry[1]) for site, entry in social_media.items()]

In [46]:
# Tabulate the language/technology counts and write them to disk without the index column.
lang_columns = ['lang', 'freq']
out_lang = pd.DataFrame(out_lang, columns=lang_columns)
out_lang.to_csv('reddit_lang.csv', index=False)

In [47]:
# Tabulate the social media / job site counts and write them to disk without the index column.
site_columns = ['site', 'freq']
out_sm = pd.DataFrame(out_sm, columns=site_columns)
out_sm.to_csv('reddit_sm.csv', index=False)