In [1]:
import requests
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import json
from time import sleep, perf_counter as pf
import re

def raze_list(l):
    """Recursively flatten arbitrarily nested lists into one flat list.

    Parameters
    ----------
    l : iterable
        Items to flatten; any item that is a list (or an instance of a
        list subclass) is expanded recursively, to any depth.

    Returns
    -------
    list
        A new list holding every non-list item in depth-first,
        left-to-right order.  The input is not modified.
    """
    out = []
    for x in l:
        # isinstance rather than an exact type comparison, so instances of
        # list subclasses are flattened too instead of being appended whole.
        if isinstance(x, list):
            out.extend(raze_list(x))
        else:
            out.append(x)
    return out

We start the web scrape at the main forum of [Data Science Central](https://www.datasciencecentral.com/) and search its posts for mentions of technologies and languages, as well as social media and employment sites.

In [2]:
# Seed URL for the crawl: the first forum index page to fetch. The pagination
# loop reassigns this name as it follows each "next" link.
next_link = 'https://www.datasciencecentral.com/forum'

In [3]:
def p_with_link(tag):
    """Tag filter: True for a <p> or <h3> element that contains an <a> descendant."""
    if tag.name not in ('p', 'h3'):
        return False
    return tag.find('a') is not None
def get_link(tag):
    """Tag filter: True for an <a> element that carries an href attribute."""
    if tag.name != 'a':
        return False
    return tag.has_attr('href')
def get_next(tag):
    """Tag filter: True for an <a href=...> whose text contains "next" (case-insensitive)."""
    if tag.name != 'a' or not tag.has_attr('href'):
        return False
    return 'next' in tag.text.lower()

In [4]:
# Crawl the forum index page by page, collecting the URL of every listed post.
MAX_PAGES = 50  # hard cap on index pages fetched, so the crawl always terminates

i = 0
links = []
while next_link is not None and i < MAX_PAGES:
    r = requests.get(next_link)
    page = BeautifulSoup(r.text, 'html.parser')

    # Every <tr> of every "categories" table that actually has a <tbody>.
    rows = raze_list([list(table.find('tbody').find_all('tr'))
                      for table in page.find_all('table', attrs={'class': 'categories'})
                      if table.find('tbody') is not None])

    # A row's post URL is the href of the first href-bearing <a> inside its
    # first <p>/<h3> that contains a link.  Rows without one are skipped
    # rather than raising on a missing anchor.
    for row in rows:
        cell = row.find(p_with_link)
        anchor = cell.find(get_link) if cell is not None else None
        if anchor is not None:
            links.append(anchor['href'])

    # Follow the pager's "next" link.  When there is none, clear next_link so
    # the loop stops; leaving it unchanged would re-fetch the same page (and
    # append duplicate links) until the page cap is reached.
    pager = page.find('ul', attrs={'class': 'pagination easyclear '})
    next_anchor = pager.find(get_next) if pager is not None else None
    next_link = next_anchor['href'] if next_anchor is not None else None
    i += 1

The number of posts analysed.

In [5]:
# Count of post URLs collected into `links` by the crawl.
len(links)

503

In [6]:
# Download every collected post and gather its visible text into one corpus.
page_texts = []
for link in links:
    page = BeautifulSoup(requests.get(link).text, 'html.parser')
    # Remove <script> and <style> elements so their contents are not part of
    # the extracted text.
    for script in page(["script", "style"]):
        script.decompose()
    page_texts.append(page.get_text())

# Join once instead of growing a string with +=, and put a newline between
# pages so the last word of one page cannot fuse with the first word of the
# next (which would distort word-boundary matches).
text = '\n'.join(page_texts)

This is a list of the languages and technologies and the frequencies with which they were mentioned in the forum.  It should be noted that the forum specializes in big data, and the website proper specifically mentions Hadoop, so its popularity in discussion is unsurprising.  Alongside are the regular expressions used to match the keywords.

In [7]:
# Keyword -> compiled pattern for each language/technology searched for.
# Some names are wrapped in \b word boundaries; the rest match as substrings.
lang = {
    'python': re.compile('python'),
    'r': re.compile('\\br\\b'),
    'sql': re.compile('sql'),
    'sas': re.compile('\\bsas\\b|statistical analysis system'),
    'excel': re.compile('\\bexcel\\b'),
    'spss': re.compile('spss|statistical package for (the ){0,1}social science'),
    'hadoop': re.compile('hadoop'),
    'kibana': re.compile('kibana'),
    'tableau': re.compile('tableau'),
    'pig': re.compile('pig'),
    'stata': re.compile('\\bstata\\b'),
    'powerbi': re.compile('power bi'),
    'java': re.compile('java'),
    'c/c++': re.compile('\\bc(\\+\\+){0,1}\\b'),
    'hive': re.compile('hive'),
    'matlab': re.compile('matlab'),
    'ruby': re.compile('ruby'),
    'perl': re.compile('perl'),
    'hbase': re.compile('hbase'),
    'spark': re.compile('spark'),
    'php': re.compile('\\bphp\\b'),
    'scala': re.compile('\\bscala\\b'),
    'tensorflow': re.compile('tensor( ){0,1}flow'),
    'pytorch': re.compile('py( ){0,1}torch'),
}

# All patterns are written in lower case, so fold the corpus before matching.
text = text.lower()

# Replace each pattern with a (pattern, number of matches in text) pair.
for key, pattern in list(lang.items()):
    lang[key] = (pattern, len(pattern.findall(text)))

lang

{'python': (re.compile(r'python', re.UNICODE), 439),
 'r': (re.compile(r'\br\b', re.UNICODE), 463),
 'sql': (re.compile(r'sql', re.UNICODE), 152),
 'sas': (re.compile(r'\bsas\b|statistical analysis system', re.UNICODE), 76),
 'excel': (re.compile(r'\bexcel\b', re.UNICODE), 90),
 'spss': (re.compile(r'spss|statistical package for (the ){0,1}social science',
  re.UNICODE),
  6),
 'hadoop': (re.compile(r'hadoop', re.UNICODE), 780),
 'kibana': (re.compile(r'kibana', re.UNICODE), 1),
 'tableau': (re.compile(r'tableau', re.UNICODE), 22),
 'pig': (re.compile(r'pig', re.UNICODE), 22),
 'stata': (re.compile(r'\bstata\b', re.UNICODE), 1),
 'powerbi': (re.compile(r'power bi', re.UNICODE), 4),
 'java': (re.compile(r'java', re.UNICODE), 685),
 'c/c++': (re.compile(r'\bc(\+\+){0,1}\b', re.UNICODE), 178),
 'hive': (re.compile(r'hive', re.UNICODE), 49),
 'matlab': (re.compile(r'matlab', re.UNICODE), 22),
 'ruby': (re.compile(r'ruby', re.UNICODE), 5),
 'perl': (re.compile(r'perl', re.UNICODE), 49),
 'h

This is a list of the social media and employment sites mentioned in the forum alongside their frequency and the regular expression they matched.  It should be noted that each post contains buttons to link to Facebook, Twitter, and Google+ (not analysed), so their relatively high frequency should not be given much weight.

In [15]:
# Site name -> compiled pattern for each social media / employment site.
social_media = {
    'linkedin': re.compile('linked( ){0,1}in'),
    'facebook': re.compile('facebook'),
    'indeed': re.compile('indeed'),
    'glassdoor': re.compile('glass( ){0,1}door'),
    'monster': re.compile('monster'),
    'workopolis': re.compile('workopolis'),
    'twitter': re.compile('twitter|tweet'),
    'kijiji': re.compile('kijiji'),
    'craigslist': re.compile("craig'{0,1}s {0,1}list"),
}

# All patterns are written in lower case, so fold the corpus before matching.
text = text.lower()

# Replace each pattern with a (pattern, number of matches in text) pair.
for key, site_pattern in list(social_media.items()):
    social_media[key] = (site_pattern, len(site_pattern.findall(text)))

social_media

{'linkedin': (re.compile(r'linked( ){0,1}in', re.UNICODE), 43),
 'facebook': (re.compile(r'facebook', re.UNICODE), 538),
 'indeed': (re.compile(r'indeed', re.UNICODE), 35),
 'glassdoor': (re.compile(r'glass( ){0,1}door', re.UNICODE), 2),
 'monster': (re.compile(r'monster', re.UNICODE), 1),
 'workopolis': (re.compile(r'workopolis', re.UNICODE), 0),
 'twitter': (re.compile(r'twitter|tweet', re.UNICODE), 619),
 'kijiji': (re.compile(r'kijiji', re.UNICODE), 0),
 'craigslist': (re.compile(r"craig'{0,1}s {0,1}list", re.UNICODE), 1)}

Saving to .csv.

In [9]:
# Reduce each (pattern, count) entry to a (keyword, count) pair for export.
out_lang = [(name, entry[1]) for name, entry in lang.items()]
out_sm = [(name, entry[1]) for name, entry in social_media.items()]

In [10]:
# Tabulate the language/technology counts and write them to dsf_lang.csv
# without the index column.
lang_columns = ['lang', 'freq']
out_lang = pd.DataFrame(data=out_lang, columns=lang_columns)
out_lang.to_csv(path_or_buf='dsf_lang.csv', index=False)

In [11]:
# Tabulate the social media / employment site counts and write them to
# dsf_sm.csv without the index column.
site_columns = ['site', 'freq']
out_sm = pd.DataFrame(data=out_sm, columns=site_columns)
out_sm.to_csv(path_or_buf='dsf_sm.csv', index=False)