In [29]:
import string

import numpy as np
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sphinx.util import requests


In [17]:
def get_and_clean_data(path='/Users/bill/IR/resource/software_developer_united_states_1971_20191023_1.csv'):
    """Load the job-postings CSV and return normalised, de-duplicated descriptions.

    Every ``job_description`` value has ASCII punctuation and non-breaking
    spaces removed, is lower-cased, and has each whitespace character
    replaced by a plain space. Missing descriptions are dropped, then exact
    duplicates (after cleaning) are dropped keeping the first occurrence.
    The original row labels are preserved in the returned index.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read; it must contain a ``job_description`` column.
        Defaults to the location that was previously hard-coded here.

    Returns
    -------
    pandas.Series
        Cleaned description strings.
    """
    # Build both translation tables once instead of once per row.
    strip_table = str.maketrans('', '', string.punctuation + u'\xa0')
    space_table = str.maketrans(string.whitespace, ' ' * len(string.whitespace))

    def _normalise(text):
        # Same order as before: delete punctuation, lower-case, flatten whitespace.
        return text.translate(strip_table).lower().translate(space_table)

    # NaN cells are floats and have no .translate(); drop them up front so a
    # single empty description does not abort the whole load.
    description = pd.read_csv(path)['job_description'].dropna()
    return description.apply(_normalise).drop_duplicates()

In [18]:
# Preview the cleaned, de-duplicated job descriptions (the notebook displays the returned Series).
get_and_clean_data()

0       the chosen sr software developer will be part ...
1       position c lead software developer location mi...
2       senior software developer hoboken nj starts as...
3       our client a multinational publishing and educ...
4       position c lead software developer location ph...
                              ...                        
9991    position description  position description  cg...
9994    job description  researches designs develops a...
9997    job description  the candidate must be experie...
9998    please only apply if you do not need sponsorsh...
9999    company information  solid reputation passiona...
Name: job_description, Length: 7583, dtype: object

In [19]:
def simple_tokenize(data):
    """Split every string in ``data`` into a list of whitespace-separated tokens.

    Parameters
    ----------
    data : pandas.Series
        Series whose values are strings.

    Returns
    -------
    pandas.Series
        Series with the same index whose values are lists of tokens; an
        empty or all-whitespace string yields an empty list.
    """
    # Argument-less str.split() already discards surrounding whitespace, so
    # the tokens need no extra per-token strip().
    return data.apply(lambda text: text.split())

In [20]:
# Tokenize the cleaned descriptions and display the resulting Series of token lists.
simple_tokenize(data=get_and_clean_data())

0       [the, chosen, sr, software, developer, will, b...
1       [position, c, lead, software, developer, locat...
2       [senior, software, developer, hoboken, nj, sta...
3       [our, client, a, multinational, publishing, an...
4       [position, c, lead, software, developer, locat...
                              ...                        
9991    [position, description, position, description,...
9994    [job, description, researches, designs, develo...
9997    [job, description, the, candidate, must, be, e...
9998    [please, only, apply, if, you, do, not, need, ...
9999    [company, information, solid, reputation, pass...
Name: job_description, Length: 7583, dtype: object

In [21]:
def parse_job_description():
    """Return the cleaned job descriptions as per-posting token lists.

    Feeds the output of ``get_and_clean_data`` straight into
    ``simple_tokenize`` and returns the result.
    """
    return simple_tokenize(get_and_clean_data())

In [22]:
# Run the full load -> clean -> tokenize pipeline and display the result.
parse_job_description()

0       [the, chosen, sr, software, developer, will, b...
1       [position, c, lead, software, developer, locat...
2       [senior, software, developer, hoboken, nj, sta...
3       [our, client, a, multinational, publishing, an...
4       [position, c, lead, software, developer, locat...
                              ...                        
9991    [position, description, position, description,...
9994    [job, description, researches, designs, develo...
9997    [job, description, the, candidate, must, be, e...
9998    [please, only, apply, if, you, do, not, need, ...
9999    [company, information, solid, reputation, pass...
Name: job_description, Length: 7583, dtype: object

In [23]:
def count_python_mysql():
    """Print how many tokenized job descriptions mention 'python' and 'mysql'.

    A description counts as a match when the keyword appears as one of its
    tokens. Nothing is returned; the two summary lines go to stdout.
    """
    tokenized = parse_job_description()
    total = str(tokenized.shape[0])

    def _mentions(keyword):
        # Number of descriptions whose token list contains ``keyword``.
        return tokenized.apply(lambda tokens: keyword in tokens).sum()

    python_hits = _mentions('python')
    mysql_hits = _mentions('mysql')
    print('python: ' + str(python_hits) + ' of ' + total)
    print('mysql: ' + str(mysql_hits) + ' of ' + total)

In [24]:
# Print the 'python' and 'mysql' mention counts.
count_python_mysql()

python: 1379 of 7583
mysql: 667 of 7583


In [25]:
def parse_db(top_n=10):
    """Scrape the DB-Engines ranking page and return the top database names.

    Parameters
    ----------
    top_n : int, optional
        How many of the first distinct names to keep (default 10, the value
        that was previously hard-coded).

    Returns
    -------
    list of list of str
        One entry per database, each a list of the lower-cased words of its
        name, e.g. ``['microsoft', 'sql', 'server']``.

    Raises
    ------
    ValueError
        If the page does not contain the expected ``<table class="dbi">``.
    """
    response = requests.get("https://db-engines.com/en/ranking")
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    db_table = soup.find("table", {"class": "dbi"})
    if db_table is None:
        raise ValueError('ranking table (class "dbi") not found in the page')

    all_db = []
    for header in db_table.find_all("th", {"class": "pad-l"}):
        anchor = header.find('a')
        if anchor is None:
            # Header cell without a link: there is no name to extract.
            continue
        # Only the anchor's own text nodes; text inside nested tags is ignored.
        # ``string=`` is the current spelling of the deprecated ``text=``.
        all_db.append(''.join(anchor.find_all(string=True, recursive=False)).strip())

    # dict.fromkeys removes repeated names while keeping first-seen order.
    all_db = list(dict.fromkeys(all_db))
    return [name.lower().split() for name in all_db[:top_n]]

In [26]:
# Fetch the ranking page and display the parsed list of database names.
parse_db()

  all_db = [''.join(s.find('a').findAll(text=True,recursive=False)).strip() for s in


[['oracle'],
 ['mysql'],
 ['microsoft', 'sql', 'server'],
 ['postgresql'],
 ['mongodb'],
 ['redis'],
 ['elasticsearch'],
 ['ibm', 'db2'],
 ['sqlite'],
 ['microsoft', 'access']]

In [30]:
# For every database, count the postings whose tokens include every word of
# its name, and print one summary line per database.
cleaned_db = parse_db()
parsed_description = parse_job_description()
raw = []
for db in cleaned_db:
    hits = parsed_description.apply(lambda tokens: all(word in tokens for word in db)).sum()
    raw.append(hits)
    print(' '.join(db) + ': ' + str(hits) + ' of ' + str(parsed_description.shape[0]))

  all_db = [''.join(s.find('a').findAll(text=True,recursive=False)).strip() for s in


oracle: 1392 of 7583
mysql: 667 of 7583
microsoft sql server: 868 of 7583
postgresql: 261 of 7583
mongodb: 296 of 7583
redis: 106 of 7583
elasticsearch: 161 of 7583
ibm db2: 48 of 7583
sqlite: 28 of 7583
microsoft access: 256 of 7583


In [33]:
 with_python = [None] * len(cleaned_db)
for i , db in enumerate(cleaned_db) :
    with_python[i] = parsed_description.apply(lambda s: np.all([x in s for x in db]) and 'python'in s).sum()
    print(' '.join(db) + ' + python: ' + str(with_python[i]) + ' of ' +
str(parsed_description.shape[0]))

oracle + python: 243 of 7583
mysql + python: 207 of 7583
microsoft sql server + python: 51 of 7583
postgresql + python: 90 of 7583
mongodb + python: 111 of 7583
redis + python: 38 of 7583
elasticsearch + python: 73 of 7583
ibm db2 + python: 12 of 7583
sqlite + python: 7 of 7583
microsoft access + python: 28 of 7583


In [34]:
# Report, per database, what share of the postings mentioning it also mention python.
for db, python_hits, total_hits in zip(cleaned_db, with_python, raw):
    share = np.around(python_hits / total_hits * 100, 2)
    print(' '.join(db) + ' + python: ' + str(python_hits) + ' of ' + str(total_hits)
          + ' (' + str(share) + '%)')

oracle + python: 243 of 1392 (17.46%)
mysql + python: 207 of 667 (31.03%)
microsoft sql server + python: 51 of 868 (5.88%)
postgresql + python: 90 of 261 (34.48%)
mongodb + python: 111 of 296 (37.5%)
redis + python: 38 of 106 (35.85%)
elasticsearch + python: 73 of 161 (45.34%)
ibm db2 + python: 12 of 48 (25.0%)
sqlite + python: 7 of 28 (25.0%)
microsoft access + python: 28 of 256 (10.94%)


In [38]:
# Reload the raw postings, strip punctuation / non-breaking spaces only
# (no lower-casing or whitespace flattening), and print the first raw
# descriptions as a markdown table.
data = pd.read_csv('/Users/bill/IR/resource/software_developer_united_states_1971_20191023_1.csv')
description = data['job_description']
punctuation_table = str.maketrans('', '', string.punctuation + u'\xa0')
cleaned_description = description.apply(lambda text: text.translate(punctuation_table))
print(description.head().to_markdown())

|    | job_description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [39]:
# Display the punctuation-stripped descriptions built in the previous cell
# (case and line breaks are still intact at this stage).
cleaned_description

0       The chosen Sr Software Developer will be part ...
1       Position C Lead Software Developer Location Mi...
2       Senior Software Developer Hoboken NJ Starts as...
3       Our client a multinational publishing and educ...
4       Position C Lead Software Developer Location Ph...
                              ...                        
9995    Software Developer – Asheville NC\n\nPosition ...
9996    Business Group Highlights\n\nCivilian State an...
9997    Job Description\n\nThe candidate must be exper...
9998    PLEASE ONLY APPLY IF YOU DO NOT NEED SPONSORSH...
9999    Company Information\n\nSolid reputation passio...
Name: job_description, Length: 10000, dtype: object