In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def _parse_result(elem, home_url="http://www.indeed.com"):
    """Convert one Indeed result ``<div>`` into a record dict.

    Parameters
    ----------
    elem : bs4 tag
        A single search-result container.
    home_url : str
        Prefix prepended to the (relative) href of the first link in ``elem``.

    Returns
    -------
    dict or None
        Keys ``company``, ``job_title``, ``job_link``, ``job_location`` and
        ``description``. ``None`` when the result has no title link carrying a
        ``title`` attribute, since such a row cannot be identified.
    """
    def text_of(attrs, strip=True):
        # Missing spans become '-' instead of raising AttributeError, the same
        # placeholder the company field has always used.
        node = elem.find('span', attrs=attrs)
        if node is None:
            return '-'
        text = node.getText()
        return text.strip() if strip else text

    title_link = elem.find('a', attrs={'class': 'turnstileLink'})
    if title_link is None or 'title' not in title_link.attrs:
        return None

    # The job link is taken from the first anchor in the result, which is not
    # necessarily the title link itself.
    first_link = elem.find('a')

    return {
        'company': text_of({'itemprop': 'name'}),
        'job_title': title_link.attrs['title'],
        'job_link': "%s%s" % (home_url, first_link.get('href')),
        # The location text is deliberately left unstripped.
        'job_location': text_of({'itemprop': 'addressLocality'}, strip=False),
        'description': text_of({'class': 'summary'}),
    }


def search_df(location, salary):
    """Scrape Indeed "data scientist" full-time postings into a DataFrame.

    Requests 40 result pages (``start`` offsets 0, 10, ..., 390) sorted by
    date, for the given location and salary filter, and parses every result
    row on each page, including the differently-classed last row.

    Parameters
    ----------
    location : str
        Value for Indeed's ``l`` query parameter, e.g. ``'san francisco'``.
    salary : str
        Value for Indeed's ``salary`` query parameter, e.g.
        ``'$70,000-$90,000'``. It is also written verbatim into the returned
        ``salary`` column; it is the query filter, not a scraped value.

    Returns
    -------
    pandas.DataFrame
        Columns ``company``, ``description``, ``job_link``, ``job_location``,
        ``job_title`` and ``salary``, with fully duplicated rows dropped.
        Empty (but with those columns) when nothing could be parsed.

    Notes
    -----
    As a side effect this sets the pandas display option
    ``display.max_colwidth`` to 500.
    """
    pd.set_option('display.max_colwidth', 500)

    records = []
    for start in range(0, 400, 10):
        # Let requests build the query string so that characters such as
        # '&', '+' or '#' in `location` / `salary` are escaped properly.
        response = requests.get(
            'http://www.indeed.com/jobs',
            params=[('q', 'data scientist'),
                    ('salary', salary),
                    ('l', location),
                    ('jt', 'fulltime'),
                    ('sort', 'date'),
                    ('start', start)],
            timeout=30)  # never hang forever on a stalled connection
        contents = BeautifulSoup(response.text, 'lxml')

        rows = list(contents.find_all('div', attrs={'class': '  row  result'}))

        # The final result on a page carries a different class string. It may
        # be absent (for example on an empty page), in which case it is skipped.
        last_row = contents.find('div', attrs={'class': 'lastRow  row  result'})
        if last_row is not None:
            rows.append(last_row)

        for elem in rows:
            record = _parse_result(elem)
            if record is not None:
                records.append(record)

    # Build the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas >= 2.0.
    df = pd.DataFrame(records, columns=['company', 'description', 'job_link',
                                        'job_location', 'job_title'])
    df['salary'] = salary
    return df.drop_duplicates()


In [3]:
# Indeed salary filters to query, keyed by an ordinal bin number (1 = lowest band).
salary_bins = dict(enumerate(
    ['$70,000-$90,000',
     '$90,000-$110,000',
     '$110,000-$130,000',
     '$130,000'],
    start=1))

# One scrape per salary band, always for the same location, in bin order.
df1, df2, df3, df4 = (search_df('san francisco', salary_bins[bin_no])
                      for bin_no in (1, 2, 3, 4))

# Stack the four result sets and show how many postings each band produced.
bigdf = pd.concat([df1, df2, df3, df4])
bigdf['salary'].value_counts()

$110,000-$130,000    400
$90,000-$110,000     323
$70,000-$90,000      301
$130,000             301
Name: salary, dtype: int64

In [16]:
import numpy as np
# Reverse lookup: salary filter string -> ordinal bin number.
inv_map = dict((label, bin_no) for bin_no, label in salary_bins.items())
# Store the numeric bin next to the textual salary label.
bigdf['salary_bins'] = bigdf['salary'].map(inv_map)


In [19]:
# Preview the first 20 rows of the combined frame.
bigdf.head(n=20)

Unnamed: 0,company,description,job_link,job_location,job_title,salary,salary_bins
0,Uber,"BA/BS/MS in Math, Economics, Statistics, Engineering, Computer Science, Operations Research, or other quantitative field (advanced degrees are a plus)Strong SQL...",http://www.indeed.com/rc/clk?jk=b0f406f28505d810&fccid=f766f8bfbc3effb7,"San Francisco, CA 94103 (South Of Market area)",Data Scientist - UberPOOL (LatAm-Focused),"$70,000-$90,000",1
1,Precision Oncology,"Analyzing data using multiple tools, and delivering well written reports. Relevant industry experience may be considered in lieu of education....",http://www.indeed.com/rc/clk?jk=6b4673adfd7dec79&fccid=5ecb514bf664a144,"San Francisco, CA",Scientist II,"$70,000-$90,000",1
2,"Illumina, Inc.","Our focus on innovation has established us as the global leader in DNA sequencing and array-based technologies, serving customers in the research, clinical and...",http://www.indeed.com/rc/clk?jk=1d4c218aeb453b0c&fccid=8524239b088a6649,"San Francisco, CA 94158 (South Of Market area)",Field Applications Scientist 1,"$70,000-$90,000",1
3,5D Oncolytic Immunotherapeutics Inc,"Demonstrates innovative design, development and execution of research projects through literature review and scientific teamwork....",http://www.indeed.com/rc/clk?jk=063d144ed2996c08&fccid=eb4514e3ce0dc867,"Alameda, CA 94501","Scientist, Virologist and cancer models","$70,000-$90,000",1
4,Genentech,Act as a key liaison between bioanalytical scientists and Clinical Data Management and Statistical Programming....,http://www.indeed.com/rc/clk?jk=01496d7da39c6c1f&fccid=2525cc4a9a704809,"South San Francisco, CA",Bioanalytical Data Manager,"$70,000-$90,000",1
5,University of California San Francisco,"The individual will create, clean, and manage analytic data sets and take a lead role in conducting statistical analyses in support of the Program in Clinical...",http://www.indeed.com/rc/clk?jk=c1a120d027a9cdfe&fccid=2a341562d64c7cdb,"San Francisco, CA",Lead Statistician/SAS Programmer,"$70,000-$90,000",1
6,Gilead Sciences,"Works in collaboration with more senior scientists or scientific directors to identify and validate targets, advances the development of economical, state-of...",http://www.indeed.com/rc/clk?jk=67ea47ffa3e0f7fc&fccid=e4b075354d7c2865,"Foster City, CA",Research Scientist II- Inflammation,"$70,000-$90,000",1
7,Gilead,"Works in collaboration with more senior scientists or scientific directors to identify and validate targets, advances the development of economical, state-of...",http://www.indeed.com/rc/clk?jk=84bf625e73357d07&fccid=0aed3f67f6a631df,"Foster City, CA",Research Scientist II- Inflammation,"$70,000-$90,000",1
8,Gilead Sciences,"Works under supervision of more senior scientists or scientific directors to identify and validate targets, advance the development of economical, state-of-the...",http://www.indeed.com/rc/clk?jk=59a7ed49adbc58b6&fccid=e4b075354d7c2865,"Foster City, CA","Research Scientist I, Medicinal Chemistry","$70,000-$90,000",1
9,Gilead,"Works under supervision of more senior scientists or scientific directors to identify and validate targets, advance the development of economical, state-of-the...",http://www.indeed.com/rc/clk?jk=0be684b9081ea9d0&fccid=0aed3f67f6a631df,"Foster City, CA","Research Scientist I, Medicinal Chemistry","$70,000-$90,000",1


In [7]:
# Count rows that repeat an earlier row on every scraped field
# (the salary label is not part of the comparison).
(
    bigdf
    .duplicated(subset=['company', 'job_title', 'job_link', 'job_location', 'description'])
    .value_counts()
)

False    1325
dtype: int64

In [8]:
# Number of postings per distinct location string.
bigdf['job_location'].value_counts()

San Francisco, CA                                     460
Redwood City, CA                                       71
South San Francisco, CA                                65
Foster City, CA                                        60
San Francisco, CA 94105 (Financial District area)      54
South San Francisco, CA 94080                          43
San Francisco, CA 94103 (South Of Market area)         42
San Ramon, CA                                          35
Emeryville, CA                                         33
San Mateo, CA                                          32
Berkeley, CA 94720                                     32
San Francisco, CA 94107 (South Of Market area)         29
Novato, CA                                             26
Emeryville, CA 94608                                   25
San Bruno, CA 94066                                    24
Berkeley, CA                                           23
San Carlos, CA 94070                                   21
Redwood City, 

In [25]:
import sklearn.feature_extraction
import sklearn.linear_model
import sklearn.preprocessing

# Vectorise the job titles with TF-IDF; X is displayed as the cell result.
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
X = tfidf.fit_transform(bigdf['job_title'].tolist())
X



<1325x776 sparse matrix of type '<type 'numpy.float64'>'
	with 5197 stored elements in Compressed Sparse Row format>

In [48]:
import sklearn.tree
# Fit a regression tree that predicts the numeric salary bin from the title features.
tree = sklearn.tree.DecisionTreeRegressor()
tree.fit(X, list(bigdf['salary_bins']))

In [50]:
# Fit a logistic-regression classifier of salary bin on the title features.
logreg = sklearn.linear_model.LogisticRegression()
logreg.fit(X, list(bigdf.salary_bins))

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary_terms = tfidf.get_feature_names_out()
else:
    vocabulary_terms = tfidf.get_feature_names()

# Pair every vocabulary term with its coefficient in the first row of coef_.
# NOTE(review): with more than two classes coef_ has one row per class, so this
# shows only the row for logreg.classes_[0] - confirm that is the intent.
# list(...) keeps the pairs visible on Python 3, where zip() is lazy.
list(zip(vocabulary_terms, logreg.coef_[0]))

[(u'012', -0.093753736233359086),
 (u'2016', 0.19582407611868899),
 (u'70595', -0.089378478982899717),
 (u'71404', 0.26570965931826229),
 (u'7399', 0.28147267737573622),
 (u'academics', -0.0282998542206571),
 (u'access', 0.19037361274374859),
 (u'account', 0.18664465013205622),
 (u'acerta', -0.37737823093583556),
 (u'acquisition', -0.19877226012243118),
 (u'actuary', 0.059875216615705766),
 (u'ad', -0.33817864626929328),
 (u'add', 0.29989723276079538),
 (u'admin', -0.086592083278942925),
 (u'administrator', 0.40973080569697373),
 (u'ads', -0.13607815184337016),
 (u'advanced', 0.11774785513353628),
 (u'advertising', -0.14678690988616658),
 (u'ae', -0.19210545541629431),
 (u'aeroelasticity', -0.1384025952003505),
 (u'affairs', -0.12434191437245758),
 (u'agent', 0.44873769997112672),
 (u'ai', -0.1743578109836173),
 (u'algo', -0.07004126241505472),
 (u'algorithm', 0.52748622414546342),
 (u'algorithms', -0.05022362102958728),
 (u'all', -0.39796679906496552),
 (u'alto', -0.057935983323066577

In [None]:
#df.to_csv('./indeed_companies.csv', encoding='utf-8')