In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Function to create DataFrame from search

In [2]:
def _parse_result(elem):
    '''Extracts the fields of one indeed.com search-result element into a dict.

    Parameters
    ----------
    elem : bs4 element for a single search result (a "row result" div).

    Returns
    -------
    dict with the keys 'company', 'job_title', 'job_link', 'job_location'
    and 'description'.

    Raises
    ------
    AttributeError / KeyError if the result lacks the title link, the
    location span or the summary span. Only the company name is optional.
    '''
    company_tag = elem.find('span', attrs={'itemprop': 'name'})
    # Some results carry no company name; '-' is the placeholder for those.
    company = company_tag.getText().strip() if company_tag is not None else '-'

    title_tag = elem.find('a', attrs={'class': 'turnstileLink'})
    return {
        'company': company,
        'job_title': title_tag.attrs['title'],
        # The href of the first anchor is site-relative, so prefix the host.
        'job_link': "%s%s" % ("http://www.indeed.com", elem.find('a').get('href')),
        'job_location': elem.find('span', attrs={'itemprop': 'addressLocality'}).getText(),
        'description': elem.find('span', attrs={'class': 'summary'}).getText().strip(),
    }


def search_df(location, salary):
    '''Scrapes indeed.com "data scientist" search results into a dataframe.

    Requests up to 40 result pages (offsets 0, 10, ..., 390) of full-time
    postings sorted by date, filtered by location and salary, and collects
    one row per result.

    Parameters
    ----------
    location : str
        Value for the search's location filter, e.g. 'san francisco'.
    salary : str
        Value for the search's salary filter, e.g. '$65,000-$90,000'.
        It is also written verbatim into the 'salary' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'company', 'description', 'job_link', 'job_location',
        'job_title' and 'salary', with fully duplicated rows dropped.
        The frame is empty (but has these columns) when nothing was found.

    Raises
    ------
    requests.RequestException
        On connection problems, a timeout, or a non-2xx HTTP status.

    Notes
    -----
    Side effect: sets the pandas display option 'display.max_colwidth' to 500
    so long descriptions are not truncated when the frame is displayed.
    '''
    pd.set_option('display.max_colwidth', 500)

    search_url = 'http://www.indeed.com/jobs'
    columns = ['company', 'description', 'job_link', 'job_location', 'job_title']

    # Collect plain dicts and build the frame once at the end; appending to a
    # DataFrame row by row copies the whole frame on every iteration.
    records = []
    for page in range(40):
        # Passing the query as params lets requests percent-encode the values
        # ('$', ',', spaces, '&', ...) instead of pasting them into the URL raw.
        params = [('q', 'data scientist'),
                  ('salary', salary),
                  ('l', location),
                  ('jt', 'fulltime'),
                  ('sort', 'date'),
                  ('start', page * 10)]
        response = requests.get(search_url, params=params, timeout=30)
        # Fail loudly on an error page rather than parsing it as "no results".
        response.raise_for_status()
        contents = BeautifulSoup(response.text, 'lxml')

        results = list(contents.find_all('div', attrs={'class': '  row  result'}))
        # The final result of a page uses a different class string. It is absent
        # on pages without results, so it must be checked before it is parsed.
        last_result = contents.find('div', attrs={'class': 'lastRow  row  result'})
        if last_result is not None:
            results.append(last_result)

        if not results:
            # A page without any result means the listing is exhausted.
            break

        records.extend(_parse_result(elem) for elem in results)

    df = pd.DataFrame(records, columns=columns)
    df['salary'] = salary
    return df.drop_duplicates()


# Function to search by salary bins and concat to larger dataframe

In [3]:
def make_bins(location):
    '''Scrapes one dataframe per salary bin and concatenates them.

    Parameters
    ----------
    location : str
        Location passed through to search_df for every bin.

    Returns
    -------
    pandas.DataFrame
        All rows returned by search_df for the four salary bins, in ascending
        bin order, with an extra integer column 'salary_bins' (1-4) derived
        from the 'salary' string of each row. Each per-bin frame keeps its own
        index, so index labels can repeat in the result.
    '''
    salary_bins = {1: '$65,000-$90,000',
                   2: '$90,000-$110,000',
                   3: '$110,000-$130,000',
                   4: '$130,000'}

    # One search per bin; sorted() keeps the bins in ascending order.
    frames = [search_df(location, salary_bins[bin_id])
              for bin_id in sorted(salary_bins)]
    bigdf = pd.concat(frames)

    # Map each row's salary string back to its numeric bin id.
    inv_map = {v: k for k, v in salary_bins.items()}
    bigdf['salary_bins'] = bigdf.salary.map(inv_map)

    return bigdf

# Scrape the San Francisco listings for every salary bin. This is network-bound:
# make_bins runs search_df once per bin, and each run requests up to 40 pages.
sanfran_df = make_bins('san francisco')

In [4]:
# Preview the first rows of the combined San Francisco frame.
sanfran_df.head(20)

Unnamed: 0,company,description,job_link,job_location,job_title,salary,salary_bins
0,Pacific BioLabs,PBL is committed to being the “Service Leader in Bioscience Testing.” Studies at PBL are conducted in accordance with current Good Manufacturing Practice (cGMP)...,http://www.indeed.com/rc/clk?jk=b325617f6176a2f8&fccid=e00fb388cc5ec3ee,"Hercules, CA 94547",Scientist (Toxicology),"$65,000-$90,000",1
1,Boutique Air,Excellent data collection and analysis skills. Conduct industry-specific data analyses and present findings in a clear and concise manner....,http://www.indeed.com/rc/clk?jk=cd120d13324a0807&fccid=7024cfb624b4025e,"San Francisco, CA",Research Analyst,"$65,000-$90,000",1
2,Peralta Community College District,"Analyses data to develop reports; Work with District to ensure that training materials are maintained and updated, and that regular and ongoing training is...",http://www.indeed.com/rc/clk?jk=446619c7b5b8e25a&fccid=22830019c2a93e1d,"Oakland, CA",Research and Systems Technology Analyst,"$65,000-$90,000",1
3,csg,Experience writing and/or integrating Runtime for R/SAS for statistic services to provide predictive analytics....,http://www.indeed.com/rc/clk?jk=081ef43d40adc043&fccid=a56aa6958ed63da1,"San Francisco, CA",Statistical Programmer III (SpotFire),"$65,000-$90,000",1
4,Omniox Inc.,We are seeking an individual for an Associate Scientist / Scientist position in our Oncology Research group....,http://www.indeed.com/rc/clk?jk=1c77995163dbd445&fccid=c7ca781f5383c441,"San Carlos, CA 94070",Associate Scientist / Scientist,"$65,000-$90,000",1
5,Ascent Services Group,"Responsibilities will also include data processing, statistical analysis, and dissemination of results to project teams in support of drug discovery and...",http://www.indeed.com/rc/clk?jk=7ce4762d41c784cf&fccid=55051b7d98da371e,"South San Francisco, CA 94080",DMPK-BA Scientist,"$65,000-$90,000",1
6,"Veracyte, Inc.","We are seeking a new Clinical Laboratory Scientist (CLS) to perform cutting-edge diagnostic testing on patient specimens in our collaborative, high-energy work...",http://www.indeed.com/rc/clk?jk=3bcf34b4bff9d910&fccid=ef82ab85e15d8df3,"South San Francisco, CA 94080",Clinical Lab Scientist (CLS),"$65,000-$90,000",1
7,Gilead Sciences,"Works under supervision of more senior scientists or scientific directors to identify and validate targets, advance the development of economical, state-of-the...",http://www.indeed.com/rc/clk?jk=72e83129e901ef46&fccid=e4b075354d7c2865,"Foster City, CA","Research Scientist I, Medicinal Chemistry","$65,000-$90,000",1
8,Gilead,"Works under supervision of more senior scientists or scientific directors to identify and validate targets, advance the development of economical, state-of-the...",http://www.indeed.com/rc/clk?jk=58878406cc8cbf08&fccid=0aed3f67f6a631df,"Foster City, CA","Research Scientist I, Medicinal Chemistry","$65,000-$90,000",1
9,Gilead Sciences,Independently plans and executes assigned experiments that support non-routine research activities and project goals....,http://www.indeed.com/rc/clk?jk=4e8bea0ab08f1879&fccid=e4b075354d7c2865,"Foster City, CA","Associate Scientist II, Immunology","$65,000-$90,000",1


In [5]:
# Count rows that repeat on every scraped field. The salary columns are left out
# of the subset, so the same posting listed under two salary bins counts as a duplicate.
sanfran_df.duplicated(subset=['company', 'job_title', 'job_link', 'job_location', 'description']).value_counts()

False    1353
dtype: int64

In [6]:
# The 15 most frequent job_location values.
sanfran_df.job_location.value_counts()[:15]

San Francisco, CA                                    468
Foster City, CA                                       71
Redwood City, CA                                      66
South San Francisco, CA                               64
San Francisco, CA 94105 (Financial District area)     52
South San Francisco, CA 94080                         47
San Francisco, CA 94103 (South Of Market area)        45
Emeryville, CA                                        34
Berkeley, CA 94720                                    34
San Mateo, CA                                         33
San Ramon, CA                                         31
San Francisco, CA 94107 (South Of Market area)        27
San Bruno, CA 94066                                   27
Emeryville, CA 94608                                  26
Novato, CA                                            25
Name: job_location, dtype: int64

# Logistic Regression on Job Title words

In [32]:
import sklearn.feature_extraction
import sklearn.linear_model
import sklearn.preprocessing
import nltk

def _salary_word_coefs(df, column, want_high):
    '''Fits the tf-idf + logistic-regression model and pairs words with coefficients.

    Shared implementation of vectorize_words_low / vectorize_words_high.
    want_high selects the coefficient row of the highest salary class found
    in df.salary_bins; otherwise the row of the lowest class is used.
    '''
    tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
    X = tfidf.fit_transform(list(df[column]))

    logreg = sklearn.linear_model.LogisticRegression()
    logreg.fit(X, list(df.salary_bins))

    if len(logreg.classes_) == 2:
        # A two-class fit stores a single coefficient row, oriented toward the
        # larger class label; the smaller class is its negation.
        row = logreg.coef_[0]
        coefs = row if want_high else -row
    else:
        # Rows follow the sorted class labels, so the first/last row belongs to
        # the lowest/highest salary bin present, however many bins there are.
        coefs = logreg.coef_[-1] if want_high else logreg.coef_[0]

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # older releases only have it and not get_feature_names_out().
    try:
        words = tfidf.get_feature_names_out()
    except AttributeError:
        words = tfidf.get_feature_names()

    # Stopwords are removed only from the report; they are still model features.
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    return [(word, coef) for word, coef in zip(words, coefs)
            if word not in stopwords]


def vectorize_words_low(df, column):
    '''
    Runs a logistic regressor on the tfidf-vectorized words of a column
    and returns coefficients corresponding to a low salary

    Parameters
    ----------
    df : pandas.DataFrame with a text column `column` and a 'salary_bins' column.
    column : name of the text column to vectorize (e.g. 'job_title').

    Returns
    -------
    list of (word, coefficient) tuples, in vocabulary order, for the lowest
    salary bin present in df, with English stopwords left out.

    Raises
    ------
    LookupError if the NLTK stopwords corpus is not installed.
    '''
    return _salary_word_coefs(df, column, want_high=False)


def vectorize_words_high(df, column):
    '''
    Runs a logistic regressor on the tfidf-vectorized words of a column
    and returns coefficients corresponding to a high salary

    Parameters
    ----------
    df : pandas.DataFrame with a text column `column` and a 'salary_bins' column.
    column : name of the text column to vectorize (e.g. 'job_title').

    Returns
    -------
    list of (word, coefficient) tuples, in vocabulary order, for the highest
    salary bin present in df, with English stopwords left out.

    Raises
    ------
    LookupError if the NLTK stopwords corpus is not installed.
    '''
    return _salary_word_coefs(df, column, want_high=True)


**Good Words**

In [47]:
# The 21 job-title words with the largest coefficients from vectorize_words_high.
sorted(vectorize_words_high(sanfran_df, 'job_title'), key=lambda word: word[1], reverse=True)[:21]

[(u'architect', 2.3481145855585956),
 (u'data', 2.2915480179214538),
 (u'director', 2.2366911218569232),
 (u'platform', 1.9903144762241087),
 (u'analytics', 1.8945751503772297),
 (u'product', 1.618211759933506),
 (u'lead', 1.5342554737845895),
 (u'staff', 1.5228286392207844),
 (u'head', 1.5007845876861634),
 (u'learning', 1.445500774679207),
 (u'java', 1.3829570682457117),
 (u'security', 1.164382238828773),
 (u'vp', 1.1311914573303106),
 (u'principal', 1.0521888598056151),
 (u'management', 1.0506238786871354),
 (u'capital', 1.0471532197830991),
 (u'senior', 1.0276100400256041),
 (u'software', 1.0258622287908556),
 (u'sciences', 1.0009573139925727),
 (u'services', 0.97884617459852596),
 (u'solutions', 0.95842768060184891)]

In [48]:
# The 21 job-title words with the most negative coefficients from vectorize_words_low,
# i.e. the words the model weighs most strongly against the low-salary class.
sorted(vectorize_words_low(sanfran_df, 'job_title'), key=lambda word: word[1], reverse=False)[:21]

[(u'director', -2.6373506341414257),
 (u'sr', -2.1777330855629069),
 (u'software', -2.022404217001244),
 (u'data', -2.0111081942096272),
 (u'lead', -1.6745714391011606),
 (u'product', -1.673233011003364),
 (u'senior', -1.5971119187176119),
 (u'manager', -1.5473032186260576),
 (u'principal', -1.5435881737535635),
 (u'engineer', -1.4381074612942941),
 (u'quantitative', -1.3780510194459068),
 (u'job', -1.1242459783738565),
 (u'learning', -1.0630688283733889),
 (u'machine', -1.0014920874632727),
 (u'analytics', -0.97229787866463069),
 (u'platform', -0.95851137881851289),
 (u'architect', -0.95833452822996645),
 (u'intelligence', -0.95495408731984432),
 (u'end', -0.89438452013308367),
 (u'biologist', -0.8207232462319235),
 (u'backend', -0.79897175389967179)]

**Bad Words**

In [50]:
# The 21 job-title words with the largest coefficients from vectorize_words_low.
sorted(vectorize_words_low(sanfran_df, 'job_title'), key=lambda word: word[1], reverse=True)[:21]

[(u'associate', 2.0514095250308748),
 (u'environmental', 1.7750225217667561),
 (u'research', 1.6419130987013015),
 (u'postdoctoral', 1.6084053501687956),
 (u'statistical', 1.4645938003118719),
 (u'ii', 1.1724837164637154),
 (u'laboratory', 1.1453565138481627),
 (u'support', 1.0257042077260865),
 (u'development', 1.0141699539915066),
 (u'bioanalytical', 0.99691765205236282),
 (u'analytical', 0.93684526061355611),
 (u'scientist', 0.92965631703377971),
 (u'recruiter', 0.91562078374395872),
 (u'scholar', 0.90734580395634989),
 (u'lab', 0.90577518220229103),
 (u'chemistry', 0.89406034404573442),
 (u'fermentation', 0.88746322662642352),
 (u'fpd', 0.85086090829754579),
 (u'quality', 0.79708734011498517),
 (u'biostatistician', 0.77885991507643704),
 (u'growth', 0.77035309120466866)]

In [51]:
# The 21 job-title words with the most negative coefficients from vectorize_words_high,
# i.e. the words the model weighs most strongly against the high-salary class.
sorted(vectorize_words_high(sanfran_df, 'job_title'), key=lambda word: word[1], reverse=False)[:21]

[(u'analyst', -2.491917886485147),
 (u'development', -1.4661334773144976),
 (u'scientist', -1.3356461231860448),
 (u'research', -1.2653415202959388),
 (u'statistical', -1.140257066979641),
 (u'clinical', -1.0895403654501699),
 (u'biology', -0.90551609902247743),
 (u'associate', -0.84764979688592823),
 (u'ii', -0.82491715248545827),
 (u'chemistry', -0.80732708941409403),
 (u'analytical', -0.77321827924561981),
 (u'postdoctoral', -0.73661604742491227),
 (u'specialist', -0.68937323650220861),
 (u'designer', -0.67758907021325787),
 (u'customer', -0.65354182957699725),
 (u'statistician', -0.62525002857053691),
 (u'quality', -0.58898420701744258),
 (u'quantitative', -0.58341834889302202),
 (u'fellow', -0.5673604883736546),
 (u'ad', -0.56153652083542305),
 (u'applications', -0.52897410986446936)]

In [46]:
# Refit the job-title model at notebook scope (same steps as inside the
# vectorize_words_* functions) so the fitted objects can be queried directly.
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
X = tfidf.fit_transform(list(sanfran_df['job_title']))

logreg = sklearn.linear_model.LogisticRegression()
logreg.fit(X,list(sanfran_df.salary_bins))
# Predicted probability of each salary bin (columns in the order of
# logreg.classes_) for a title consisting only of the word "senior".
logreg.predict_proba(tfidf.transform(['senior']))

array([[ 0.08851787,  0.14414371,  0.45273961,  0.31459881]])

In [10]:
#df.to_csv('./indeed_companies.csv', encoding='utf-8')

In [11]:
# Repeat the full per-bin scrape for two more cities.
# NOTE(review): neither frame is used in the cells visible here; confirm they
# are needed before paying for two more rounds of scraping.
sanjose_df = make_bins('san jose')
seattle_df = make_bins('seattle')

In [62]:
# Same analysis on the description text: the 21 words with the most negative
# coefficients from vectorize_words_low.
sorted(vectorize_words_low(sanfran_df, 'description'), key=lambda word: word[1], reverse=False)[:21]

[(u'data', -2.1768714815723436),
 (u'machine', -1.9175904726360491),
 (u'scientists', -1.7294372928252328),
 (u'learning', -1.6450902176785878),
 (u'health', -1.2125637570789356),
 (u'analytics', -1.1940417631998441),
 (u'science', -1.0839394766574897),
 (u'designers', -0.87252785736429994),
 (u'business', -0.86187934149362755),
 (u'product', -0.85910481957866647),
 (u'managers', -0.84753144090308208),
 (u'user', -0.7744654920752585),
 (u'advanced', -0.73715112133838967),
 (u'working', -0.73478133318464134),
 (u'solutions', -0.72395261971789548),
 (u'closely', -0.71304251681304132),
 (u'build', -0.692100975456446),
 (u'deep', -0.67885517286215991),
 (u'analysts', -0.66384575057991058),
 (u'engineer', -0.66075335759018239),
 (u'drive', -0.65720299033979834)]