# Web Scraping for Indeed.com & Predicting Salaries

In [20]:
# importing necessary libraries
import requests
import bs4
from bs4 import BeautifulSoup
import urllib
import re
from re import findall
import numpy as np
import pandas as pd
from time import sleep

In [130]:
# Practicing Scraping on one page first
url = "https://www.indeed.com/jobs?q=data+scientist+$20,000&l=Pittsburgh&start=0"


In [131]:
# visiting that url and grabbing the html of page
html = requests.get(url)

In [132]:
# converting into a soup object
b = BeautifulSoup(html.text)

In [133]:
# assuring I get a response of 200
html

<Response [200]>

In [76]:
# defining function for grabbing titles
def extract_title(result):
    titles = []
    for i in result.find_all(class_='jobtitle'):
        titles.append(i.get_text().strip())
    
    
    return titles


In [41]:
extract_title(b)

[u'Research Scientist - Center for Biomedical Science',
 u'Data Scientist',
 u'Senior Data Engineer (Python, Hadoop, Machine Learning)',
 u'Machine Learning Engineer - Intern',
 u'Analyst, Data & Analytics',
 u'Marketing Data Analyst',
 u'Data Science Intern',
 u'Machine Learning Software Engineer',
 u'Machine Learning Engineer - Intern',
 u'Machine Learning Engineer - Intern',
 u'Junior Data Scientist/Data Scientist',
 u'Tech: Data Analyst',
 u'Data Scientist',
 u'Healthcare & Life Sciences - Data Scientist',
 u'Associate, Data Scientist - Optimization']

Writing functions for extracting the locations, companies, and salaries

In [28]:
def extract_locations(result):
    locations = []
    for i in result.find_all('span', class_='location'):
        locations.append(i.text)
    return locations 

In [29]:
def extract_company(result):
    companies = []
    for i in b.find_all('span', class_='company'):
        companies.append(i.get_text().strip())
    return companies

In [30]:
def extract_salary(result):
    salary = []
    for entry in b.find_all('td', class_ = 'snip'):
        match = re.findall('\$[0-9]*.[0-9]* - \$[0-9]*.[0-9]* [a-zA-Z]* [a-zA-Z]*', entry.text)
        if match:
            salary.append(match)
        else:
            salary.append('None')
    return salary

In [31]:
extract_salary(b)

['None',
 'None',
 [u'$150,000 - $200,000 a year'],
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None']

## Scraper

In [258]:
url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city =  500
# Crawling more results, will also take much longer. First test your code on a small number of results and then expand.


# htmls = urllib.urlopen(url_template).read()
# converting into a soup object
# bs = BeautifulSoup(html, 'html.parser', from_encoding="utf-8")


titles = []
locations = []
companies = []
salaries = []


for city in set(['Chicago', 'Washington+City+DC', 'Austin', 'San+Diego', 'St.+Louis', 
    'Los+Angeles', 'Dallas', 'Boston', 'Seattle', 'Charlotte', 'Houston',
    'Pittsburgh', 'San+Antonio', 'Phoenix', 'Denver']):
    for start in range(0, max_results_per_city, 10):
        # Grab the results from the request (as above)
        sleep(0.3)
        url = url_template.format(city, start)
        htmls = requests.get(url)
        
        # Printing url to track progress
        #print url
            
        # bs = BeautifulSoup(urllib.urlopen(url).read(), 'html.parser')
        bs = BeautifulSoup(htmls.text)
        
        for i in bs.find_all('a', attrs={'data-tn-element': "jobTitle"}):
            titles.append(i.get_text().strip())
            
        # Grabbing Locations
        for i in bs.find_all('span', class_='location'):
            locations.append(i.text)

        # Grabbing Company Names
        for i in bs.find_all('span', class_='company'):
            companies.append(i.get_text().strip())

        # Grabbing Salaries
        for entry in bs.find_all('td', class_='snip'):
            match = re.findall('\$[0-9]*.[0-9]* - \$[0-9]*.[0-9]* [a-zA-Z]* [a-zA-Z]*', entry.text)
            if match:
                salaries.append(match)
            else:
                salaries.append('None')
         
        # printing list lengths to assure efficiency of scraper
        print city + ": " + str(start) + ": " + str(len(titles)) + ", " + str(len(companies)) + ", " + str(len(salaries)) + ", " + str(len(locations))

           
                    

                


Charlotte: 0: 15, 15, 15, 15
Charlotte: 10: 30, 30, 30, 30
Charlotte: 20: 45, 45, 45, 45
Charlotte: 30: 60, 60, 60, 60
Charlotte: 40: 75, 75, 75, 75
Charlotte: 50: 90, 90, 90, 90
Charlotte: 60: 104, 104, 104, 104
Charlotte: 70: 119, 119, 119, 119
Charlotte: 80: 134, 134, 134, 134
Charlotte: 90: 159, 159, 159, 159
Charlotte: 100: 174, 174, 174, 174
Charlotte: 110: 189, 189, 189, 189
Charlotte: 120: 204, 204, 204, 204
Charlotte: 130: 219, 219, 219, 219
Charlotte: 140: 234, 234, 234, 234
Charlotte: 150: 249, 249, 249, 249
Charlotte: 160: 264, 264, 264, 264
Charlotte: 170: 279, 279, 279, 279
Charlotte: 180: 294, 294, 294, 294
Charlotte: 190: 309, 309, 309, 309
Charlotte: 200: 324, 324, 324, 324
Charlotte: 210: 339, 339, 339, 339
Charlotte: 220: 354, 354, 354, 354
Charlotte: 230: 369, 369, 369, 369
Charlotte: 240: 384, 384, 384, 384
Charlotte: 250: 399, 399, 399, 399
Charlotte: 260: 414, 414, 414, 414
Charlotte: 270: 429, 429, 429, 429
Charlotte: 280: 444, 444, 444, 444
Charlotte: 290: 459,

Boston: 250: 3472, 3472, 3472, 3472
Boston: 260: 3487, 3487, 3487, 3487
Boston: 270: 3502, 3502, 3502, 3502
Boston: 280: 3517, 3517, 3517, 3517
Boston: 290: 3532, 3532, 3532, 3532
Boston: 300: 3547, 3547, 3547, 3547
Boston: 310: 3562, 3562, 3562, 3562
Boston: 320: 3577, 3577, 3577, 3577
Boston: 330: 3592, 3592, 3592, 3592
Boston: 340: 3607, 3607, 3607, 3607
Boston: 350: 3622, 3622, 3622, 3622
Boston: 360: 3637, 3637, 3637, 3637
Boston: 370: 3652, 3652, 3652, 3652
Boston: 380: 3667, 3667, 3667, 3667
Boston: 390: 3682, 3682, 3682, 3682
Boston: 400: 3697, 3697, 3697, 3697
Boston: 410: 3712, 3712, 3712, 3712
Boston: 420: 3727, 3727, 3727, 3727
Boston: 430: 3742, 3742, 3742, 3742
Boston: 440: 3757, 3757, 3757, 3757
Boston: 450: 3772, 3772, 3772, 3772
Boston: 460: 3787, 3787, 3787, 3787
Boston: 470: 3802, 3802, 3802, 3802
Boston: 480: 3817, 3817, 3817, 3817
Boston: 490: 3832, 3832, 3832, 3832
Washington+City+DC: 0: 3847, 3847, 3847, 3847
Washington+City+DC: 10: 3862, 3862, 3862, 3862
Washing

Los+Angeles: 330: 6691, 6691, 6691, 6691
Los+Angeles: 340: 6706, 6706, 6706, 6706
Los+Angeles: 350: 6721, 6721, 6721, 6721
Los+Angeles: 360: 6736, 6736, 6736, 6736
Los+Angeles: 370: 6751, 6751, 6751, 6751
Los+Angeles: 380: 6766, 6766, 6766, 6766
Los+Angeles: 390: 6781, 6781, 6781, 6781
Los+Angeles: 400: 6796, 6796, 6796, 6796
Los+Angeles: 410: 6811, 6811, 6811, 6811
Los+Angeles: 420: 6836, 6836, 6836, 6836
Los+Angeles: 430: 6851, 6851, 6851, 6851
Los+Angeles: 440: 6866, 6866, 6866, 6866
Los+Angeles: 450: 6881, 6881, 6881, 6881
Los+Angeles: 460: 6896, 6896, 6896, 6896
Los+Angeles: 470: 6911, 6911, 6911, 6911
Los+Angeles: 480: 6926, 6926, 6926, 6926
Los+Angeles: 490: 6941, 6941, 6941, 6941
San+Antonio: 0: 6956, 6956, 6956, 6956
San+Antonio: 10: 6971, 6971, 6971, 6971
San+Antonio: 20: 6986, 6986, 6986, 6986
San+Antonio: 30: 7001, 7001, 7001, 7001
San+Antonio: 40: 7016, 7016, 7016, 7016
San+Antonio: 50: 7031, 7031, 7031, 7031
San+Antonio: 60: 7046, 7046, 7046, 7046
San+Antonio: 70: 7061, 7

St.+Louis: 440: 9945, 9945, 9945, 9945
St.+Louis: 450: 9960, 9960, 9960, 9960
St.+Louis: 460: 9975, 9975, 9975, 9975
St.+Louis: 470: 9990, 9990, 9990, 9990
St.+Louis: 480: 10005, 10005, 10005, 10005
St.+Louis: 490: 10020, 10020, 10020, 10020
Seattle: 0: 10045, 10045, 10045, 10045
Seattle: 10: 10060, 10060, 10060, 10060
Seattle: 20: 10075, 10075, 10075, 10075
Seattle: 30: 10090, 10090, 10090, 10090
Seattle: 40: 10105, 10105, 10105, 10105
Seattle: 50: 10120, 10120, 10120, 10120
Seattle: 60: 10135, 10135, 10135, 10135
Seattle: 70: 10150, 10150, 10150, 10150
Seattle: 80: 10165, 10165, 10165, 10165
Seattle: 90: 10180, 10180, 10180, 10180
Seattle: 100: 10195, 10195, 10195, 10195
Seattle: 110: 10210, 10210, 10210, 10210
Seattle: 120: 10225, 10225, 10225, 10225
Seattle: 130: 10240, 10240, 10240, 10240
Seattle: 140: 10255, 10255, 10255, 10255
Seattle: 150: 10270, 10270, 10270, 10270
Seattle: 160: 10285, 10285, 10285, 10285
Seattle: 170: 10300, 10300, 10300, 10300
Seattle: 180: 10315, 10315, 103

In [259]:
# forming dataframe
data = {'Titles' : titles, 'Company' : companies, 'Location' : locations, 'Salary' : salaries}
df = pd.DataFrame(data)

# replacing all null salary values with true NaN
df.replace('None', np.nan, inplace=True)

# saving dataframe 
df.to_csv('jobs.csv', encoding='utf-8')

print (df.shape)
print (df.dtypes)
df.head()

(11570, 4)
Company     object
Location    object
Salary      object
Titles      object
dtype: object


Unnamed: 0,Company,Location,Salary,Titles
0,Sealed Air Corporation,"Charlotte, NC 28208",,Data Scientist IT Architecture-NSEE
1,Wells Fargo,"Charlotte, NC",,Data Scientist Lead - Quantitative Analytics C...
2,Principle Solutions Group,"Charlotte, NC 28202",,Data Scientist
3,JLL,"Charlotte, NC",,Research Analyst
4,Carolinas HealthCare System,"Charlotte, NC 28203 (Dilworth area)",,Statistician/Intermediate - Levine Cancer Inst...


In [260]:
# counting null salary values 
df.Salary.isnull().sum()

11003

Dropping all null values, which is a considerable portion of the dataframe.

In [261]:
df = df.dropna()
print df.shape
df.head()

(567, 4)


Unnamed: 0,Company,Location,Salary,Titles
11,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate
19,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ..."
26,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst
32,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst
33,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist


In [287]:
# Dropping all duplicates
df.drop_duplicates(['Company', 'Location', 'Titles'], inplace=True)

In [289]:
# ONLY INCLUDING ANNUAL SALARIES. THIS WILL RID THE DATAFRAME OF HOURLY INTERNSHIPS OR MONTHLY PROGRAMS
df = df[['year' in str(s) for s in df.Salary]]
df.head()

Unnamed: 0,Company,Location,Salary,Titles
11,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate
19,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ..."
26,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst
33,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist
41,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist


In [290]:
# converting salary to string type
df['New Salary'] = df.Salary.apply(lambda x: str(x))

In [291]:
# getting rid of wording, $, and - to convert to numerical value
df['New Salary'] = df['New Salary'].str.replace('a year', '')
df['New Salary'] = df['New Salary'].str.replace('$', '')
df['New Salary'] = df['New Salary'].str.replace('-', '')
df.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary
11,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']"
19,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']"
26,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']"
33,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']"
41,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']"


In [292]:
# separating ranges of salaries and then going to use average for predictions
df['SalaryII'] = df['New Salary'].str.extract('(\d+\,\d+)', expand=True)
df['SalaryIII'] = df['New Salary'].str.extract('( \d+\,\d+)', expand=True)
df.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII
11,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000,45000
19,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000,130000
26,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000,180000
33,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000,160000
41,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000,120000


In [293]:
# getting rid of commas to convert to floats 
df['SalaryII'] = df['SalaryII'].str.replace(',','')
df['SalaryIII'] = df['SalaryIII'].str.replace(',','')
df.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII
11,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000,45000
19,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000,130000
26,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000,180000
33,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000,160000
41,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000,120000


In [294]:
# converting to floats 
df['SalaryII'] = df["SalaryII"].astype(float)
df['SalaryIII'] = df["SalaryIII"].astype(float)
df.dtypes

Company        object
Location       object
Salary         object
Titles         object
New Salary     object
SalaryII      float64
SalaryIII     float64
dtype: object

In [295]:
# forming new column for average salary to use for predictions
df['Avg Salary'] = df[['SalaryII', 'SalaryIII']].mean(axis=1)
df.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII,Avg Salary
11,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000.0,45000.0,40000.0
19,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000.0,130000.0,120000.0
26,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000.0,180000.0,140000.0
33,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000.0,160000.0,150000.0
41,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000.0,120000.0,115000.0


### Saving results as a CSV

In [298]:
df.to_csv('jobs.csv', encoding='utf-8')

## Predicting salaries using Random Forests + Another Classifier


#### Load in the the data of scraped salaries

In [299]:
frame = pd.read_csv('../Web-Scraping-Project/jobs.csv')
frame.drop('Unnamed: 0', axis=1, inplace=True)

In [300]:
frame.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII,Avg Salary
0,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000.0,45000.0,40000.0
1,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000.0,130000.0,120000.0
2,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000.0,180000.0,140000.0
3,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000.0,160000.0,150000.0
4,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000.0,120000.0,115000.0


#### We want to predict a binary variable - whether the salary was low or high. Compute the median salary and create a new binary variable that is true when the salary is high (above the median)

We could also perform Linear Regression (or any regression) to predict the salary value here. Instead, we are going to convert this into a _binary_ classification problem, by predicting two classes, HIGH vs LOW salary.

While performing regression may be better, performing classification may help remove some of the noise of the extreme salaries. We don't _have_ to choose the `median` as the splitting point - we could also split on the 75th percentile or any other reasonable breaking point.

In fact, the ideal scenario may be to predict many levels of salaries, 

In [301]:
median = frame['Avg Salary'].median()
print median


92500.0


In [303]:
# Creating new binary column based on if AVG Salary is above or below median
frame['High Salary?'] = np.where(frame['Avg Salary'] > median, 1, 0)
frame.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII,Avg Salary,High Salary?
0,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000.0,45000.0,40000.0,0
1,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000.0,130000.0,120000.0,1
2,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000.0,180000.0,140000.0,1
3,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000.0,160000.0,150000.0,1
4,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000.0,120000.0,115000.0,1


In [304]:
frame.dtypes

Company          object
Location         object
Salary           object
Titles           object
New Salary       object
SalaryII        float64
SalaryIII       float64
Avg Salary      float64
High Salary?      int64
dtype: object

In [305]:
#creating new columns, seperating the Location column into two strings, seperated by a comma, 
# this will isolate the city name
frame['Location1'] = frame['Location'].apply(lambda x: x.split(','))

In [306]:
# creating new column of city only for each observation
frame['City'] = frame['Location1'].apply(lambda x: x[0])
frame

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII,Avg Salary,High Salary?,Location1,City
0,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000.0,45000.0,40000.0,0,"[Charlotte, NC 28202 (Downtown Charlotte area)]",Charlotte
1,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000.0,130000.0,120000.0,1,"[Charlotte, NC]",Charlotte
2,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000.0,180000.0,140000.0,1,"[Charlotte, NC]",Charlotte
3,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000.0,160000.0,150000.0,1,"[Charlotte, NC 28202 (Downtown Charlotte area)]",Charlotte
4,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000.0,120000.0,115000.0,1,"[Charlotte, NC]",Charlotte
5,Department of Agriculture,"Gastonia, NC","[$33,012 - $42,921 a year]",Physical Science Technician,"[u'33,012 42,921 ']",33012.0,42921.0,37966.5,0,"[Gastonia, NC]",Gastonia
6,"Kennedy Unlimited Inc, Professional Staffing","Charlotte, NC","[$130,000 - $140,000 a year]",Predictive Analytics (Machine Learning),"[u'130,000 140,000 ']",130000.0,140000.0,135000.0,1,"[Charlotte, NC]",Charlotte
7,"Kennedy Unlimited Inc, Professional Staffing","Charlotte, NC","[$120,000 - $150,000 a year]",Analytics Manager (Machine Learning),"[u'120,000 150,000 ']",120000.0,150000.0,135000.0,1,"[Charlotte, NC]",Charlotte
8,"Kennedy Unlimited Inc, Professional Staffing","Charlotte, NC","[$140,000 - $165,000 a year]",Data Scientist Data Modeling (Machine Learning...,"[u'140,000 165,000 ']",140000.0,165000.0,152500.0,1,"[Charlotte, NC]",Charlotte
9,"Kennedy Unlimited Inc, Professional Staffing","Charlotte, NC","[$140,000 - $155,000 a year]",Quantitative Analyst (Quantitative Research PhD),"[u'140,000 155,000 ']",140000.0,155000.0,147500.0,1,"[Charlotte, NC]",Charlotte


#### Create a Random Forest model to predict High/Low salary using Sklearn. Start by ONLY using the location as a feature. 

In [307]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold



In [309]:
# creating dummy variables for city
df_city = pd.get_dummies(frame['City'])
df_city.head()

Unnamed: 0,Arlington,Austin,Beltsville,Boston,Buckley AFB,Cambridge,Charlotte,Chicago,Concord,Dallas,...,San Diego,Scottsdale,Seattle,Silver Spring,Springfield,St. Louis,Tysons,Van Nuys,Washington,Wellesley
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [311]:
# concatenating city frame with original dataframe so I can predict salary with city
df_city = pd.concat([frame, df_city], axis=1)

In [312]:
df_city.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII,Avg Salary,High Salary?,Location1,...,San Diego,Scottsdale,Seattle,Silver Spring,Springfield,St. Louis,Tysons,Van Nuys,Washington,Wellesley
0,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000.0,45000.0,40000.0,0,"[Charlotte, NC 28202 (Downtown Charlotte area)]",...,0,0,0,0,0,0,0,0,0,0
1,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000.0,130000.0,120000.0,1,"[Charlotte, NC]",...,0,0,0,0,0,0,0,0,0,0
2,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000.0,180000.0,140000.0,1,"[Charlotte, NC]",...,0,0,0,0,0,0,0,0,0,0
3,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000.0,160000.0,150000.0,1,"[Charlotte, NC 28202 (Downtown Charlotte area)]",...,0,0,0,0,0,0,0,0,0,0
4,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000.0,120000.0,115000.0,1,"[Charlotte, NC]",...,0,0,0,0,0,0,0,0,0,0


In [314]:
# establishing an X and y
X = df_city.drop(['High Salary?', 'Titles', 'Location', 'Salary', 'Company', 'New Salary', 'SalaryII', 'SalaryIII', 
                 'Avg Salary', 'Location1', 'City'], axis=1)
y = df_city['High Salary?']

In [315]:
model = RandomForestClassifier()
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [316]:
a = model.feature_importances_
zip(a, X)

[(0.094179296174717181, 'Arlington'),
 (0.0095617493485209218, 'Austin'),
 (0.0082871170109823315, 'Beltsville'),
 (0.019996045311866696, 'Boston'),
 (0.01720390631897465, 'Buckley AFB'),
 (0.018466526613505518, 'Cambridge'),
 (0.058687665969864689, 'Charlotte'),
 (0.03101972414760467, 'Chicago'),
 (0.011593315815148771, 'Concord'),
 (0.013296518406696206, 'Dallas'),
 (0.018461298317923945, 'Denver'),
 (0.015975708013616134, 'Downers Grove'),
 (0.017896021218198704, 'El Segundo'),
 (0.025908267122026336, 'Fort Meade'),
 (0.01120991021881183, 'Fort Sam Houston'),
 (0.0092135578689917855, 'Framingham'),
 (0.0053293539934300129, 'Gaithersburg'),
 (0.014376649152808557, 'Gastonia'),
 (0.021394436791523381, 'Golden'),
 (0.011941821927479248, 'Granite City'),
 (0.031234060610676691, 'Greenbelt'),
 (0.049383009041895534, 'Houston'),
 (0.010932153245856262, 'Lackland AFB'),
 (0.016246759539921496, 'Lexington'),
 (0.016448517901372854, 'Long Beach'),
 (0.027130809194403121, 'Los Angeles'),
 (0.

#### Create a few new variables in your dataframe to represent interesting features of a job title.
- For example, create a feature that represents whether 'Senior' is in the title or whether 'Manager' is in the title. 
- Then build a new Random Forest with these features. Do they add any value?
- After creating these variables, use count-vectorizer to create features based on the words in the job titles.
- Build a new random forest model with location and these new features included.

In [317]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

In [318]:
cvec = CountVectorizer()

In [319]:
cvec.fit(frame['Titles'])

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [320]:
cvecdata= cvec.transform(frame['Titles'])

In [321]:
# creating count vectorizer DataFrame
cvecframe  = pd.DataFrame(cvecdata.todense(),
             columns=cvec.get_feature_names())


frame.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII,Avg Salary,High Salary?,Location1,City
0,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000.0,45000.0,40000.0,0,"[Charlotte, NC 28202 (Downtown Charlotte area)]",Charlotte
1,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000.0,130000.0,120000.0,1,"[Charlotte, NC]",Charlotte
2,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000.0,180000.0,140000.0,1,"[Charlotte, NC]",Charlotte
3,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000.0,160000.0,150000.0,1,"[Charlotte, NC 28202 (Downtown Charlotte area)]",Charlotte
4,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000.0,120000.0,115000.0,1,"[Charlotte, NC]",Charlotte


In [322]:
cvecframe.head()

Unnamed: 0,11,12,13,1301,14,1529,1530,administrator,advisor,ai,...,toxicology,translational,until,value,vice,vitro,vivo,vp,water,web
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [324]:
# concatenating dataframes to make predictions based on words
cvecframe = pd.concat([frame, cvecframe], axis=1)

In [326]:
pd.options.display.max_columns = 999

In [327]:
cvecframe.head()

Unnamed: 0,Company,Location,Salary,Titles,New Salary,SalaryII,SalaryIII,Avg Salary,High Salary?,Location1,City,11,12,13,1301,14,1529,1530,administrator,advisor,ai,algorithm,als,analysis,analyst,analytical,analytics,and,architect,assay,assistant,assoc,associate,aws,azure,bakery,based,behavioral,big,bio,bioinformatics,biology,biostatistician,biostatistics,business,cardiology,census,certifying,chancellor,charlotte,chemist,chemistry,chinese,clinical,cloud,cnup,coast,computational,computer,contracting,core,cyber,data,de,deep,deu,develop,developer,developers,development,diagnostic,digital,director,discovery,disease,doc,downtown,drug,ec,economist,embedded,engineer,engineering,engineers,enterprise,entry,environmental,epidemiologist,erlang,experimental,filled,finance,for,full,general,genetics,geoint,gg,government,green,gs,guard,gyn,gynecologic,hadoop,health,human,hydrologist,ii,iii,in,infectious,information,infrastructure,institute,institutional,integration,intelligence,interdisciplinary,java,jmw,jnt,junior,laboratory,language,lcms,lead,learning,level,machine,manager,marine,marketing,mathematical,medical,medicinal,metabolomics,mid,modeler,modeling,natural,nc,neuroscience,nlp,npo,ob,of,officer,oncology,open,operation,operations,part,pb,pediatrics,pharmacology,phd,physical,planner,polyglot,predictive,principal,processing,products,professional,program,programmer,project,prospect,python,qa,qi,quality,quantitative,radiation,radiology,reporting,reproductiv,research,researcher,rn,sas,science,sciences,scientist,securitized,security,senior,services,social,software,solution,spark,speaking,specialist,sql,sr,staff,startup,statistical,statistician,steward,supervising,supervisor,supervisory,survey,surveyor,survivability,system,systems,tableau,team,technician,technologist,threat,time,tlg,toxicology,translational,until,value,vice,vitro,vivo,vp,water,web
0,Corvid,"Charlotte, NC 28202 (Downtown Charlotte area)","[$35,000 - $45,000 a year]",Business Operations Associate,"[u'35,000 45,000 ']",35000.0,45000.0,40000.0,0,"[Charlotte, NC 28202 (Downtown Charlotte area)]",Charlotte,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Nityo Infotech Services Pvt. Ltd.,"Charlotte, NC","[$110,000 - $130,000 a year]","Machine Learning Developer - Charlotte, NC // ...","[u'110,000 130,000 ']",110000.0,130000.0,120000.0,1,"[Charlotte, NC]",Charlotte,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,Prospect Infosys Inc.,"Charlotte, NC","[$100,000 - $180,000 a year]",Sr Quantitative Finance Analyst,"[u'100,000 180,000 ']",100000.0,180000.0,140000.0,1,"[Charlotte, NC]",Charlotte,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Principle Solutions Group,"Charlotte, NC 28202 (Downtown Charlotte area)","[$140,000 - $160,000 a year]",Data Scientist,"[u'140,000 160,000 ']",140000.0,160000.0,150000.0,1,"[Charlotte, NC 28202 (Downtown Charlotte area)]",Charlotte,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Oliver James Associates,"Charlotte, NC","[$110,000 - $120,000 a year]",Senior Data Scientist,"[u'110,000 120,000 ']",110000.0,120000.0,115000.0,1,"[Charlotte, NC]",Charlotte,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [329]:
# Assigning values to X and y 
X1 = cvecframe.drop(['High Salary?', 'Titles', 'Location', 'Salary', 'Company', 'New Salary', 'SalaryII', 'SalaryIII', 
                 'Avg Salary', 'Location1', 'City', '11', '12', '13', '1301','14', '1529', '1530'], axis=1)
y1 = cvecframe['High Salary?']

In [330]:
model_1 = RandomForestClassifier()
model_1.fit(X1, y1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [331]:
d = model_1.feature_importances_
zip(d, X1)

[(0.00037653725944831567, u'administrator'),
 (0.0012912006315410982, u'advisor'),
 (0.00019311637261175597, u'ai'),
 (0.0019951937923120143, u'algorithm'),
 (0.0, u'als'),
 (0.0044972620581028144, u'analysis'),
 (0.028760994007099756, u'analyst'),
 (0.00434836353885386, u'analytical'),
 (0.011148543911877546, u'analytics'),
 (0.00020200429019131974, u'and'),
 (0.0031451783975356253, u'architect'),
 (0.0, u'assay'),
 (0.0036608306448606838, u'assistant'),
 (0.0058772149167165026, u'assoc'),
 (0.0093336764598454108, u'associate'),
 (0.0049149774045045141, u'aws'),
 (0.0018011398594287172, u'azure'),
 (0.0089601444540816713, u'bakery'),
 (0.0020945990977872713, u'based'),
 (0.0017842818636374849, u'behavioral'),
 (0.0067764102067799882, u'big'),
 (0.0019287376857046283, u'bio'),
 (0.007288895567317012, u'bioinformatics'),
 (0.0, u'biology'),
 (0.0, u'biostatistician'),
 (0.0014140262730011424, u'biostatistics'),
 (0.0036770074376959996, u'business'),
 (0.0013223116938207576, u'cardiology

#### Use cross-validation in scikit-learn to evaluate the model above. 
- Evaluate the accuracy of the model, as well as any other metrics you feel are appropriate. 

In [269]:
from sklearn.model_selection import GridSearchCV

In [271]:
# setting up parameters to grid search
n_estimators = [4, 8, 12]
criterion = ['gini', 'entropy']
max_depth = [1, 2, 4, 8]

# establishing model
model = RandomForestClassifier()

# grid searching
grid = GridSearchCV(estimator=model4, cv=3, param_grid=dict(n_estimators = n_estimators, criterion=criterion, max_depth=max_depth))

# fitting model
print(grid.fit(X1, y1))

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [4, 8, 12], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 4, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)


In [272]:
print grid.best_score_
print grid.best_estimator_.criterion
print grid.best_estimator_.n_estimators
print grid.best_estimator_.max_features

0.529411764706
gini
8
auto
