# Job Listings

In [1]:
# Dependencies & Setup
import pandas as pd
import numpy as np
import requests
import json
from os.path import exists
import simplejson as json 

# Retrieve Google API Key from config.py
from config_3 import gkey

In [2]:
# File to Load
wc_file = "data/west_coast_job_listings.csv"
ba_file = "data/bay_area_job_listings.csv"

# Read Scraped Data (CSV File) & Store Into Pandas DataFrame
wc_job_listings_df = pd.read_csv(wc_file, encoding="ISO-8859-1")
ba_job_listings_df = pd.read_csv(ba_file, encoding="ISO-8859-1")

In [3]:
# Drop WC NaN's
revised_wc_job_listings_df = wc_job_listings_df.dropna()
revised_wc_job_listings_df.head()

Unnamed: 0,job_title,company,rating,reviews,location,job_description
0,Research Systems Specialist - CTMS,Providence Health & Services,4 out of 5,235,"Portland, OR",Providence St. Joseph Health is calling a Rese...
1,Executive Assistant,Reed Business Information,4 out of 5,64,"Portland, OR",Executive Assistant and Assistant Office Manag...
3,Materials Management Systems Analyst,The Vancouver Clinic,2.9 out of 5,90,"Vancouver, WA 98664",We are growing and need your Supply Chain / Ma...
4,Investigator,Oregon Judicial Department,3.9 out of 5,15,"Salem, OR","$3,829 - $5,599 a monthCommission"
5,Project Manager Talent Development Limited Dur...,"Multnomah County, OR",4.1 out of 5,132,"Portland, OR","$65,115 - $97,673 a yearPart-time, Temporary, ..."


In [13]:
cleaned_wc_job_listings_df = revised_wc_job_listings_df.drop(columns=["rating", "reviews", "job_description"])
cleaned_wc_job_listings_df.head()

Unnamed: 0,job_title,company,location
0,Research Systems Specialist - CTMS,Providence Health & Services,"Portland, OR"
1,Executive Assistant,Reed Business Information,"Portland, OR"
3,Materials Management Systems Analyst,The Vancouver Clinic,"Vancouver, WA 98664"
4,Investigator,Oregon Judicial Department,"Salem, OR"
5,Project Manager Talent Development Limited Dur...,"Multnomah County, OR","Portland, OR"


In [4]:
# Drop BA NaN's
revised_ba_job_listings_df = ba_job_listings_df.dropna()
revised_ba_job_listings_df.head()

Unnamed: 0,company,job_title,location
0,RiskIQ,Data Scientist,"San Francisco, CA, US"
1,Invitae,Data Scientist,"San Francisco, CA, US"
2,Bey,Principal Data Scientist & Data Scientist (Pro...,"San Francisco, CA, US"
3,Automatic,Data Scientist - Automatic Labs,"San Francisco, CA, US"
4,Numerator,Data Scientist,"San Francisco, CA, US"


In [22]:
# Reorganize WC File Column Names
organized_wc_job_listings_df = cleaned_wc_job_listings_df.rename(columns={"company":"Company Name", 
                                                      "job_title":"Job Title",  
                                                      "location":"Location"}) 

# Extract Only Job Titles with "Data" as String
new_organized_wc_job_listings_df = organized_wc_job_listings_df[organized_wc_job_listings_df["Job Title"].
                                                            str.contains("Data", case=True)]
new_organized_wc_job_listings_df.head()

Unnamed: 0,Job Title,Company Name,Location
11,EMS/WIND SCADA Database Engineer,"AVANGRID RENEWABLES, LLC","Portland, OR"
15,Data Scientist,comScore,"Portland, OR 97204"
16,Senior Database Analyst/Administrator,Oregon Judicial Department,"Salem, OR"
17,Research Associate/Research Statistician/Data ...,Oregon Health & Science University,"Portland, OR"
19,Reporting/Data Analyst,Oregon Judicial Department,"Salem, OR"


In [6]:
print(len(new_organized_wc_job_listings_df))

265


In [31]:
# Extract Unique Locations
new_organized_wc_job_listings_df["company_address"] = new_organized_wc_job_listings_df["Company Name"] + ", " + new_organized_wc_job_listings_df["Location"]
unique_locations = new_organized_wc_job_listings_df["company_address"].unique().tolist()

print(len(unique_locations))

148


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
# Reorganize BA File Column Names
organized_ba_job_listings_df = revised_ba_job_listings_df.rename(columns={"company":"Company Name", 
                                                      "job_title":"Job Title", 
                                                      "location":"Location"}) 
organized_ba_job_listings_df.head()

Unnamed: 0,Company Name,Job Title,Location
0,RiskIQ,Data Scientist,"San Francisco, CA, US"
1,Invitae,Data Scientist,"San Francisco, CA, US"
2,Bey,Principal Data Scientist & Data Scientist (Pro...,"San Francisco, CA, US"
3,Automatic,Data Scientist - Automatic Labs,"San Francisco, CA, US"
4,Numerator,Data Scientist,"San Francisco, CA, US"


In [39]:
# Extract Only Company Names to Pass to Google Maps API to Gather GeoCoordinates
company = organized_ba_job_listings_df[["Company Name"]]
company.head()

Unnamed: 0,Company Name
0,RiskIQ
1,Invitae
2,Bey
3,Automatic
4,Numerator


In [40]:
# What are the geocoordinates (latitude/longitude) of the Company Names?
company_list = list(company["Company Name"])
# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/geocode/json"
new_json = []
for target_company in company_list:
#     print(target_company)
    params = {"address": target_company + ", San Francisco CA", "key": gkey}
#     print(params)
#     print("The Geocoordinates of LinkedIn Company Names")

    # Run Request
    response = requests.get(base_url, params=params)
#     print(response.url)

    # Extract lat/lng
    companies_geo = response.json()
    lat = companies_geo["results"][0]["geometry"]["location"]["lat"]
    lng = companies_geo["results"][0]["geometry"]["location"]["lng"]
    new_json.append({"company":target_company,"lat":lat,"lng":lng})
#     print(f"{target_company}, {lat}, {lng}")
print(new_json)

IndexError: list index out of range

In [37]:
# What are the GeoCoordinates (Latitude/Longitude) of the Companies?
# company_list = list(unique_locations["Company Name"])

# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/geocode/json"
new_json = []

counter = 1

for location in unique_locations:
    params = {"address": location, "key": gkey}
    
    # Run Request
    response = requests.get(base_url, params=params)

    try: 
        # Extract lat/lng
        companies_geo = response.json()
        # print(companies_geo)
        lat = companies_geo["results"][0]["geometry"]["location"]["lat"]
        lng = companies_geo["results"][0]["geometry"]["location"]["lng"]
        new_json.append({"company": location,"lat": lat,"lng": lng})
        print(counter)
        counter += 1
    except IndexError:
        print(location)

1
2
3
4
5
6
7
8
9
10
11
12
13
Airbnb
14
15
16
17
18
19
20
21
22
23
24


KeyboardInterrupt: 

In [41]:
# What are the geocoordinates (latitude/longitude) of the Company Names?
company_list = list(company["Company Name"])
# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/geocode/json"
new_json = []
for target_company in company_list:
#     print(target_company)
    params = {"address": target_company + ", San Francisco CA", "key": gkey}
#     print(params)
#     print("The Geocoordinates of LinkedIn Company Names")

    # Run Request
    response = requests.get(base_url, params=params)
#     print(response.url)

    # Extract lat/lng
    companies_geo = response.json()
    lat = companies_geo["results"][0]["geometry"]["location"]["lat"]
    lng = companies_geo["results"][0]["geometry"]["location"]["lng"]
    new_json.append({"company":target_company,"lat":lat,"lng":lng})
#     print(f"{target_company}, {lat}, {lng}")
print(new_json)

IndexError: list index out of range

In [None]:
print(new_json)

In [None]:
# Convert JSON into GeoJSON

geojson = {
    "type": "FeatureCollection",
    "features": [
    {
        "type": "Feature",
        "company": d["company"],
        "geometry" : {
            "type": "Point",
            "coordinates": [d["lat"], d["lng"]],
            },
     } for d in new_json]
}

print(geojson)

In [None]:
job_listing_coordinates = pd.DataFrame(new_json)
job_listing_coordinates

In [None]:
updated_job_listings = job_listings.merge(job_listing_coordinates, how="left", left_on="company_address", right_on="company")
updated_job_listings


# Drop NaN's
updated_job_listings_no_missing = updated_job_listings.dropna()
updated_job_listings_no_missing.head()



In [None]:
updated_job_listings[["company","lat","lng"]].to_dict()

In [None]:
json_job_listings = updated_job_listings[["company","lat","lng"]].to_json(orient="records")
json_job_listings

In [None]:
with open('data.json', 'w') as outfile:
    outfile.write(json_job_listings)