# Bay Area Job Listings

In [38]:
# Dependencies & Setup
import pandas as pd
import numpy as np
import requests
import json
from os.path import exists
import simplejson as json 

# Retrieve Google API Key from config.py
from config_3 import gkey

In [39]:
# File to Load
file_to_load = "data/data_analyst_sf_final.csv"

# Read Scraped Data (CSV File) & Store Into Pandas DataFrame
job_listings_df = pd.read_csv(file_to_load, encoding="ISO-8859-1")

In [40]:
# Drop BA NaN's
revised_job_listings_df = job_listings_df.dropna()
revised_job_listings_df.head()

Unnamed: 0,company,job_title,location
0,Bey,(Permanent Job) Data Scientist Product Analytics,"San Francisco, CA, US"
1,Bechtel Corporation,2019 Student Internship (U.S.) - Big Data Anal...,"San Francisco, CA, US"
2,Bechtel Corporation,2019 Student Internship (U.S.) - Big Data Anal...,"Hayward, CA, US"
3,ProClinical,Accountant / Supplier Master Data Analyst,"San Mateo, CA, US"
4,BlackRock,Analyst - Index Data Operations,"San Francisco, US-CA"


In [41]:
# Reorganize BA File Column Names
organized_job_listings_df = revised_job_listings_df.rename(columns={"company":"Company Name", 
                                                      "job_title":"Job Title",  
                                                      "location":"Location"})
organized_job_listings_df.head()

Unnamed: 0,Company Name,Job Title,Location
0,Bey,(Permanent Job) Data Scientist Product Analytics,"San Francisco, CA, US"
1,Bechtel Corporation,2019 Student Internship (U.S.) - Big Data Anal...,"San Francisco, CA, US"
2,Bechtel Corporation,2019 Student Internship (U.S.) - Big Data Anal...,"Hayward, CA, US"
3,ProClinical,Accountant / Supplier Master Data Analyst,"San Mateo, CA, US"
4,BlackRock,Analyst - Index Data Operations,"San Francisco, US-CA"


In [42]:
# Extract Only Job Titles with "Data" as String
new_organized_job_listings_df = organized_job_listings_df[organized_ba_job_listings_df["Job Title"].
                                                            str.contains("Data", case=True)]
new_organized_job_listings_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Company Name,Job Title,Location
0,Bey,(Permanent Job) Data Scientist Product Analytics,"San Francisco, CA, US"
1,Bechtel Corporation,2019 Student Internship (U.S.) - Big Data Anal...,"San Francisco, CA, US"
2,Bechtel Corporation,2019 Student Internship (U.S.) - Big Data Anal...,"Hayward, CA, US"
3,ProClinical,Accountant / Supplier Master Data Analyst,"San Mateo, CA, US"
4,BlackRock,Analyst - Index Data Operations,"San Francisco, US-CA"


In [43]:
print(len(new_organized_job_listings_df))

554


In [72]:
new_organized_job_listings_df.dtypes

Company Name       object
Job Title          object
Location           object
company_address    object
dtype: object

In [74]:
# Extract Unique Locations
new_organized_job_listings_df["company_address"] = new_organized_job_listings_df["Company Name"] + ", " + new_organized_job_listings_df["Location"]
unique_locations = new_organized_job_listings_df["company_address"].unique().tolist()

unique_locations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


['Bey, San Francisco, CA, US',
 'Bechtel Corporation, San Francisco, CA, US',
 'Bechtel Corporation, Hayward, CA, US',
 'ProClinical, San Mateo, CA, US',
 'BlackRock, San Francisco, US-CA',
 'Google, San Francisco, CA, US',
 'Roche, South San Francisco, CA, US',
 'Digitas North America, San Francisco, California',
 'Genentech, South San Francisco, CA, US',
 'RPX Corporation, One Market Plaza Steuart Tower, Suite 1100, San Francisco, CA 94105, US',
 'Evolver, San Francisco, CA, US',
 'Superior Court of CA, County of San Mateo, San Mateo, CA, US',
 'Essex Property Trust, San Mateo, CA, US',
 'Essex Corporation, San Mateo, CA, US',
 'Armanino LLP, San Ramon, California',
 'Impetus, San Francisco, CA, US',
 'ARC Document Solutions, San Ramon, CA, US',
 'Planet Pharma, South San Francisco, CA, US',
 'InfiCare Software Technologies, San Francisco, CA, US',
 'Asian Art Museum, 200 larkin street, san francisco, CA 94102, US',
 'Harnham, San Francisco, CA, US',
 'Apex Systems, San Francisco, Ca

In [78]:
# Extract Only Company Names to Pass to Google Maps API to Gather GeoCoordinates
company = new_organized_job_listings_df[["Company Name"]].astype(str)
company.head()

Unnamed: 0,Company Name
0,Bey
1,Bechtel Corporation
2,Bechtel Corporation
3,ProClinical
4,BlackRock


In [82]:
# What are the geocoordinates (latitude/longitude) of the Company Names?
company_list = list(company["Company Name"])

# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/geocode/json"
new_json = []

for location in company_list:
    params = {"address": location + ", San Francisco CA", "key": gkey}
    
    # Run Request
    response = requests.get(base_url, params=params)

    try: 
        # Extract lat/lng
        companies_geo = response.json()
        lat = companies_geo["results"][0]["geometry"]["location"]["lat"]
        lng = companies_geo["results"][0]["geometry"]["location"]["lng"]
        new_json.append({"company": location,"lat": lat,"lng": lng})
    except IndexError: 
        print(location)

SHIFT.com, San Francisco, CA, US
Zââ¢M Fitness, Redwood City, CA, US
VSCOÂ¬Ã, San Francisco, California, United States
ipsy, San Mateo, CA, US
Pill Club, San Carlos, CA, US
VSCOÂ¬Ã, Oakland, CA, US


In [None]:
# What are the GeoCoordinates (Latitude/Longitude) of the Companies?
# company_list = list(unique_locations["Company Name"])

# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/geocode/json"
new_json = []

counter = 1

for location in unique_locations:
    params = {"address": location, "key": gkey}
    
    # Run Request
    response = requests.get(base_url, params=params)

    try: 
        # Extract lat/lng
        companies_geo = response.json()
        # print(companies_geo)
        lat = companies_geo["results"][0]["geometry"]["location"]["lat"]
        lng = companies_geo["results"][0]["geometry"]["location"]["lng"]
        new_json.append({"company": location,"lat": lat,"lng": lng})
        print(counter)
        counter += 1
    except IndexError:
        print(location)