In [125]:
import pandas as pd
from datetime import date

# Display settings for this notebook: never truncate columns, show up to 250 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 250)

# Read the raw jobs CSV into the working frame used by the cells below.
df = pd.read_csv("workspace/scrape/data/waymo_jobs_raw.csv")

In [126]:
def phrase_drop(df, col, phrase, regex=True):
    """Drop every row whose ``col`` value contains ``phrase`` and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    col : str
        Name of the string column to search.
    phrase : str
        Text to look for (case-sensitive).
    regex : bool, default True
        When True (the historical behaviour) ``phrase`` is interpreted as a
        regular expression. Pass False to match it literally, which is
        required for phrases containing regex metacharacters such as "C++".

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``col`` value does not contain ``phrase``.
        Rows with a missing value in ``col`` are kept.
    """
    # Build the match mask once and reuse it for both the count and the
    # filter; na=False makes missing values count as "no match".
    matches = df[col].str.contains(phrase, na=False, regex=regex)
    to_drop = matches.sum()
    print(f"Removing {to_drop} rows due to {phrase} in {col}. Rows remaining: {len(df) - to_drop}")
    return df[~matches]

In [127]:
# Filter out postings whose Title contains any of these phrases. They are
# applied one at a time, in this order, so each printed count reflects the
# rows still present after the previous phrase was removed.
for excluded_phrase in (
    "PhD",
    "Principal",
    "Director",
    "Manager",
    "Android",
    "Senior Research Scientist",
):
    df = phrase_drop(df, "Title", excluded_phrase)

Removing 3 rows due to PhD in Title. Rows remaining: 210
Removing 4 rows due to Principal in Title. Rows remaining: 206
Removing 5 rows due to Director in Title. Rows remaining: 201
Removing 16 rows due to Manager in Title. Rows remaining: 185
Removing 6 rows due to Android in Title. Rows remaining: 179
Removing 3 rows due to Senior Research Scientist in Title. Rows remaining: 176


In [128]:
# Split each Title at its first '-' or ',' into a Role part and a Team part.
# Missing titles are blanked *before* the string cast: on object-dtype columns
# astype(str) turns NaN into the literal text "nan", which would otherwise
# end up as a bogus Role value.
# NOTE(review): the pattern also splits on a hyphen inside a word
# (e.g. "Full-Stack Engineer" -> "Full" / "Stack Engineer") — confirm that no
# scraped title relies on an intra-word hyphen.
_parts = df["Title"].fillna("").astype(str).str.split(r"\s*[-,]\s*", n=1, expand=True)

# expand=True only produces as many columns as the widest split, so both
# lookups are guarded; titles without a delimiter get a missing value in
# column 1, which fillna("") turns into an empty Team.
df["Role"] = _parts[0].fillna("").astype(str).str.strip() if _parts.shape[1] > 0 else ""
df["Team"] = _parts[1].fillna("").astype(str).str.strip() if _parts.shape[1] > 1 else ""
df = df.drop(columns=["Title"])

In [129]:
# Move Role, Team and URL (whichever of them exist) to the front; all other
# columns follow in their current order. No Salary column is listed here.
first = [name for name in ("Role", "Team", "URL") if name in df.columns]
rest = [name for name in df.columns if name not in first]
df = df[[*first, *rest]]

# Preview the first ten rows of the reordered frame.
df.head(10)

Unnamed: 0,Role,Team,URL,Department,Location
0,Analysis Infra SWE,,https://careers.withwaymo.com/jobs/analysis-in...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA"
3,Data Engineer,,https://careers.withwaymo.com/jobs/data-engine...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA"
4,Data Scientist,,https://careers.withwaymo.com/jobs/data-scient...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA | SAN FRANCISCO, CAL..."
5,Data Scientist,,https://careers.withwaymo.com/jobs/data-scient...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA | SAN FRANCISCO, CAL..."
13,Fullstack Engineer,Waymo Applications and Tools,https://careers.withwaymo.com/jobs/fullstack-e...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA"
14,Infra SWE Query & Storage,,https://careers.withwaymo.com/jobs/infra-swe-q...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA"
15,Machine Learning Engineer,,https://careers.withwaymo.com/jobs/machine-lea...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA"
16,Machine Learning Engineer,Mapping,https://careers.withwaymo.com/jobs/machine-lea...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA"
17,Machine Learning Engineer,Mapping,https://careers.withwaymo.com/jobs/machine-lea...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA"
19,Machine Learning Engineer,ADV Systems,https://careers.withwaymo.com/jobs/machine-lea...,SOFTWARE ENGINEERING,"MOUNTAIN VIEW, CALIFORNIA | SAN FRANCISCO, CAL..."


In [130]:
# Write the frame to a CSV whose name carries today's ISO date; the index is
# not written.
df.to_csv(
    f"workspace/scrape/data/waymo_jobs_{date.today().isoformat()}.csv",
    index=False,
)