In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load the job-postings JSON file into a DataFrame, then show its column labels
# so the available fields are visible before any analysis.
df = pd.read_json(path_or_buf="../data/jobs/aug3.json")
df.columns


Index(['job_title', 'company_name', 'location', 'job_type', 'salary_range',
       'benefits', 'department', 'date_posted', 'applicants_count',
       'responsibilities', 'qualifications', 'experience_level',
       'required_education', 'technologies', 'industry', 'company_size',
       'company_description', 'remote_policy', 'job_url', 'additional_notes'],
      dtype='object')

In [3]:
# Column dtypes and non-null counts; info() prints directly to the cell output.
df.info()

# A notebook cell only renders the value of its LAST expression, so a bare
# describe() in the middle of the cell is computed and silently discarded.
# display() (provided by the IPython kernel) renders it explicitly.
display(df.describe())

# (rows, columns) -- shown as the cell's result.
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_title            79 non-null     object
 1   company_name         79 non-null     object
 2   location             79 non-null     object
 3   job_type             79 non-null     object
 4   salary_range         57 non-null     object
 5   benefits             79 non-null     object
 6   department           41 non-null     object
 7   date_posted          79 non-null     object
 8   applicants_count     77 non-null     object
 9   responsibilities     79 non-null     object
 10  qualifications       79 non-null     object
 11  experience_level     76 non-null     object
 12  required_education   21 non-null     object
 13  technologies         79 non-null     object
 14  industry             79 non-null     object
 15  company_size         76 non-null     object
 16  company_de

(79, 20)

In [4]:
# Preview the first five postings.
# NOTE(review): in the saved output, rows 2 and 3 look identical -- check for
# duplicate postings before trusting the counts plotted further down.
df.head(n=5)

Unnamed: 0,job_title,company_name,location,job_type,salary_range,benefits,department,date_posted,applicants_count,responsibilities,qualifications,experience_level,required_education,technologies,industry,company_size,company_description,remote_policy,job_url,additional_notes
0,Software Engineer,Coinbase,United States (Remote),"Full-time, Remote","$124,950 USD - $147,000 USD","[Medical, Dental, Vision, 401(k), Target bonus...",Security Platform Engineering,Reposted 1 week ago,Over 100 applicants,[Work on building capabilities and solutions f...,[Passionate about Coinbase's mission to increa...,,,"[Blockchain, Crypto, Web3]",Financial Services,"1,001-5,000 employees","Founded in June of 2012, Coinbase is a digital...",Remote,,Please be advised that each candidate may subm...
1,AI Software Engineer,Brado,"Dallas, TX","Full-time, Remote",,"[Health Care Plan (Medical, Dental & Vision), ...",,2 months ago,Over 100 applicants,[Design and Develop: Build a conversational en...,"[Aligns with our values: People, Commitment, A...",Mid-level,Bachelor's degree in computer science or relat...,"[Python, LLM, Azure, AWS, GCP, REST APIs, Terr...","Marketing Services, Healthcare, AI",51-200 employees,Brado is a consumer engagement firm reinventin...,"Remote, with a preference for candidates in th...",,
2,AI Product Engineer,Lightfield,"San Francisco, CA","Full-time, On-site",,[],,5 days ago,Over 100 applicants,[Collaborate with product leaders to identify ...,[Ability to ramp quickly on a tech stack that ...,,Degree in Computer Science or a related field,"[TypeScript, React, Next.js, Node.js, Apollo G...","Technology, Information and Internet",11-50 employees,Lightfield is a next-generation CRM that autom...,On-site,,"The team previously built Tome, a generative A..."
3,AI Product Engineer,Lightfield,"San Francisco, CA","Full-time, On-site",,[],,5 days ago,Over 100 applicants,[Collaborate with product leaders to identify ...,[Ability to ramp quickly on a tech stack that ...,,Degree in Computer Science or a related field,"[TypeScript, React, Next.js, Node.js, Apollo G...","Technology, Information and Internet",11-50 employees,Lightfield is a next-generation CRM that autom...,On-site,,"The team previously built Tome, a generative A..."
4,Product Engineer,Undisclosed (via Cerebral Valley),"San Francisco, CA","Full-time, On-site",,"[Dental insurance, Medical insurance, Vision i...",,2 days ago,14 applicants,[Shape the core product and reimagine how peop...,[3–8 years of experience shipping zero-to-one ...,Mid-Senior level,,"[Python, TypeScript, Postgres, GPU, LLM, GPT-4...","AI, Software",,This role is for an unnamed partner of Cerebra...,On-site,,This is the first product engineer role at the...


In [5]:
# Number of postings for each distinct 'job_type' value.
fig = px.histogram(
    data_frame=df,
    x="job_type",
    title="Distribution of Job Types",
)
fig.show()

In [6]:
# Number of postings for each distinct 'remote_policy' value.
fig = px.histogram(
    data_frame=df,
    x="remote_policy",
    title="Remote Policy",
)
fig.show()

In [7]:
# Number of postings for each distinct 'location' string.
fig = px.histogram(
    data_frame=df,
    x="location",
    title="Location Distribution",
)
fig.show()


In [8]:
# Number of postings for each distinct 'industry' string. Multi-industry values
# such as "Marketing Services, Healthcare, AI" are counted as one category each.
fig = px.histogram(
    data_frame=df,
    x="industry",
    title="Industry Distribution",
)
fig.show()


In [9]:
import re

def parse_salary(row):
    """Extract ``(salary_min, salary_max)`` in whole dollars from a salary text.

    Dollar amounts are recognised as ``$`` followed by digits with optional
    thousands separators (``$124,950``). A ``K``/``k`` directly after the
    digits (``$120K``) multiplies the amount by 1,000.

    Parameters
    ----------
    row : str or any scalar
        The raw ``salary_range`` cell. Nulls (``None``, ``NaN``, ``pd.NA``)
        and any text containing ``'None'`` are treated as "no salary given".
        Other non-string values are converted with ``str()`` before matching.

    Returns
    -------
    tuple
        ``(min, max)`` when at least two amounts are present (the first two
        are used, so a trailing "+ $10,000 bonus" does not discard the range),
        ``(min, None)`` when exactly one amount is present,
        ``(None, None)`` when no amount is found.
    """
    if not isinstance(row, str):
        # Only scalars can be null-tested safely; pd.isnull on a list/array
        # returns an array, which cannot be used in a boolean context.
        if row is None or (pd.api.types.is_scalar(row) and pd.isnull(row)):
            return (None, None)
        # re functions require text; numbers and other objects are stringified.
        row = str(row)
    if 'None' in row:
        return (None, None)

    amounts = []
    # Requiring a leading digit prevents a lone "$," from reaching int('').
    # The lookahead keeps "$100,000 Kansas" from being read as a "K" suffix.
    for digits, thousands in re.findall(r'\$(\d[\d,]*)([kK](?![A-Za-z]))?', row):
        value = int(digits.replace(',', ''))
        amounts.append(value * 1000 if thousands else value)
        if len(amounts) == 2:
            break

    if not amounts:
        return (None, None)
    if len(amounts) == 1:
        return (amounts[0], None)
    return (amounts[0], amounts[1])

# Parse every salary text once and build the two bound columns in a single
# frame. Creating a pd.Series per row inside apply() is slow and leaves the
# column dtype to inference; casting to float gives numeric columns with NaN
# wherever no amount was found.
salary_bounds = pd.DataFrame(
    [parse_salary(str(raw_salary)) for raw_salary in df['salary_range']],
    index=df.index,
    columns=['salary_min', 'salary_max'],
)
df[['salary_min', 'salary_max']] = salary_bounds.astype(float)


In [10]:
# One histogram per parsed salary bound, lower bound first.
for salary_column, chart_title in (
    ('salary_min', 'Minimum Salary Distribution'),
    ('salary_max', 'Maximum Salary Distribution'),
):
    fig = px.histogram(df, x=salary_column, nbins=20, title=chart_title)
    fig.show()


In [11]:
# 'applicants_count' is free text such as "Over 100 applicants" or
# "14 applicants"; keep the first number only. "Over N" is stored as N, so
# those values are lower bounds rather than exact counts.
# The pattern accepts thousands separators so "1,234 applicants" is read as
# 1234 instead of 1, and expand=False yields a Series (not a one-column frame).
applicants_digits = df['applicants_count'].str.extract(r'(\d[\d,]*)', expand=False)
df['applicants_clean'] = applicants_digits.str.replace(',', '', regex=False).astype(float)

fig = px.histogram(df, x='applicants_clean', nbins=10, title='Applicants Count Distribution')
fig.show()


In [17]:
import ast

def parse_tech_list(val):
    """Normalise one list-valued cell (technologies, benefits, ...) to a list.

    Parameters
    ----------
    val : list, tuple, set, str, or scalar
        The raw cell. It may already be a list, or a string holding a Python
        literal such as ``"['Python', 'SQL']"``.

    Returns
    -------
    list
        * ``[]`` for nulls (``None``, ``NaN``, ``pd.NA``) and for the texts
          ``''``, ``'[]'``, ``'nan'``, ``'None'`` (surrounding whitespace ignored);
        * the list itself when ``val`` is already a list;
        * the items of a tuple/set, given directly or as a literal string
          (sets are ordered by ``str`` so the result is deterministic);
        * otherwise a one-element list holding ``str`` of the value.
    """
    # Null scalars; is_scalar guards pd.isna, which returns an array for containers.
    if val is None or (pd.api.types.is_scalar(val) and pd.isna(val)):
        return []
    if isinstance(val, list):
        return val
    if isinstance(val, tuple):
        return list(val)
    if isinstance(val, (set, frozenset)):
        return sorted(val, key=str)
    if not isinstance(val, str):
        # literal_eval only parses text; any other object is kept as one item.
        return [str(val)]

    text = val.strip()
    if text in ('', '[]', 'nan', 'None'):
        return []

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. a bare word like "Python"): keep the
        # original text as a single item.
        return [val]

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    if isinstance(parsed, (set, frozenset)):
        return sorted(parsed, key=str)
    return [str(parsed)]


# Turn each posting's technologies into a list, then give every technology its
# own row so it can be tallied individually.
parsed_technologies = df['technologies'].apply(parse_tech_list)
tech_list = parsed_technologies.explode()

# Two-column table: the technology name and how many times it appears.
tech_counts = tech_list.value_counts().rename_axis('Technology').reset_index(name='Count')

fig = px.bar(
    data_frame=tech_counts,
    x='Technology',
    y='Count',
    title='Technology Popularity',
)
fig.show()


In [18]:
# Number of postings for each distinct 'experience_level' value.
fig = px.histogram(
    data_frame=df,
    x="experience_level",
    title="Experience Level Distribution",
)
fig.show()


In [19]:
# Number of postings for each distinct 'company_size' value.
fig = px.histogram(
    data_frame=df,
    x="company_size",
    title="Company Size Distribution",
)
fig.show()


In [20]:
# The list parser written for technologies is reused for the benefits column;
# explode() then gives every individual benefit its own row.
parsed_benefits = df['benefits'].apply(parse_tech_list)
benefits_list = parsed_benefits.explode()

# Two-column table of benefit name and occurrence count, most frequent first.
benefits_counts = benefits_list.value_counts().rename_axis('Benefit').reset_index(name='Count')

# Chart only the 15 most frequent benefits.
top_benefits = benefits_counts.head(15)
fig = px.bar(
    data_frame=top_benefits,
    x='Benefit',
    y='Count',
    title='Most Common Benefits',
)
fig.show()


In [21]:
# Category counts for the department and for the raw 'date_posted' text
# (values such as "5 days ago" are counted as plain strings, not parsed as dates).
for category_column, chart_title in (
    ('department', 'Department Distribution'),
    ('date_posted', 'Date Posted'),
):
    fig = px.histogram(df, x=category_column, title=chart_title)
    fig.show()


In [24]:
# Write the frame, including the columns derived earlier in this notebook, next
# to the source JSON. With to_csv's default index=True the row index is written
# as an unnamed first column.
# NOTE(review): pass index=False if no downstream reader relies on that column.
df.to_csv(path_or_buf='../data/jobs/aug3.csv')