In [1]:
import polars as pl
from pathlib import Path
import json
from datetime import datetime as dt

## Jobs

In [2]:
# Read the raw jobs records. The `with` block closes the file handle as soon
# as the JSON has been parsed, so it is not leaked for the life of the kernel.
with open(Path("data/jobs.json")) as f:
    jobs_json = json.load(f)

# Build the raw (uncleaned) jobs frame straight from the parsed JSON.
jobs_df = pl.DataFrame(jobs_json)

In [3]:
# Preview the first rows of the raw jobs frame.
jobs_df.head()

posted_at,id,state,zip,price,company_id
str,i64,str,i64,f64,i64
"""04/13/2022 11:…",1,"""expired""",10506,119.34,4
"""01/20/2021 04:…",2,"""cancelled""",10506,197.89,5
"""07/21/2021 15:…",3,"""expired""",35786,335.85,8
"""06/13/2021 20:…",4,"""expired""",78956,150.0,6
"""06/23/2022 06:…",5,"""expired""",78956,158.09,8


In [4]:
# Per-column summary statistics (count, null_count, mean, quantiles, min/max)
# for the raw frame.
jobs_df.describe()

statistic,posted_at,id,state,zip,price,company_id
str,str,f64,str,f64,f64,f64
"""count""","""102""",102.0,"""102""",102.0,100.0,102.0
"""null_count""","""0""",0.0,"""0""",0.0,2.0,0.0
"""mean""",,50.490196,,45244.637255,324.0005,4.254902
"""std""",,29.474522,,28860.805232,584.589206,2.54343
"""min""","""01/05/2022 15:…",1.0,"""canceled""",10506.0,1.77,0.0
"""25%""",,25.0,,25089.0,131.99,2.0
"""50%""",,51.0,,35786.0,305.84,4.0
"""75%""",,76.0,,78956.0,417.3,6.0
"""max""","""12/30/2022 16:…",100.0,"""posted""",80976.0,5898.0,8.0


In [5]:
# Column names and dtypes of the raw frame; `posted_at` is still a String here.
jobs_df.schema

OrderedDict([('posted_at', String),
             ('id', Int64),
             ('state', String),
             ('zip', Int64),
             ('price', Float64),
             ('company_id', Int64)])

In [6]:
# Parse the "MM/DD/YYYY HH:MM" posting timestamp and keep only its date part.
posted_date_expr = (
    pl.col("posted_at")
    .str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M")
    .cast(pl.Date)
)

jobs = (
    jobs_df
    .with_columns(
        posted_date_expr.alias("date"),
        pl.col("price").round(2).alias("price"),
    )
    # Month and year are derived in a second step because they read the
    # `date` column created just above.
    .with_columns(
        pl.col("date").dt.month().alias("month"),
        pl.col("date").dt.year().alias("year"),
    )
    # The original string timestamp is no longer needed once `date` exists.
    .drop("posted_at")
)

### Drop the two rows with a null price (both have state "posted", company IDs 8 and 3)

In [7]:
# drop_nulls() with no arguments removes every row that has a null in any
# column. In the describe() output above only `price` reports nulls (2 rows).
jobs = jobs.drop_nulls()

### Check for duplicate IDs in the data

In [8]:
# Count rows per id, then show only the ids that occur more than once.
id_counts = jobs.group_by("id").agg(pl.col("id").count().alias("records"))
id_counts.filter(pl.col("records") > 1)

id,records
i64,u32
97,2
3,2


In [9]:
# Remove rows that are duplicated across every column.
# maintain_order=True keeps the surviving rows in their original order; with
# the default (False) polars may return them in any order, which would make
# the exported parquet differ from run to run.
# Note: this only removes exact duplicates. Two rows sharing an id but
# differing in any other column would both be kept.
jobs = jobs.unique(maintain_order=True)

### Fix state spelling errors and rename `zip` to `zip_code`

In [10]:
# Tally rows per distinct `state` value so misspelled labels stand out.
jobs.group_by("state").agg(pl.col("id").count())

state,id
str,u32
"""posted""",23
"""cancelled""",40
"""expired""",33
"""canceled""",1
"""osted""",1


In [11]:
# Map the two misspelled labels onto their canonical spelling; every other
# value passes through unchanged.
state_fixed = (
    pl.when(pl.col("state") == "osted")
    .then(pl.lit("posted"))
    .when(pl.col("state") == "canceled")
    .then(pl.lit("cancelled"))
    .otherwise(pl.col("state"))
)

# `zip_code` is appended as a new trailing column holding the `zip` values,
# and the old `zip` column is dropped afterwards.
jobs = (
    jobs
    .with_columns(state=state_fixed, zip_code=pl.col("zip"))
    .drop("zip")
)

In [12]:
# Summary statistics of the cleaned jobs frame, to confirm the cleaning steps.
jobs.describe()

statistic,id,state,price,company_id,date,month,year,zip_code
str,f64,str,f64,f64,str,f64,f64,f64
"""count""",98.0,"""98""",98.0,98.0,"""98""",98.0,98.0,98.0
"""null_count""",0.0,"""0""",0.0,0.0,"""0""",0.0,0.0,0.0
"""mean""",50.183673,,327.065,4.22449,"""2021-12-17""",6.091837,2021.5,44838.22449
"""std""",28.903446,,589.724716,2.514005,,3.327755,0.502571,28933.351312
"""min""",1.0,"""cancelled""",1.77,0.0,"""2021-01-06""",1.0,2021.0,10506.0
"""25%""",25.0,,131.99,2.0,"""2021-06-13""",3.0,2021.0,25089.0
"""50%""",51.0,,305.84,4.0,"""2022-01-05""",6.0,2022.0,35786.0
"""75%""",75.0,,423.05,6.0,"""2022-06-17""",9.0,2022.0,78956.0
"""max""",100.0,"""posted""",5898.0,8.0,"""2022-12-30""",12.0,2022.0,80976.0


In [24]:
# Final column names and dtypes of the cleaned jobs frame before export.
jobs.schema

OrderedDict([('id', Int64),
             ('state', String),
             ('price', Float64),
             ('company_id', Int64),
             ('date', Date),
             ('month', Int8),
             ('year', Int32),
             ('zip_code', Int64)])

In [13]:
# Persist the cleaned jobs frame. The output directory is created first so the
# cell also works on a fresh checkout where data/cleaned/ does not exist yet.
jobs_parquet_path = Path("data/cleaned/jobs.parquet")
jobs_parquet_path.parent.mkdir(parents=True, exist_ok=True)
jobs.write_parquet(jobs_parquet_path)

## Companies

In [14]:
# Load the companies export. The `with` block closes the file handle once the
# JSON has been parsed instead of leaving it open for the life of the kernel.
with open(Path("data/companies.json")) as f:
    companies_dict = json.load(f)

# The parsed JSON is indexed as companies_dict[column][str(row_index)].
# Pivot it into one dict per row so polars can build the frame from records.
# The row count is taken from the "Company ID" column.
companies_json = [
    {col: companies_dict[col][str(i)] for col in companies_dict}
    for i in range(len(companies_dict["Company ID"]))
]
companies_df = pl.DataFrame(companies_json)

In [15]:
# Preview the first 10 rows of the raw companies frame.
companies_df.head(10)

Company ID,Company Name,Establishment Date,Number of Employees
i64,str,i64,i64
0,"""Acme Inc.""",1577750400000,404
1,"""Best Corp.""",1616284800000,229
2,"""Bright Future …",1605312000000,222
3,"""Delta Inc.""",1604620800000,662
4,"""Echo Enterpris…",1651708800000,255
5,"""Fast Track Inc…",1583625600000,374
6,"""Global Enterpr…",1643846400000,896
7,"""High Hopes Inc…",1584403200000,812
8,"""Infinite Solut…",1615507200000,436
9,"""Jumpstart Corp…",1588896000000,872


In [16]:
# Per-column summary statistics for the raw companies frame.
companies_df.describe()

statistic,Company ID,Company Name,Establishment Date,Number of Employees
str,f64,str,f64,f64
"""count""",10.0,"""10""",10.0,10.0
"""null_count""",0.0,"""0""",0.0,0.0
"""mean""",4.5,,1607200000000.0,516.2
"""std""",3.02765,,25306000000.0,270.000329
"""min""",0.0,"""Acme Inc.""",1577800000000.0,222.0
"""25%""",2.0,,1584400000000.0,255.0
"""50%""",5.0,,1605300000000.0,436.0
"""75%""",7.0,,1616300000000.0,812.0
"""max""",9.0,"""Jumpstart Corp…",1651700000000.0,896.0


### Convert UNIX timestamp (milliseconds) to a date

In [17]:
companies = (
    companies_df
        .with_columns(
            # "Establishment Date" holds epoch milliseconds. Convert it with the
            # native polars expression rather than a per-row Python lambda
            # around datetime.fromtimestamp(): fromtimestamp() converts in the
            # machine's local timezone, so a midnight-UTC value turns into the
            # previous calendar day on any machine west of UTC. from_epoch()
            # yields a naive datetime counted from the epoch, so the resulting
            # date is the same on every machine, and the work stays vectorized.
            pl.from_epoch(pl.col("Establishment Date"), time_unit="ms")
                .cast(pl.Date)
                .alias("Establishment Date")
        )
)

In [18]:
# Normalise column names to snake_case: lower-case them and turn spaces into
# underscores (e.g. "Company ID" -> "company_id").
snake_case_names = {
    name: name.lower().replace(" ", "_")
    for name in companies.columns
}
companies = companies.rename(snake_case_names)

In [25]:
# Final column names and dtypes of the cleaned companies frame before export.
companies.schema

OrderedDict([('company_id', Int64),
             ('company_name', String),
             ('establishment_date', Date),
             ('number_of_employees', Int64)])

In [19]:
# Persist the cleaned companies frame. The output directory is created first so
# the cell also works on a fresh checkout where data/cleaned/ does not exist yet.
companies_parquet_path = Path("data/cleaned/companies.parquet")
companies_parquet_path.parent.mkdir(parents=True, exist_ok=True)
companies.write_parquet(companies_parquet_path)

## Locations

In [20]:
# Load the zip-code -> location lookup table from CSV.
locations_csv = Path("data/locations.csv")
locations = pl.read_csv(locations_csv)

In [21]:
# Preview the first rows of the locations lookup table.
locations.head()

zip_code,location
i64,str
10506,"""King's Landing…"
80976,"""Dorne"""
78956,"""Braavos"""
67305,"""Harrenhal"""
25089,"""Winterfell"""


In [26]:
# Column names and dtypes of the locations table; `zip_code` is Int64, the same
# dtype as `zip_code` in the cleaned jobs frame.
locations.schema

OrderedDict([('zip_code', Int64), ('location', String)])

In [22]:
# Persist the locations table. The output directory is created first so the
# cell also works on a fresh checkout where data/cleaned/ does not exist yet.
locations_parquet_path = Path("data/cleaned/locations.parquet")
locations_parquet_path.parent.mkdir(parents=True, exist_ok=True)
locations.write_parquet(locations_parquet_path)