# Preprocess data

* load the JSON files and extract the necessary data
* format the data
* add the necessary columns (version, is_pro, capacity, color)
* save the formatted data

In [1]:
import json
from pathlib import Path
import datetime as dt
import pandas as pd
from functools import reduce


In [2]:
def parse_file_name(file_name: str|Path):
    file_name = file_name.stem if isinstance(file_name, Path) else Path(file_name).stem
    name_parts = file_name.split("_")
    if len(name_parts) == 4:
        version, pro, capacity, color = name_parts
        is_pro = "pro" == pro
    elif len(name_parts) == 3:
        version, capacity, color = name_parts
        is_pro=False
    else:
        raise RuntimeError("Wrong file name was given")
    return version, is_pro, capacity, color
    



In [3]:
base = "datasets/iphone_prices"
dfs = []

# Empty placeholder; it is not read anywhere in this cell.
df_merged = pd.DataFrame()

# Recursively visit every JSON file under `base`. The attributes of each
# file (version, is_pro, capacity, color) are taken from its file name.
for file in Path(base).rglob("*.json"):
    print(file.stem)

    with file.open("r", encoding="utf-8") as fp:
        data = json.load(fp)

    # Every entry of data["response"] is read as three values; the third one
    # is not used, so it gets a placeholder name and is dropped right away.
    df = pd.DataFrame(data["response"], columns=["timestamp", "price", "foo"])
    df = df.drop(columns=["foo"])

    version, is_pro, capacity, color = parse_file_name(file_name=file)
    df["version"] = version
    df["is_pro"] = is_pro
    df["capacity"] = capacity
    df["color"] = color

    # The raw value is divided by 1000 before conversion (presumably epoch
    # milliseconds -- confirm against the data source).
    # NOTE(review): fromtimestamp() without a tz argument returns naive local
    # time, so the result depends on the time zone of the machine running this.
    df["timestamp"] = df["timestamp"].apply(
        lambda x: dt.datetime.fromtimestamp(x / 1000)
    )

    # set_index already returns a new frame, so no explicit copy is needed.
    df = df.set_index("timestamp").sort_values(by="timestamp")

    dfs.append(df)

iphone16_pro_256_sand
iphone16_pro_256_white
iphone16_pro_256_nature
iphone13_pro_256_green
iphone16_pro_256_black
iphone15_pro_256_blue
iphone15_pro_256_white
iphone15_pro_256_nature
iphone15_pro_256_black
iphone14_pro_256_silver
iphone14_pro_256_black
iphone14_pro_256_purple
iphone15_256_green
iphone15_256_blue
iphone15_256_yellow
iphone15_256_black
iphone15_128_yellow
iphone15_128_blue
iphone15_128_black
iphone15_128_green
iphone14_256_yellow
iphone14_256_polarstar
iphone14_256_midnightblack
iphone14_128_blue
iphone14_128_violet
iphone14_128_polarstar
iphone14_256_blue
iphone14_256_violet
iphone14_128_midnightblack
iphone14_128_yellow


In [4]:
# Stack all per-file frames into one and order the rows chronologically by
# the "timestamp" index.
df_merged = pd.concat(dfs).sort_values(by="timestamp")

# Write the result (index included) to a CSV file in the working directory.
df_merged.to_csv("iphones_formatted.csv")