In [2]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from datetime import datetime
import json

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def clean_json(fp, generate_report=False):
    """Load a JSON file of vehicle listings and return a cleaned DataFrame.

    The JSON is flattened with ``pd.json_normalize`` and then:

    * the text before the first ``|`` of the ``other`` field becomes
      ``yr_make_model``, and its first space-separated token becomes ``year``
      (kept as a string);
    * ``post_datetime`` and ``post_expire`` are parsed with ``pd.to_datetime``;
    * the nested ``offers.availableAtOrFrom.*`` column names are shortened;
    * only the whitelisted columns that are present are kept, rows are limited
      to ``offers.addressCountry == 'US'``, exact duplicate rows are dropped,
      and the result is sorted by ``post_datetime`` with a fresh 0..n-1 index.

    Parameters
    ----------
    fp : str or path-like
        Path of the JSON file to read (decoded as UTF-8).
    generate_report : bool, default False
        Accepted for backward compatibility; it currently has no effect.

    Returns
    -------
    pandas.DataFrame
        The cleaned listings. A file that yields no records gives an empty
        frame carrying every whitelisted column.

    Raises
    ------
    KeyError
        If the data has records but lacks ``other``, ``post_expire``,
        ``post_datetime`` or the address-country field.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(fp, 'r', encoding='utf-8') as f:
        raw = pd.json_normalize(json.load(f))

    col_names = {'offers.availableAtOrFrom.address.addressRegion': 'offers.addressRegion',  #state
                 'offers.availableAtOrFrom.address.postalCode': 'offers.postalCode',
                 'offers.availableAtOrFrom.address.addressLocality': 'offers.addressLocality',   #city
                 'offers.availableAtOrFrom.address.addressCountry': 'offers.addressCountry',
                 'offers.availableAtOrFrom.address.streetAddress': 'offers.streetAddress',
                 'offers.availableAtOrFrom.geo.longitude': 'offers.longitude',
                 'offers.availableAtOrFrom.geo.latitude': 'offers.latitude'
                }

    keep_cols = ['post_id', 'post_datetime', 'seller_type',
                 'offers.price', 'offers.priceCurrency', 'year', 'yr_make_model',
                 'VIN', 'fuel', 'type', 'size', 'odometer', 'cylinders', 'drive',
                 'title status', 'transmission', 'paint color',
                 'condition', 'offers.addressRegion', 'offers.addressLocality',
                 'offers.postalCode', 'offers.addressCountry',
                 'offers.streetAddress', 'offers.longitude', 'offers.latitude',
                 'post_expire', 'seller_notes', 'url', 'name', 'description'
                ]

    # Nothing to clean: hand back an empty frame with the expected columns
    # instead of failing on the first column lookup.
    if raw.empty:
        return pd.DataFrame(columns=keep_cols)

    # One chain, no in-place mutation. Later keyword arguments of ``assign``
    # may use columns created by earlier ones, so ``year`` can be derived
    # from ``yr_make_model``.
    return (
        raw
        .assign(
            yr_make_model=lambda d: d['other'].str.split('|').str[0],
            year=lambda d: d['yr_make_model'].str.split(' ').str[0],
            post_expire=lambda d: pd.to_datetime(d['post_expire']),
            post_datetime=lambda d: pd.to_datetime(d['post_datetime']),
        )
        .rename(columns=col_names)
        .filter(items=keep_cols)
        # The callable is evaluated against the filtered frame itself.
        .loc[lambda d: d['offers.addressCountry'] == 'US']
        .drop_duplicates()
        .sort_values(by='post_datetime', ignore_index=True)
    )

In [106]:
# Load and clean the Lansing listings (the source file name is dated 2023_08_01).
fp = './data/2023_08_01_lansing_data.json'
df = clean_json(fp)
# Optional CSV export of the cleaned frame, left disabled:
#df.to_csv('craigslist_sample.csv', index=False)

In [7]:
# Detroit listings (the source file name is dated 2023_08_20).
fp = './data/2023_08_20_detriot_data.json'
df = clean_json(fp)

# Number of listings per fuel type; shown as the cell's output.
df['fuel'].value_counts()

fuel
gas       291
diesel      8
other       5
hybrid      3
Name: count, dtype: int64

In [8]:
# SF Bay listings (the source file name is dated 2023_08_21).
fp = './data/2023_08_21_sfbay_data.json'
sf_df = clean_json(fp)

# Number of listings per fuel type; shown as the cell's output.
sf_df['fuel'].value_counts()

fuel
gas         280
diesel       25
hybrid       19
electric      6
other         2
Name: count, dtype: int64

In [9]:
### EV-only data pull

In [14]:
# Detroit listings from the EV-only pull (the source file name is dated 2023_08_21).
fp = './data/2023_08_21_detriot_data.json'
df = clean_json(fp)

# Save the cleaned sample for later use.
df.to_csv('./data/processed/craigslist_det_ev_sample.csv', index=False)

# Row count goes last so the notebook displays it; a bare expression in the
# middle of a cell is evaluated and then discarded.
len(df)

In [15]:
# SF Bay listings from the EV-only pull (the source file name is dated 2023_08_21).
fp = './data/2023_08_21_sfbay_data.json'
sf_df = clean_json(fp)

# Save the cleaned sample for later use.
sf_df.to_csv('./data/processed/craigslist_sf_ev_sample.csv', index=False)

# Row count goes last so the notebook displays it; a bare expression in the
# middle of a cell is evaluated and then discarded.
len(sf_df)

### generate report

In [120]:
# Profile the Lansing listings. The frame is loaded in this cell rather than
# taken from the notebook-wide `df`, which other cells rebind to different
# cities' data; that way the report content always matches the city in its
# file name, whatever order the cells were run in.
date = datetime.today().strftime('%Y_%m_%d')
city = 'lansing'
report_df = clean_json('./data/2023_08_01_lansing_data.json')

# Only the first 20 columns of the cleaned frame are profiled.
profile = ProfileReport(report_df.iloc[:, :20], title="Profiling Report")
profile.to_file(f'./reports/{date}_{city}_report.html')

Summarize dataset: 100%|█████████████| 56/56 [00:21<00:00,  2.59it/s, Completed]
Generate report structure: 100%|██████████████████| 1/1 [00:04<00:00,  4.18s/it]
Render HTML: 100%|████████████████████████████████| 1/1 [00:02<00:00,  2.03s/it]
Export report to file: 100%|█████████████████████| 1/1 [00:00<00:00, 154.42it/s]


In [122]:
#df.iloc[:,:20]