In [220]:
import pandas as pd
import json
from datetime import datetime
from thefuzz import fuzz, process

# Get the data

In [221]:
with open('./data/ebay_ev_sales_data.json') as f:
    data = json.load(f)

# Check keys collected for the columns

In [222]:
# repeatedly check here
data_keys = set()
for d in data:
    data_keys.update(d.keys())
data_keys

{'',
 'Lug Nuts Seat Type',
 'Stil',
 'Year of Manufacture',
 'Bolt Circle Diameter',
 'Activation Type',
 'Minimum Pressure',
 'Reference OE//OEM Number',
 'Positionnement sur le véhicule',
 'Important',
 'Number of Bulbs',
 'Extended Length',
 'Publication Month',
 'Depth (in)',
 'Country/Region of Manufacture:',
 'Benennung',
 'Shipping Service',
 'Located in',
 'Fit 4',
 'LED Quantity(each bulb)',
 'Required Tool',
 'Spec',
 'Specifics',
 'Gauge',
 'Flange\xa0Bolt\xa0Hole\xa0Quantity',
 'Type:',
 'Power Options',
 'Model/Series',
 'Emissions',
 'Stückzahl',
 'Unit Count',
 'Original/Reproduction',
 'Fits7:',
 'Warranty Length',
 'erminals Type',
 'Quantity Unit',
 'Battery / Circuit Board / Transponder chip',
 'Wire Rated Power',
 'Brake Caliper Bracket Included',
 'Impedance',
 'Size',
 'Country Of Manufacture',
 'Resistance',
 'Lifetime',
 'Unit Size',
 'Original/Licensed Reproduction',
 '\u200eIP Rating',
 'Diagonal Dimension',
 'Rotor Outside Diameter',
 'Features 3',
 'Refund 

In [223]:
# better names to info names
columns_map = {
    "Condition" : "Condition",
    "Price" : "Price",
    "Winning bid" : "Winning bid",
    "Location" : "Located in",
    "Ended" : "Ended",
    "VIN" : 'VIN (Vehicle Identification Number)',
    "Make" : "Make",
    "Model" : "Model",
    "Year" : "Year",
    "Ebay Item ID": "ebay_item_id",
}
columns = list(columns_map.values())

In [224]:
for desired_col in columns_map.keys():
    print(f'\n{desired_col}')
    poss = process.extract(desired_col, data_keys, limit=20, scorer=fuzz.partial_token_sort_ratio)
    print(list(map(lambda x: x[0], poss)))


Condition
['Vehicle Condition', 'Product Condition', 'Conditions and Options', 'Condition and Options', 'Condition', 'Item Condition', 'DescriptionPartCondition', 'Condition:', 'Conditions & Options', 'Condition Description', 'Item condition', 'Position', 'Unit', 'GTIN', 'Activation Type', 'Positionnement sur le véhicule', 'Original/Reproduction', 'Original/Licensed Reproduction', 'Ignition Type', 'Life Cycle Status Description']

Price
['Price', 'IC', 'Shipping Service', 'pieces to be send:', 'Total Number of Pieces', 'Restricted States', 'Kit or single piece', 'Number of Kit Pieces', 'Number of Piece', 'Number of Pieces', 'Numéro de pièce fabricant', 'Number of pieces', 'Number Of Pieces', 'Number Of Pieces:', 'Numéro de pièce OE/OEM', 'Série', 'Pin', 'Serie', 'Electric', 'Topic']

Winning bid

Location
['Blade Location', 'LOCATION', 'Sump Location', 'Mounting Location', 'Location on vehicle', 'Item location:', 'location', 'LOC', 'Installation location', 'Loc', 'Inlet Location', 'Lo

In [225]:
# consolidated name to list of possible names
# USE APPEND NOT REPLACE WHEN APPENDING TO THIS COLUMN!
column_mapper = {

    'condition': ['Vehicle Condition', 'Product Condition', 'Conditions and Options', 'Condition and Options', 'Condition', 'Item Condition', 'Condition:', 'Conditions & Options', 'Condition Description', 'Item condition'],
    'price': ['Price'],
    'winning_bid': ['Winning bid'],
    'location': ['LOCATION', 'Item location:', 'location', 'Location'],
    'ended': ['Ended'],
    'vin': ['VIN (Vehicle Identification Number)', 'Vin', 'VIN #', 'VIN'],
    'make': ['Make & Model:', 'MAKE', 'Car make', 'Displayed Make', 'Make & Model', 'Make', 'Vehicle Make'],
    'model': ['Model/Series', 'Make & Model:', 'Model -2', 'Sub Model', 'Model 02', 'Model 8', 'Modell', 'Model Year Applications', 'Model-5', 'MODEL-YEAR', 'Model Name', 'Models3', 'OTHER MODELS 15', 'Model Number', 'Models2'],
    'year': ['year','YEAR'],
    "ebay_item_id": ["ebay_item_id"]

}

# Get the data based on the columns
- only use rows  with vin (i.e. it is a car)

In [226]:
new_data = []
for ebay_data in data:
    for vin_ebay_col in column_mapper['vin']:
        if vin_ebay_col in ebay_data.keys():
            new_data.append(ebay_data)
            break

In [227]:
len(new_data)

6094

In [228]:
with open('./data/ebay_ev_sales_data_cars.json','w') as f:
    json.dump(new_data, f, indent=4, separators=(',', ': '))

In [229]:
data = new_data

In [230]:
data_arr = []
for ebay_data in data:
    new_dict = {}
    for col, ebay_cols in column_mapper.items():
        for ebay_col in ebay_cols:
            if ebay_col in ebay_data:
                if col in new_dict.keys():
                    new_dict[col] += f'{"|"}{ebay_data[ebay_col]}'
                else:
                    new_dict[col] = ebay_data[ebay_col]
    data_arr.append(new_dict)

In [231]:
df = pd.DataFrame(data_arr)

In [232]:
df

Unnamed: 0,condition,price,ended,vin,make,model,ebay_item_id,winning_bid
0,Used,"|US $48,900.00|Best offer accepted| |This item...","|May 16, 2023|14:03:36 PDT|",5YJXCAE25JF089783,Tesla,Model X 100D 295 FULL CHARGE,256048040264,
1,Used: A vehicle is considered used if it has b...,"|US $42,000.00|","|May 12, 2023|19:33:22 PDT|",5YJXCAE49GF001909,Tesla,,155552048142,
2,Used,"|US $46,900.00|","|May 09, 2023|11:21:20 PDT|",5YJXCAE21HF078337,Tesla,,225550054810,
3,Used: A vehicle is considered used if it has b...,"|US $50,500.00|Best offer accepted| |This item...","|May 09, 2023|09:06:01 PDT|",5YJXCAE20KF155657,Tesla,,256066075946,
4,Used: A vehicle is considered used if it has b...,,"|May 08, 2023|16:23:57 PDT|",5YJXCBE22LF271739,Tesla,,295670955949,"US $63,188.00|"
...,...,...,...,...,...,...,...,...
6089,Used: A vehicle is considered used if it has b...,,,1GCDC14H7EF323237,Chevrolet,,195767015068,
6090,Used: A vehicle is considered used if it has b...,,,00000000000000000,Chevrolet,Pickup,185785486080,
6091,Used: A vehicle is considered used if it has b...,,,CKR147F469782,Chevrolet,,404254562358,
6092,Used: A vehicle is considered used if it has b...,,,CKR147F469782,Chevrolet,,404236233432,


# Some light cleaning
- drop duplicates and items that weren't cars
- just write "used"

In [233]:
#df[~pd.isnull(df.location)]

In [234]:
df.drop_duplicates(subset=['vin','ebay_item_id'],keep='first',inplace=True)
df.shape

(3858, 8)

In [235]:
remove_start = lambda s: s[1:] if pd.notnull(s) and len(s)>1 and s[0]=='|' else s
remove_end = lambda s: s[:-2] if pd.notnull(s) and len(s)>1 and s[-1]=='|' else s

df = df.apply(lambda x: x.apply(remove_start))
df = df.apply(lambda x: x.apply(remove_end))
df.head()

Unnamed: 0,condition,price,ended,vin,make,model,ebay_item_id,winning_bid
0,Used,"US $48,900.00|Best offer accepted| |This item ...","May 16, 2023|14:03:36 PD",5YJXCAE25JF089783,Tesla,Model X 100D 295 FULL CHARGE,256048040264,
1,Used: A vehicle is considered used if it has b...,"US $42,000.0","May 12, 2023|19:33:22 PD",5YJXCAE49GF001909,Tesla,,155552048142,
2,Used,"US $46,900.0","May 09, 2023|11:21:20 PD",5YJXCAE21HF078337,Tesla,,225550054810,
3,Used: A vehicle is considered used if it has b...,"US $50,500.00|Best offer accepted| |This item ...","May 09, 2023|09:06:01 PD",5YJXCAE20KF155657,Tesla,,256066075946,
4,Used: A vehicle is considered used if it has b...,,"May 08, 2023|16:23:57 PD",5YJXCBE22LF271739,Tesla,,295670955949,"US $63,188.0"


In [236]:
df['condition'] = df['condition'].apply(lambda x: 'used' if pd.notnull(x) and 'used' in x.lower() else x)
df.head()

Unnamed: 0,condition,price,ended,vin,make,model,ebay_item_id,winning_bid
0,used,"US $48,900.00|Best offer accepted| |This item ...","May 16, 2023|14:03:36 PD",5YJXCAE25JF089783,Tesla,Model X 100D 295 FULL CHARGE,256048040264,
1,used,"US $42,000.0","May 12, 2023|19:33:22 PD",5YJXCAE49GF001909,Tesla,,155552048142,
2,used,"US $46,900.0","May 09, 2023|11:21:20 PD",5YJXCAE21HF078337,Tesla,,225550054810,
3,used,"US $50,500.00|Best offer accepted| |This item ...","May 09, 2023|09:06:01 PD",5YJXCAE20KF155657,Tesla,,256066075946,
4,used,,"May 08, 2023|16:23:57 PD",5YJXCBE22LF271739,Tesla,,295670955949,"US $63,188.0"


# Write the data

In [237]:
df.to_csv('./data/ebay_ev_sales_data_cleaned.csv', index=False)