In [1]:
# encoding:utf-8
import os
import pandas as pd
import numpy as np

CURRENT_PATH = os.getcwd()

import copy


def label_handler(data, label_col):
    """Return a deep copy of ``data`` whose target column is named ``"label"``.

    The column ``label_col`` is removed and its values are stored under the
    name ``"label"``. If no ``"label"`` column existed yet it is appended as
    the last column; an existing ``"label"`` column is overwritten in place.

    :param data: input frame; it is not modified.
    :param label_col: name of the column holding the target values.
    :return: the transformed copy.
    :raises KeyError: if ``label_col`` is not a column of ``data``.
    """
    result = copy.deepcopy(data)
    # Move the column instead of copy-then-drop: when ``label_col`` is already
    # "label", copy-then-drop would delete the label altogether.
    result["label"] = result.pop(label_col)
    return result


def date_handler(data, date_col):
    """Return a deep copy of ``data`` with the ``date_col`` column removed.

    :param data: input frame; it is not modified.
    :param date_col: name of the date column to discard.
    :return: a new frame without ``date_col``.
    """
    snapshot = copy.deepcopy(data)
    return snapshot.drop(columns=[date_col])


def id_handler(data, id_col):
    """Return a deep copy of ``data`` with the identifier column removed.

    :param data: input frame; it is not modified.
    :param id_col: name of the identifier column to discard.
    :return: a new frame without ``id_col``.
    """
    without_id = copy.deepcopy(data).drop(columns=[id_col])
    return without_id


def drop_const_var(data):
    """
    Remove every column that carries no information.

    A column is removed when, ignoring null entries, it holds at most one
    distinct value (this includes columns that are entirely null).

    :param data: input frame; it is not modified.
    :return: a deep copy of ``data`` without the constant columns.
    """
    constant_columns = [
        name for name in data.columns
        if data[name].nunique(dropna=True) <= 1
    ]
    return data.copy(deep=True).drop(columns=constant_columns)


def drop_not_need_var(data, col):
    """Return a deep copy of ``data`` with the unwanted column ``col`` removed.

    :param data: input frame; it is not modified.
    :param col: name of the column to discard.
    :return: a new frame without ``col``.
    """
    trimmed = copy.deepcopy(data)
    trimmed = trimmed.drop(columns=[col])
    return trimmed


def data_handler(data: pd.DataFrame, col_config: dict, nbins=20, need_dis_con=True):
    """Turn a raw frame into a feature matrix and a label vector.

    ``col_config`` maps a column name to one of the following roles:

    * ``"id"``, ``"drop"``, ``"date"`` -- the column is removed;
    * ``"label"`` -- the column is handed to ``label_handler``, after which
      the target is read from the ``'label'`` column;
    * ``"dis"`` -- the column is label-encoded;
    * ``"con"`` -- the column is kept as is, or discretized when
      ``need_dis_con`` is true.

    Any other role is silently ignored.

    :param data: input frame.
    :param col_config: column name -> role, processed in iteration order.
    :param nbins: number of bins forwarded to ``discretize``.
    :param need_dis_con: discretize the continuous columns when true.
    :return: ``(x_final, y)``; ``x_final`` holds the encoded discrete columns
        followed by the continuous ones, ``y`` is the ``'label'`` column.
    """
    # Roles that rewrite the frame, and the helper responsible for each.
    frame_rewriters = {
        "id": id_handler,
        "drop": drop_not_need_var,
        "date": date_handler,
        "label": label_handler,
    }
    cont_vars = []
    dis_vars = []

    for col, col_type in col_config.items():
        if col_type in frame_rewriters:
            data = frame_rewriters[col_type](data, col)
        elif col_type == "con":
            cont_vars.append(col)
        elif col_type == "dis":
            dis_vars.append(col)

    print(f"dis_vars is {dis_vars}")
    print(f"cont_vars is {cont_vars}")
    x_dis = data[dis_vars].to_numpy()
    x_cont = data[cont_vars].to_numpy()

    if not need_dis_con:
        # Continuous columns are passed through untouched.
        x_final = np.concatenate([encode_label_mat(x_dis), x_cont], axis=1)
        return x_final, data['label'].to_numpy()

    # Only the bin codes are needed here; the centroids are discarded.
    x_dis_cont, _ = discretize(x_cont, nbins=nbins)
    y = data['label'].to_numpy()
    x_final = np.concatenate([encode_label_mat(x_dis), x_dis_cont], axis=1)
    return x_final, y


def encode_label(x):
    """Map the values of ``x`` to integer codes.

    Values are compared by their string form (``x.astype(str)``). The distinct
    strings are sorted and each value is replaced by the position of its
    string in that sorted list, so ordering is lexicographic: ``"10"`` sorts
    before ``"2"``.

    :param x: numpy array of any dtype and shape.
    :return: integer array with the same shape as ``x``.
    """
    # ``return_inverse`` yields, for every element, the index of its value in
    # the sorted unique array. Doing the lookup on the very same string array
    # avoids mismatches between two different string conversions (e.g. for
    # float32 input) and also works for empty input.
    _, codes = np.unique(x.astype(str), return_inverse=True)
    # The inverse may come back flattened; restore the input shape.
    return codes.reshape(x.shape)


def encode_label_mat(x):
    """Label-encode every column of a 2-D array independently.

    :param x: 2-D numpy array; each column is passed to ``encode_label``.
    :return: integer array of the same shape holding the per-column codes.
    """
    _, n_columns = x.shape  # also rejects anything that is not 2-D
    encoded = np.empty_like(x, dtype=int)
    # Iterating the transpose walks the columns of ``x`` one by one.
    for position, column in enumerate(x.T):
        encoded[:, position] = encode_label(column)
    return encoded


def impute_nan(x, method='median'):
    """Replace null entries of a 2-D array column by column.

    For every column the fill value is the median of the entries that are
    neither null nor +/-infinity. Only null entries are replaced; infinite
    entries are left as they are. A column without any usable entry keeps
    its nulls.

    :param x: 2-D numpy array; it is not modified.
    :param method: imputation strategy; only ``'median'`` is implemented.
    :return: a new array with the same shape and dtype as ``x``.
    :raises NotImplementedError: if ``method`` is not ``'median'``.
    """
    # Validate once, up front, so an unsupported method is reported even for
    # an array with zero columns.
    if method != 'median':
        raise NotImplementedError()

    _, ncol = x.shape
    result = x.copy()

    for col in range(ncol):
        column = x[:, col]
        missing = pd.isnull(column)
        if not missing.any():
            continue  # nothing to fill in this column

        usable = ~missing & (column != np.inf) & (column != -np.inf)
        impute_value = np.median(column[usable]) if usable.any() else np.nan

        # Mask assignment touches only the null cells, so the fill value is
        # never coerced to a type inferred from the column's first element,
        # and zero-row input is handled without special cases.
        result[missing, col] = impute_value
    return result


def get_uniform_interval(minimum, maximum, nbins):
    """Return ``nbins + 1`` evenly spaced boundaries from ``minimum`` to ``maximum``.

    The two end points are returned exactly as given; the interior points are
    ``minimum + k * step`` for ``k = 1 .. nbins - 1``.

    :param minimum: first boundary.
    :param maximum: last boundary.
    :param nbins: number of intervals between the two end points.
    :return: list of boundaries.
    :raises ZeroDivisionError: if ``nbins`` is 0.
    """
    step = float(maximum - minimum) / nbins
    interior = [minimum + step * k for k in range(1, nbins)]
    return [minimum, *interior, maximum]


def get_interval_v2(x, sorted_intervals):
    """Return the index of the interval that contains ``x``.

    Interval ``i`` is ``[sorted_intervals[i], sorted_intervals[i + 1])``; the
    last boundary opens an interval that extends to +infinity. Special values
    get reserved negative codes:

    * null  -> ``-1``
    * +inf  -> ``-2``
    * -inf  -> ``-3``

    :param x: scalar value to locate.
    :param sorted_intervals: list of boundaries in ascending order; it is
        only read, never modified.
    :return: the interval index, one of the negative codes above, or ``None``
        when ``x`` lies below the first boundary (or there are no boundaries).
    """
    if pd.isnull(x):
        return -1
    if x == np.inf:
        return -2
    if x == -np.inf:
        return -3

    last = len(sorted_intervals) - 1
    for interval, lower in enumerate(sorted_intervals):
        # Treat the final boundary as the start of an open-ended interval
        # instead of appending a sentinel to the caller's list on every call.
        upper = sorted_intervals[interval + 1] if interval < last else np.inf
        if lower <= x < upper:
            return interval
    return None


def get_quantile_interval(data, nbins):
    """Return the quantile boundaries that split ``data`` into ``nbins`` bins.

    Null and +/-infinite entries are excluded before the quantiles are taken.

    :param data: 1-D numpy array of values.
    :param nbins: number of equal-probability bins.
    :return: list of ``nbins + 1`` quantile values (duplicates are possible).
    """
    probabilities = get_uniform_interval(0, 1, nbins)
    usable = (~pd.isnull(data)) & (data != np.inf) & (data != -np.inf)
    boundaries = np.quantile(data[usable], probabilities)
    return list(boundaries)


def discretize(x, nbins=20):
    """Bin every column of a 2-D array by its own quantiles.

    For each column the distinct quantile boundaries are computed, each value
    is assigned an interval index with ``get_interval_v2`` and the indices are
    then passed through ``encode_label``. The final integer codes therefore
    follow ``encode_label``'s ordering of the indices, which is not
    necessarily the numeric bin order.

    :param x: 2-D numpy array of continuous values.
    :param nbins: number of quantile bins requested per column.
    :return: ``(codes, centroids)`` where ``codes`` is an ``int64`` array
        shaped like ``x`` and ``centroids`` holds, per column, the midpoints
        of consecutive distinct boundaries.
    """
    _, ncol = x.shape
    codes = np.empty_like(x)
    centroids_per_column = []
    for col in range(ncol):
        edges = sorted(set(get_quantile_interval(x[:, col], nbins)))

        # Midpoints are taken up front, from the edge list exactly as built here.
        centroids = [0.5 * (low + high) for low, high in zip(edges[:-1], edges[1:])]

        locate = np.vectorize(lambda value: get_interval_v2(value, edges))
        codes[:, col] = encode_label(locate(x[:, col]))
        centroids_per_column.append(centroids)
    return codes.astype(np.int64), centroids_per_column

In [2]:
# Load the raw training table; the path is relative to the current working directory.
df = pd.read_csv("training.csv")
# Show (rows, columns) of the frame that was just loaded.
print(df.shape)

(72983, 34)


In [3]:
# Keep only the first 70,000 rows (by position) for the rest of the notebook.
df = df.head(70000)
print(df.shape)

(70000, 34)


In [15]:

# Role of every column, as understood by data_handler:
#   'id' / 'drop' / 'date' -> removed, 'label' -> target,
#   'dis' -> label-encoded, 'con' -> continuous (discretized by default).
# Keys are used verbatim as column names, so their spelling must not be "corrected".
data_type_config = {'RefId': 'id',
                    'IsBadBuy': 'label',
                    'PurchDate': 'date',
                    'Auction': 'dis',
                    'VehYear': 'con',
                    'VehicleAge': 'con',
                    'Make': 'dis',
                    'Model': 'drop',
                    'Trim': 'drop',
                    'SubModel': 'drop',
                    'Color': 'dis',
                    'Transmission': 'dis',
                    'WheelTypeID': 'id',
                    'WheelType': 'dis',
                    'VehOdo': 'con',
                    'Nationality': 'dis',
                    'Size': 'dis',
                    'TopThreeAmericanName': 'dis',
                    'MMRAcquisitionAuctionAveragePrice': 'con',
                    'MMRAcquisitionAuctionCleanPrice': 'con',
                    'MMRAcquisitionRetailAveragePrice': 'con',
                    'MMRAcquisitonRetailCleanPrice': 'con',
                    'MMRCurrentAuctionAveragePrice': 'con',
                    'MMRCurrentAuctionCleanPrice': 'con',
                    'MMRCurrentRetailAveragePrice': 'con',
                    'MMRCurrentRetailCleanPrice': 'con',
                    'PRIMEUNIT': 'dis',
                    'AUCGUART': 'dis',
                    'BYRNO': 'id',
                    'VNZIP1': 'drop',
                    'VNST': 'dis',
                    'VehBCost': 'con',
                    'IsOnlineSale': 'dis',
                    'WarrantyCost': 'con'}

# Drop constant columns, then shuffle the rows. The fixed random_state makes
# the shuffle -- and every result derived from it -- repeatable after a
# kernel restart.
df = drop_const_var(df).sample(frac=1, random_state=128).reset_index(drop=True)
X, y = data_handler(df, data_type_config, nbins=50)

dis_vars is ['Auction', 'Make', 'Color', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'PRIMEUNIT', 'AUCGUART', 'VNST', 'IsOnlineSale']
cont_vars is ['VehYear', 'VehicleAge', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'VehBCost', 'WarrantyCost']


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 2/7 of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=2 / 7,
    random_state=128,
)

In [21]:
# Share of positive labels in the training split.
y_train.mean()

0.12232

In [22]:
# Share of positive labels in the test split.
y_test.mean()

0.1199

In [24]:
# Column names for the feature matrix, in the order data_handler lays it out:
# the discrete variables first, then the continuous ones.
feature_columns = [
    # discrete
    'Auction', 'Make', 'Color', 'Transmission', 'WheelType', 'Nationality',
    'Size', 'TopThreeAmericanName', 'PRIMEUNIT', 'AUCGUART', 'VNST',
    'IsOnlineSale',
    # continuous
    'VehYear', 'VehicleAge', 'VehOdo',
    'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
    'VehBCost', 'WarrantyCost',
]

train_data = pd.DataFrame(X_train, columns=feature_columns)
train_data["label"] = y_train
print(train_data.shape)

test_data = pd.DataFrame(X_test, columns=feature_columns)
test_data["label"] = y_test
print(test_data.shape)

# Persist both splits without the positional index.
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

(50000, 26)
(20000, 26)
