In [1]:
!nvidia-smi

Wed Jul 26 22:36:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# ===================================================================
#  Library
# ===================================================================
import os
import math
import random
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_percentage_error
import warnings
warnings.simplefilter("ignore")

from tqdm.auto import tqdm

import unicodedata

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [4]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    seed = 42
    n_splits = 10
    data_dir = "/content/drive/MyDrive/signate_StudentCup2023/data/"
    target_bins = 10
    year_bins = 10
    save_dir = "/content/drive/MyDrive/signate_StudentCup2023/exp/"
    common_features_1 = ['year_map', "fuel", "condition", "manufacturer"]
    common_features_2 = ['year_map', "fuel", "condition"]
    common_features_3 = ['year_map', "fuel"]
    save_model = False

In [5]:
# ===================================================================
#  Utils
# ===================================================================
def seed_everything(seed):
    """fix random factors"""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)


def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [6]:
# ===================================================================
#  Data Loading
# ===================================================================
train = pd.read_csv(CFG.data_dir+"train.csv")
test = pd.read_csv(CFG.data_dir+"test.csv")

train["flag"] = "train"
test["flag"] = "test"
all_data = pd.concat([train, test], ignore_index=True)

In [7]:
# ===================================================================
#  preprocessing
# ===================================================================
# year
def pre_year(df: pd.DataFrame):
    year_dict = {
        2999:1999,
        3008:2008,
        3015:2015,
        3017:2017,
        3011:2011,
        3019:2019,
    }
    df["year"] = df["year"].replace(year_dict)
    return df
all_data = pre_year(all_data)


# manufacturer
all_data["manufacturer"] = all_data["manufacturer"].str.lower().apply(lambda x: unicodedata.normalize('NFKC', x))


# fuel
all_data["fuel"].fillna(value=-1, inplace=True)


# size
def pre_size(df: pd.DataFrame):
    size_dict = {
        "fullーsize":"full-size",
        "midーsize":"mid-size",
        "subーcompact":"sub-compact",
        "full−size":"full-size",
        "mid−size":"mid-size"
    }
    df["size"] = df["size"].replace(size_dict)
    return df
all_data = pre_size(all_data)


# title_status
all_data["title_status"].fillna(value=-1, inplace=True)


# type
all_data["type"].fillna(value=-1, inplace=True)


# state
all_data["state"].fillna(value=-1, inplace=True)

In [8]:
# ===================================================================
#  use features
# ===================================================================

## numerical
CFG.numerical_features = ['year','odometer',]


## category
#CFG.categorical_features = ['cylinders', 'title_status', 'transmission', 'drive', "size", 'type','paint_color', 'state', 'manufacturer', 'condition',  'fuel',]
CFG.categorical_features = []

CFG.target_encoding_features = ['region', 'condition', 'cylinders', 'title_status', 'transmission', 'drive',
                                "size", 'type','paint_color', 'state', 'manufacturer', 'condition',  'fuel',]

CFG.count_encoding_features = []


## use_features
CFG.use_features = list(set(CFG.numerical_features) | set(CFG.categorical_features))

In [9]:
# ===================================================================
#  train test split
# ===================================================================
train = all_data[all_data["flag"] == "train"].reset_index(drop=True)
test = all_data[all_data["flag"] == "test"].reset_index(drop=True)

train.sort_values(by="id", ignore_index=True, inplace=True)

train["year_map"], bins = pd.cut(train["year"], bins=CFG.year_bins, labels=False, retbins=True)
test["year_map"] = pd.cut(test["year"], bins=bins, labels=False)
train["price_map"] = pd.cut(train["price"], bins=CFG.target_bins, labels=False)


skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
for i, (_, val) in enumerate(skf.split(X=train, y=train["price_map"])):
    train.loc[val, "fold"] = i
print(train["fold"].value_counts())
print(train.groupby("fold")["price"].mean().std())

1.0    2754
0.0    2754
2.0    2753
4.0    2753
6.0    2753
3.0    2753
7.0    2753
8.0    2753
5.0    2753
9.0    2753
Name: fold, dtype: int64
34.38316956667607


In [10]:
# ===================================================================
#  Dataset
# ===================================================================
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.X = df[self.cfg.use_features]
        self.Y = df["price"]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        x = torch.tensor(self.X.iloc[index].values, dtype=torch.float)
        y = torch.tensor(self.Y.iloc[index], dtype=torch.float)
        return x, y

In [11]:
# ===================================================================
#  model
# ===================================================================
class CustomModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.fc =  nn.Sequential(
                          nn.Linear(len(self.cfg.use_features), self.cfg.hidden_size),
                          nn.BatchNorm1d(self.cfg.hidden_size),
                          nn.Dropout(self.cfg.dropout),
                          nn.PReLU(),
                          nn.Linear(self.cfg.hidden_size, self.cfg.hidden_size),
                          nn.BatchNorm1d(self.cfg.hidden_size),
                          nn.Dropout(self.cfg.dropout),
                          nn.PReLU(),
                          nn.Linear(self.cfg.hidden_size, 1),
                          )

    def forward(self, input):
        output = self.fc(input)
        return output

In [12]:
CFG.hidden_size = 512
CFG.dropout = 0

In [13]:
# ===================================================================
#
# ===================================================================
fold = 0
X_train = train[train["fold"] != fold].reset_index(drop=True)
X_valid = train[train["fold"] == fold].reset_index(drop=True)
test_df = test.copy()

# fix odometer
def fix_odometer(df, df_original, keys_1, keys_2, keys_3, grouped_means_1, grouped_means_2, grouped_means_3,):
    #print(df.shape)
    for id_ in df["id"]:
        tmp_1 = tuple(df[df["id"] == id_][CFG.common_features_1].values.tolist()[0])
        tmp_2 = tuple(df[df["id"] == id_][CFG.common_features_2].values.tolist()[0])
        tmp_3 = tuple(df[df["id"] == id_][CFG.common_features_3].values.tolist()[0])

        if tmp_1 in keys_1:
            df_original.loc[df_original["id"] == id_, "odometer"] = grouped_means_1[tmp_1]
        elif tmp_2 in keys_2:
            df_original.loc[df_original["id"] == id_, "odometer"] = grouped_means_2[tmp_2]
        elif tmp_3 in keys_3:
            df_original.loc[df_original["id"] == id_, "odometer"] = grouped_means_3[tmp_3]
        else:
            raise ValueError("")

        return df_original

## 補完するデータたち
right_df = X_train[(X_train["odometer"] > 100)&(X_train["odometer"] < 400000)].reset_index(drop=True)
grouped_means_1 = right_df.groupby(CFG.common_features_1)["odometer"].mean().to_dict()
grouped_means_2 = right_df.groupby(CFG.common_features_2)["odometer"].mean().to_dict()
grouped_means_3 = right_df.groupby(CFG.common_features_3)["odometer"].mean().to_dict()


## 共通点の多いデータで補完する
keys_1 = tuple([k for k, _ in grouped_means_1.items()])
keys_2 = tuple([k for k, _ in grouped_means_2.items()])
keys_3 = tuple([k for k, _ in grouped_means_3.items()])

X_train = fix_odometer(X_train[(X_train["odometer"] < 100)|(X_train["odometer"] > 400000)].reset_index(drop=True), X_train,
                        keys_1, keys_2, keys_3, grouped_means_1, grouped_means_2, grouped_means_3)


X_valid = fix_odometer(X_valid[(X_valid["odometer"] < 100)|(X_valid["odometer"] > 400000)].reset_index(drop=True), X_valid,
                        keys_1, keys_2, keys_3, grouped_means_1, grouped_means_2, grouped_means_3)


test_df = fix_odometer(test_df[(test_df["odometer"] < 100)|(test_df["odometer"] > 400000)].reset_index(drop=True), test_df,
                        keys_1, keys_2, keys_3, grouped_means_1, grouped_means_2, grouped_means_3)

# feature_engineering
#def add_odometer_per_year(df: pd.DataFrame):
#    df["lapsed_years"] = 2023 - df["year"]
#    df["odometer_per_year"] = df["odometer"] / df["lapsed_years"]
#    return df
#X_train = add_odometer_per_year(X_train)
#X_valid = add_odometer_per_year(X_valid)
#test = add_odometer_per_year(test)


# CFG.target_encoding_features
for col in CFG.target_encoding_features:
    mean_map = X_train.groupby(col)["price"].mean()
    X_train[col+"_target_encoding"] = X_train[col].map(mean_map)
    X_valid[col+"_target_encoding"] = X_valid[col].map(mean_map)
    test_df[col+"_target_encoding"] = test_df[col].map(mean_map)
    if fold == 0:
        CFG.use_features.append(col+"_target_encoding")


# CFG.categorical_featuresをcategory型に変換
for col in CFG.categorical_features:
    X_train[col] = X_train[col].astype("category")
    X_valid[col] = X_valid[col].astype("category")
    test_df[col] = test_df[col].astype("category")


# CFG.count_encoding_features
for col in CFG.count_encoding_features:
    count_map = X_train[col].value_counts().to_dict()
    X_train[col+"_count_encoding"] = X_train[col].map(count_map)
    X_valid[col+"_count_encoding"] = X_valid[col].map(count_map)
    test_df[col+"_count_encoding"] = test_df[col].map(count_map)
    if fold == 0:
        CFG.use_features.append(col+"_count_encoding")

In [14]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [32]:
CFG.max_grad_norm=1000
CFG.gradient_accumulation_steps=1
CFG.lr=2e+5
CFG.weight_decay=1e-6
CFG.batch_size=320
CFG.epochs=20
CFG.num_workers = 0

In [33]:
!pip install transformers



In [34]:
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from torch.optim import AdamW

In [35]:
CFG.eps=1e-6
CFG.betas=(0.9, 0.999)

In [36]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [37]:
CFG.scheduler = "cosine"
CFG.num_warmup_steps = 0
CFG.num_cycles = 0.5

In [38]:
# ====================================================
# loader
# ====================================================
train_dataset = TrainDataset(CFG, X_train)
valid_dataset = TrainDataset(CFG, X_valid)

train_loader = DataLoader(train_dataset,
                          batch_size=CFG.batch_size,
                          shuffle=True,
                          num_workers=CFG.num_workers,
                          pin_memory=True,
                          drop_last=True)
valid_loader = DataLoader(valid_dataset,
                          batch_size=CFG.batch_size,
                          shuffle=False,
                          num_workers=CFG.num_workers,
                          pin_memory=True,
                          drop_last=False)


def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler=='linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
    return scheduler


model = CustomModel(CFG).to(device)
if CFG.save_model:
    torch.save(model.config, CFG.output_dir+'config.pth')
model.to(device)


optimizer = AdamW(model.parameters(), lr=CFG.lr, eps=CFG.eps, betas=CFG.betas)

num_train_steps = int(len(X_train) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [39]:
CFG.use_features

['year',
 'odometer',
 'region_target_encoding',
 'condition_target_encoding',
 'cylinders_target_encoding',
 'title_status_target_encoding',
 'transmission_target_encoding',
 'drive_target_encoding',
 'size_target_encoding',
 'type_target_encoding',
 'paint_color_target_encoding',
 'state_target_encoding',
 'manufacturer_target_encoding',
 'condition_target_encoding',
 'fuel_target_encoding']

In [40]:
class MAPELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        absolute_percentage_error = torch.abs((y_true - y_pred) / (y_true + self.eps))
        loss = 100.0 * torch.mean(absolute_percentage_error)

        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [41]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(inputs)
        loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scheduler.step()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg

In [42]:
CFG.print_freq = 100

In [43]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [48]:
criterion = MAPELoss(reduction="mean")
criterion = nn.MSELoss(reduction="mean")

best_score = np.inf

In [49]:
import time

In [50]:
CFG.batch_scheduler = True

In [None]:
for epoch in range(CFG.epochs):
    start_time = time.time()

    # train
    avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

    # eval
    avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

    # scoring
    score = get_score(X_valid["price"], predictions)

    elapsed = time.time() - start_time

    print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
    print(f'Epoch {epoch+1} - Score: {score:.4f}')


    if best_score > score:
        best_score = score
        print('\033[32m'+f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model'+'\033[0m')
        if CFG.save_model:
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

Epoch: [1][0/77] Elapsed 0m 0s (remain 0m 5s) Loss: 250991808.0000(250991808.0000) Grad: 170466.7188  LR: 47214.75932879  
Epoch: [1][76/77] Elapsed 0m 6s (remain 0m 0s) Loss: 317907424.0000(301826854.2338) Grad: 181334.3281  LR: 23919.87681457  
EVAL: [0/9] Elapsed 0m 0s (remain 0m 0s) Loss: 351564672.0000(351564672.0000) 
EVAL: [8/9] Elapsed 0m 1s (remain 0m 0s) Loss: 249150224.0000(302035129.7371) 
Epoch 1 - avg_train_loss: 301826854.2338  avg_val_loss: 302035129.7371  time: 8s
Epoch 1 - Score: 100.0041
[32mEpoch 1 - Save Best Score: 100.0041 Model[0m
Epoch: [2][0/77] Elapsed 0m 0s (remain 0m 10s) Loss: 325165184.0000(325165184.0000) Grad: 190817.3438  LR: 23657.08738846  
Epoch: [2][76/77] Elapsed 0m 7s (remain 0m 0s) Loss: 279304384.0000(301677325.9221) Grad: 178113.0469  LR: 7650.92620143  
EVAL: [0/9] Elapsed 0m 0s (remain 0m 0s) Loss: 351565216.0000(351565216.0000) 
EVAL: [8/9] Elapsed 0m 0s (remain 0m 0s) Loss: 249150640.0000(302035625.6790) 
Epoch 2 - avg_train_loss: 301677