In [2]:
import pandas as pd
import numpy as np
train_y = pd.read_csv('/kaggle/input/alfa-scoring/train_target.csv')


train_data = pd.read_parquet('/kaggle/input/alfa-scoring/train_data_0.pq')
for i in range(1, 12):
    train_data = pd.concat([train_data, pd.read_parquet(f'/kaggle/input/alfa-scoring/train_data_{i}.pq')], ignore_index=True)

In [3]:
# Assuming your DataFrame is named 'df'
for col in train_data.columns:
    if train_data[col].dtype == 'int64':
        if train_data[col].max() < 255:
            train_data[col] = train_data[col].astype('uint8')
        elif train_data[col].max() < 65535:
            train_data[col] = train_data[col].astype('uint16')
        elif train_data[col].max() < 4294967295:
            train_data[col] = train_data[col].astype('uint32')
        else:
            train_data[col] = train_data[col].astype('uint64')


In [5]:
train = train_data.sort_values(by=['id', 'rn'], ascending=[True, True]).groupby("id").tail(5).reset_index().drop("index", axis = 1)
train.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,6,5,0,11,8,12,11,4,2,...,3,3,3,4,1,2,3,1,0,1
1,0,7,3,9,1,2,12,14,15,5,...,3,3,3,4,1,3,4,1,0,0
2,0,8,2,9,2,3,12,14,15,5,...,3,3,3,4,1,3,4,1,0,0
3,0,9,1,9,11,13,14,8,2,5,...,3,3,3,4,1,2,4,1,0,0
4,0,10,7,9,2,10,8,8,16,4,...,3,3,3,4,1,2,4,1,0,0


In [4]:
features = ["pre_since_opened", "pre_since_confirmed", "pre_pterm", "pre_fterm", "pre_till_pclose", "pre_till_fclose",
            "pre_loans_credit_limit", "pre_loans_next_pay_summ", "pre_loans_outstanding", "pre_loans_total_overdue",
            "pre_loans_max_overdue_sum", "pre_loans_credit_cost_rate",
            "pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90",
            "is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90",
            "pre_util", "pre_over2limit", "pre_maxover2limit", "is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit",
            "enc_paym_0", "enc_paym_1", "enc_paym_2", "enc_paym_3", "enc_paym_4", "enc_paym_5", "enc_paym_6", "enc_paym_7", "enc_paym_8",
            "enc_paym_9", "enc_paym_10", "enc_paym_11", "enc_paym_12", "enc_paym_13", "enc_paym_14", "enc_paym_15", "enc_paym_16",
            "enc_paym_17", "enc_paym_18", "enc_paym_19", "enc_paym_20", "enc_paym_21", "enc_paym_22", "enc_paym_23", "enc_paym_24",
            "enc_loans_account_holder_type", "enc_loans_credit_status", "enc_loans_credit_type", "enc_loans_account_cur",
            "pclose_flag", "fclose_flag"]

In [5]:
import torch
import time

timestamp = time.time()

sequences = []
print("Processing IDs:")
total_ids = train['id'].nunique()
for i, (id, group) in enumerate(train.groupby('id')):
    sequences.append(torch.tensor(group.drop(columns=['id', 'rn']).values, dtype=torch.long))
    # Manually print the progress
    if i % 100000 == 0:  # update every 100000 iterations, you can change the frequency of updates here
        print(f"Processed {i}/{total_ids} IDs, time: {time.time()-timestamp}")
print(f"Processed {total_ids}/{total_ids} IDs, time: {time.time()-timestamp}")

Processing IDs:
Processed 0/19260 IDs, time: 0.06031632423400879
Processed 19260/19260 IDs, time: 7.997386932373047


In [6]:
from torch.nn.utils.rnn import pad_sequence
from torch import nn, optim
from tqdm import tqdm

padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

In [7]:
labels = torch.tensor(train_y[:total_ids].drop("id", axis=1).values, dtype=torch.float)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

padded_sequences, labels = padded_sequences.to(device), labels.to(device)

In [8]:
padded_sequences = padded_sequences.to(torch.long)

In [9]:
padded_sequences.shape, labels.shape

(torch.Size([19260, 7, 59]), torch.Size([19260, 1]))

In [10]:
import torch
import torch.nn as nn

class CreditScoringModel(nn.Module):
    def __init__(self, num_embeddings, embedding_dims, hidden_size, num_layers, output_size):
        super(CreditScoringModel, self).__init__()
        
        self.num_embeddings = num_embeddings
        self.embedding_dims = embedding_dims
        
        # Эмбеддинги для категориальных признаков с дополнительным индексом для OOV
        self.embeddings = nn.ModuleList([
            nn.Embedding(num + 1, dim) for num, dim in zip(num_embeddings, embedding_dims)  # +1 для OOV токена
        ])
        
        # Индекс OOV токена - максимальный индекс для каждого эмбеддинга
        self.oov_indices = [num for num in num_embeddings]
        
        # Расчет общей размерности после объединения эмбеддингов
        total_embedding_dim = sum(embedding_dims)
        
        # Инициализация слоя батч нормализации
        self.batch_norm = nn.BatchNorm1d(total_embedding_dim)
        
        # GRU слой
        self.gru = nn.GRU(total_embedding_dim, hidden_size, num_layers, batch_first=True, dropout=0.5)
        
        self.gru_dropout = nn.Dropout(0.2)
        
        # Выходной полносвязный слой
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embeddings = []
        for i, emb in enumerate(self.embeddings):
            # Замена индексов, выходящих за пределы, на OOV индекс для соответствующего эмбеддинга
            x_i = torch.where(x[:,:,i] < self.oov_indices[i], x[:,:,i], torch.full_like(x[:,:,i], self.oov_indices[i]))
            embeddings.append(emb(x_i))
        
        x_embedded = torch.cat(embeddings, dim=2)
        
        # Применение батч нормализации. Поскольку nn.BatchNorm1d ожидает (batch, features),
        # мы временно меняем размерность для батч нормализации, затем возвращаем обратно.
        x_embedded = x_embedded.transpose(1, 2)
        x_embedded = self.batch_norm(x_embedded)
        x_embedded = x_embedded.transpose(1, 2)
        
        gru_out, _ = self.gru(x_embedded)
        last_step_output = gru_out[:,-1,:]
        last_step_output_dropout = self.gru_dropout(last_step_output)
        out = self.fc(last_step_output_dropout)
        
        return out

In [11]:
unique_counts_per_feature = []
for i in range(padded_sequences.size(2)):  # Проходим по каждой колонке
    column = padded_sequences[:,:, i]
    # Исключаем паддинг
    filtered_column = column[column != -1]
    # Подсчитываем уникальные значения
    unique_values = torch.unique(filtered_column)
    unique_counts_per_feature.append(int(unique_values.max()))
    #print(i, unique_values, int(unique_values[-1]))

print("Количество уникальных значений по признакам:", unique_counts_per_feature) # Пример количества уникальных значений для каждого признака

# Расчет размерностей эмбеддингов, понижая в 10 раз, но не меньше минимального порога (например, 2)
embedding_dims = [max(2, (count + 1) // 2) for count in unique_counts_per_feature]
num_embeddings = [count + 1 for count in unique_counts_per_feature]

print("Количество эмбеддингов:", num_embeddings)
print("Размерности эмбеддингов:", embedding_dims)    
    
hidden_size = 128
num_layers = 2
output_size = 1  # Для бинарной классификации

# Инициализация модели
model = CreditScoringModel(num_embeddings, embedding_dims, hidden_size, num_layers, output_size)

# Перенос модели на GPU, если доступно
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Определение функции потерь и оптимизатора
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

Количество уникальных значений по признакам: [19, 17, 17, 16, 16, 15, 19, 6, 5, 0, 3, 13, 16, 18, 8, 4, 14, 1, 1, 1, 1, 1, 19, 19, 19, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 4, 6, 6, 5, 3, 1, 1]
Количество эмбеддингов: [20, 18, 18, 17, 17, 16, 20, 7, 6, 1, 4, 14, 17, 19, 9, 5, 15, 2, 2, 2, 2, 2, 20, 20, 20, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 7, 7, 6, 4, 2, 2]
Размерности эмбеддингов: [10, 9, 9, 8, 8, 8, 10, 3, 3, 2, 2, 7, 8, 9, 4, 2, 7, 2, 2, 2, 2, 2, 10, 10, 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2]


In [12]:
from torch.utils.data import TensorDataset, DataLoader

# Преобразование данных в тензоры PyTorch
sequences_tensor = torch.tensor(padded_sequences, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.float)

# Создание датасета
dataset = TensorDataset(sequences_tensor, labels_tensor)

# DataLoader для эффективного разбиения данных на батчи
batch_size = 64  # Вы можете адаптировать размер батча к возможностям вашего оборудования
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  sequences_tensor = torch.tensor(padded_sequences, dtype=torch.long)
  labels_tensor = torch.tensor(labels, dtype=torch.float)


In [13]:
for i, emb in enumerate(model.embeddings):
    max_index = emb.num_embeddings - 1
    if sequences_tensor[:,:,i].min() < 0 or sequences_tensor[:,:,i].max() > max_index:
        print(f"Ошибка в индексах для слоя эмбеддинга {i}: min {sequences_tensor[:,:,i].min()}, max {sequences_tensor[:,:,i].max()}, должно быть между 0 и {max_index}")

In [14]:
import torch

class EarlyStopping:
    """Ранняя остановка обучения модели, если валидационная потеря не уменьшается."""
    def __init__(self, patience=5, verbose=False, delta=0):
        """
        Args:
            patience (int): Количество эпох без улучшения после которых обучение будет остановлено.
            verbose (bool): Выводить сообщения при ранней остановке.
            delta (float): Минимальное изменение для расчета как улучшение.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Сохраняет модель при улучшении валидационной потери."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'checkpoint_model.pth')
        self.val_loss_min = val_loss


In [15]:
early_stopping = EarlyStopping(patience=2, verbose=True)

In [16]:
from tqdm import tqdm

n_epochs = 30  # Указываем общее количество эпох
def training(n_epochs, model):
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0
        print(f"Start Epoch: {epoch+1}")

        # Обертываем dataloader в tqdm для визуализации прогресса
        for sequences_batch, labels_batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
            sequences_batch, labels_batch = sequences_batch.to(device), labels_batch.to(device)
            optimizer.zero_grad()
            outputs = model(sequences_batch)
            loss = criterion(outputs.squeeze(-1), labels_batch.squeeze(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss:.4f}')
        
        early_stopping(avg_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

In [17]:
training(n_epochs, model)
torch.save(model.state_dict(), 'model_10.pth')

Start Epoch: 1


Epoch 1: 100%|██████████| 301/301 [00:06<00:00, 47.10it/s]


Epoch 1/10, Loss: 0.1403
Validation loss decreased (inf --> 0.140260).  Saving model ...
Start Epoch: 2


Epoch 2: 100%|██████████| 301/301 [00:04<00:00, 60.37it/s]


Epoch 2/10, Loss: 0.1236
Validation loss decreased (0.140260 --> 0.123605).  Saving model ...
Start Epoch: 3


Epoch 3: 100%|██████████| 301/301 [00:05<00:00, 59.95it/s]


Epoch 3/10, Loss: 0.1178
Validation loss decreased (0.123605 --> 0.117809).  Saving model ...
Start Epoch: 4


Epoch 4: 100%|██████████| 301/301 [00:04<00:00, 60.21it/s]


Epoch 4/10, Loss: 0.1095
Validation loss decreased (0.117809 --> 0.109539).  Saving model ...
Start Epoch: 5


Epoch 5: 100%|██████████| 301/301 [00:05<00:00, 57.79it/s]


Epoch 5/10, Loss: 0.0963
Validation loss decreased (0.109539 --> 0.096279).  Saving model ...
Start Epoch: 6


Epoch 6: 100%|██████████| 301/301 [00:05<00:00, 59.95it/s]


Epoch 6/10, Loss: 0.0797
Validation loss decreased (0.096279 --> 0.079733).  Saving model ...
Start Epoch: 7


Epoch 7: 100%|██████████| 301/301 [00:04<00:00, 60.30it/s]


Epoch 7/10, Loss: 0.0564
Validation loss decreased (0.079733 --> 0.056429).  Saving model ...
Start Epoch: 8


Epoch 8: 100%|██████████| 301/301 [00:05<00:00, 60.07it/s]


Epoch 8/10, Loss: 0.0414
Validation loss decreased (0.056429 --> 0.041410).  Saving model ...
Start Epoch: 9


Epoch 9: 100%|██████████| 301/301 [00:05<00:00, 59.84it/s]


Epoch 9/10, Loss: 0.0348
Validation loss decreased (0.041410 --> 0.034806).  Saving model ...
Start Epoch: 10


Epoch 10: 100%|██████████| 301/301 [00:05<00:00, 59.34it/s]

Epoch 10/10, Loss: 0.0250
Validation loss decreased (0.034806 --> 0.024986).  Saving model ...





# EVAL

In [None]:
import pandas as pd
import numpy as np
test_y = pd.read_csv('/kaggle/input/alfa-scoring/test_target.csv')
test1 = pd.read_parquet('/kaggle/input/alfa-scoring/test_data_0.pq')
test2 = pd.read_parquet('/kaggle/input/alfa-scoring/test_data_1.pq')
test_data = pd.concat([test1, test2], ignore_index=True)

In [None]:
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from torch import nn, optim
from tqdm import tqdm
import time
import torch

# Assuming your DataFrame is named 'df'
for col in test_data.columns:
    if test_data[col].dtype == 'int64':
        if test_data[col].max() < 255:
            test_data[col] = test_data[col].astype('uint8')
        elif test_data[col].max() < 65535:
            test_data[col] = test_data[col].astype('uint16')
        elif test_data[col].max() < 4294967295:
            test_data[col] = test_data[col].astype('uint32')
        else:
            test_data[col] = test_data[col].astype('uint64')

test = test_data.sort_values(by=['id', 'rn'], ascending=[True, True]).groupby("id").tail(5).reset_index().drop("index", axis = 1)

sequences0 = []
print("Processing IDs:")
total_ids0 = test['id'].nunique()
timestamp = time.time()
for i, (id, group) in enumerate(test.groupby('id')):
    sequences0.append(torch.tensor(group.drop(columns=['id', 'rn']).values, dtype=torch.long))
    # Manually print the progress
    if i % 10000 == 0:  # update every 100 iterations, you can change the frequency of updates here
        print(f"Processed {i}/{total_ids0} IDs time:{time.time()-timestamp}")
        

In [None]:
padded_sequences0 = pad_sequence(sequences0, batch_first=True, padding_value=0)
labels0 = torch.tensor(test_y[:total_ids0].drop("id", axis=1).values, dtype=torch.float)
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

padded_sequences0, labels0 = padded_sequences0.to(device), labels0.to(device)
padded_sequences0 = padded_sequences0.to(torch.long)

In [None]:

if 'model' not in locals():
    unique_counts_per_feature = []
    for i in range(padded_sequences0.size(2)):  # Проходим по каждой колонке
        column = padded_sequences0[:,:, i]
        # Исключаем паддинг
        filtered_column = column[column != -1]
        # Подсчитываем уникальные значения
        unique_values = torch.unique(filtered_column)
        unique_counts_per_feature.append(int(unique_values.max()))
        #print(i, unique_values, int(unique_values[-1]))

    print("Количество уникальных значений по признакам:", unique_counts_per_feature) # Пример количества уникальных значений для каждого признака

    # Расчет размерностей эмбеддингов, понижая в 10 раз, но не меньше минимального порога (например, 2)
    embedding_dims = [max(2, (count + 1) // 2) for count in unique_counts_per_feature]
    num_embeddings = [count + 1 for count in unique_counts_per_feature]

    num_embeddings[9], embedding_dims[9] = 2, 2
    num_embeddings[15], embedding_dims[15] = 5, 2
    num_embeddings[56], embedding_dims[56] = 4, 2

    print("Количество эмбеддингов:", num_embeddings)
    print("Размерности эмбеддингов:", embedding_dims)    
    
    hidden_size = 128
    num_layers = 2
    output_size = 1  # Для бинарной классификации

# Инициализация модели
    model = CreditScoringModel(num_embeddings, embedding_dims, hidden_size, num_layers, output_size)

#print(model.embeddings)

    model.load_state_dict(torch.load("/kaggle/input/alfa_gru/pytorch/10ep1/1/model_10.pth"), strict=False)
#model.load_state_dict(torch.load("/kaggle/working/model_10.pth"))

else:
    print("Model already here")
# Перенос модели на GPU, если доступно
model = model.to(device)

In [None]:
from torch.utils.data import TensorDataset, DataLoader
sequences_tensor0 = torch.tensor(padded_sequences0, dtype=torch.long)
labels_tensor0 = torch.tensor(labels0, dtype=torch.float)

test_labels_dataset = TensorDataset(sequences_tensor0, labels_tensor0)
test_labels_dataloader = DataLoader(test_labels_dataset, batch_size=64, shuffle=False)

In [None]:
for i, emb in enumerate(model.embeddings):
    max_index = emb.num_embeddings - 1
    if sequences_tensor0[:,:,i].min() < 0 or sequences_tensor0[:,:,i].max() > max_index:
        print(f"Ошибка в индексах для слоя эмбеддинга {i}: min {sequences_tensor0[:,:,i].min()}, max {sequences_tensor0[:,:,i].max()}, должно быть между 0 и {max_index}")

In [None]:
def eval(model):
# Список для хранения предсказаний и истинных меток
    predictions = []

    model.eval()  # Переводим модель в режим оценки

    with torch.no_grad():  # Отключаем вычисление градиентов
        for sequences_batch, labels_batch in test_labels_dataloader:
            sequences_batch = sequences_batch.to(device)
            labels_batch = labels_batch.to(device)
        
            outputs = model(sequences_batch)
            predicted_probs = torch.sigmoid(outputs.squeeze(-1)).cpu().numpy()  # Получение вероятностей
            predictions.extend(predicted_probs)
        return predictions

In [None]:
pred = eval(model)

In [None]:
predictions = np.array(pred)

submission = pd.DataFrame({
    "id": test_y[:total_ids0]["id"],  # Убедитесь, что этот массив содержит ID тестовых данных в правильном порядке
    "score": predictions
})
submission.to_csv("sample_submission.csv", index=False)