In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e7/sample_submission.csv
/kaggle/input/playground-series-s4e7/train.csv
/kaggle/input/playground-series-s4e7/test.csv


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [5]:
# 데이터프레임 복사
import pandas as pd

def derived_variables(df_input, region_risk=None):
    """Return a new frame with type conversions and derived features applied.

    The input frame is left untouched. The result drops `id`,
    `Previously_Insured` and `Region_Code`, and gains two columns:

    * `Insured_Vintage` - `Previously_Insured * Vintage`.
    * `Region_Risk` - the mean `Vehicle_Damage` rate of the row's region.

    Args:
        df_input: DataFrame holding at least `id`, `Region_Code`,
            `Policy_Sales_Channel`, `Vehicle_Damage`, `Previously_Insured`
            and `Vintage`.
        region_risk: Optional mapping (Series or dict) from integer region
            code to risk value. When omitted, the mapping is computed from
            `df_input` itself. Pass the mapping learned on the training data
            to give a test frame the same feature definition; region codes
            missing from the mapping become NaN.

    Returns:
        A new DataFrame with the converted and derived columns.
    """
    # `drop` already returns a new frame, so no separate copy is needed.
    df = df_input.drop(columns=['id'])

    # Type conversions
    df['Region_Code'] = df['Region_Code'].astype(int)
    df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)

    # Vehicle_Damage: 'Yes' -> 1, anything else -> 0 (vectorised comparison
    # instead of a per-row Python lambda).
    df['Vehicle_Damage'] = (df['Vehicle_Damage'] == 'Yes').astype(int)
    df['Previously_Insured'] = df['Previously_Insured'].astype(int)

    # 1. Insured_Vintage: product of Previously_Insured and Vintage
    df['Insured_Vintage'] = df['Previously_Insured'] * df['Vintage']

    # 2. Region_Risk: mean Vehicle_Damage rate per Region_Code
    if region_risk is None:
        region_risk = df.groupby('Region_Code')['Vehicle_Damage'].mean()
    df['Region_Risk'] = df['Region_Code'].map(region_risk)

    return df.drop(columns=['Previously_Insured', 'Region_Code'])

def normalize_train_data(train):
    """Z-score the numeric columns of the training frame.

    `Driving_License`, `Vehicle_Damage` and `Response` are cast to `str`
    first (when present) so they are excluded from scaling. Every remaining
    column of dtype int64 / float64 / int32 is replaced by
    `(value - mean) / std`.

    Args:
        train: Training DataFrame; it is not modified.

    Returns:
        `(train_df, stats)` where `train_df` is the normalised copy and
        `stats` maps each scaled column name to the `(mean, std)` pair that
        was actually applied.
    """
    train_df = train.copy()

    # Flag/target columns are turned into strings so select_dtypes skips them.
    column_to_str = ['Driving_License', 'Vehicle_Damage', 'Response']
    for col in column_to_str:
        if col in train_df.columns:
            train_df[col] = train_df[col].astype(str)
    numeric_cols = train_df.select_dtypes(include=['int64', 'float64', 'int32']).columns

    # Mean / std per scaled column, reused later for the test data
    stats = {}

    for col in numeric_cols:
        mean = train_df[col].mean()
        std = train_df[col].std()
        # A constant (std == 0) or single-row (std is NaN) column would turn
        # into NaN/inf; scale by 1 so it is only centred.
        if pd.isna(std) or std == 0:
            std = 1.0
        train_df[col] = (train_df[col] - mean) / std
        stats[col] = (mean, std)

    return train_df, stats

def normalize_test_data(test, stats):
    """Scale a test frame with statistics learned on the training data.

    `Driving_License`, `Vehicle_Damage` and `Response` are cast to `str`
    when they exist. Test data normally has no `Response` column, so a
    missing column is skipped instead of raising KeyError. Remaining columns
    of dtype int64 / float64 / int32 are scaled with the `(mean, std)` pair
    found in `stats`; a column without an entry is left unscaled (mean 0,
    std 1).

    Args:
        test: DataFrame to normalise; it is not modified.
        stats: Mapping of column name to `(mean, std)`.

    Returns:
        The normalised copy of `test`.
    """
    test_df = test.copy()

    # Flag/target columns are turned into strings so select_dtypes skips them.
    column_to_str = ['Driving_License', 'Vehicle_Damage', 'Response']
    for col in column_to_str:
        if col in test_df.columns:
            test_df[col] = test_df[col].astype(str)

    numeric_cols = test_df.select_dtypes(include=['int64', 'float64', 'int32']).columns

    for col in numeric_cols:
        mean, std = stats.get(col, (0, 1))  # identity scaling when unknown
        # Guard against a degenerate std to avoid NaN/inf.
        if pd.isna(std) or std == 0:
            std = 1
        test_df[col] = (test_df[col] - mean) / std

    return test_df

def one_hot_encode(df_input, columns = ['Gender', 'Driving_License', 'Vehicle_Age', 'Vehicle_Damage']):
    """One-hot encode the given columns, dropping the first level of each.

    The listed columns are cast to `str` before encoding, so the dummy
    column names and the dropped level follow the string sort order of the
    values. The input frame is not modified.

    Args:
        df_input: Source DataFrame.
        columns: Names of the columns to encode.

    Returns:
        A new DataFrame where each listed column is replaced by its dummies.
    """
    # Cast every target column to string in one pass (returns a new frame).
    as_text = df_input.astype({name: str for name in columns})
    return pd.get_dummies(as_text, columns=columns, drop_first=True)

def convert_bool_to_numeric(df):
    """Return a copy of `df` in which every bool column is cast to int.

    Columns of any other dtype are carried over unchanged, and the input
    frame itself is not modified.
    """
    # Collect the bool-typed columns, then cast them all in a single astype call.
    bool_columns = [name for name in df.columns if df[name].dtype == bool]
    return df.astype({name: int for name in bool_columns})

In [6]:
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
train = pd.read_csv("/kaggle/input/playground-series-s4e7/train.csv")

In [7]:
# Summary statistics of the numeric columns, transposed so each column is shown as a row.
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,11504798.0,5752398.0,3321149.0,0.0,2876199.25,5752398.5,8628597.75,11504797.0
Age,11504798.0,38.38356,14.99346,20.0,24.0,36.0,49.0,85.0
Driving_License,11504798.0,0.998022,0.0444312,0.0,1.0,1.0,1.0,1.0
Region_Code,11504798.0,26.41869,12.99159,0.0,15.0,28.0,35.0,52.0
Previously_Insured,11504798.0,0.4629966,0.4986289,0.0,0.0,0.0,1.0,1.0
Annual_Premium,11504798.0,30461.37,16454.75,2630.0,25277.0,31824.0,39451.0,540165.0
Policy_Sales_Channel,11504798.0,112.4254,54.03571,1.0,29.0,151.0,152.0,163.0
Vintage,11504798.0,163.8977,79.97953,10.0,99.0,166.0,232.0,299.0
Response,11504798.0,0.1229973,0.3284341,0.0,0.0,0.0,0.0,1.0


In [8]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [9]:
from pandas.api.types import CategoricalDtype

# Columns with fewer than 10 distinct values; the first column (id) is skipped.
less = [name for name in train.columns[1:] if train[name].nunique() < 10]

print('Column have LESS than 10 unique values: ', less)
print('Column have MORE than 10 unique values: ', [name for name in train.columns if name not in less])

# Store the low-cardinality columns as pandas categoricals.
for name in less:
    train[name] = train[name].astype('category')

# Vehicle_Age gets an explicit, ordered category list (youngest to oldest).
new_categories = ['< 1 Year', '1-2 Year', '> 2 Years']
new_dtype = CategoricalDtype(categories=new_categories, ordered=True)
train['Vehicle_Age'] = train['Vehicle_Age'].astype(new_dtype)

Column have LESS than 10 unique values:  ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Response']
Column have MORE than 10 unique values:  ['id', 'Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']


In [10]:
# Build the derived features for the training dataframe
train = derived_variables(train)

In [11]:
# Z-score the numeric training columns through the shared helper instead of a
# hand-copied duplicate of its body. `stats` keeps the (mean, std) pair per
# scaled column so the test data can be scaled identically later on.
train, stats = normalize_train_data(train)

In [12]:
# One-hot encode the categorical columns (default column list of one_hot_encode).
train = one_hot_encode(train)

In [13]:
# Cast any bool columns in the frame to int.
train = convert_bool_to_numeric(train)

In [14]:
import gc

# Number of objects tracked by the garbage collector before the pass
tracked_before = len(gc.get_objects())
print("Before GC:", tracked_before)

# Run a full collection
gc.collect()

# Number of tracked objects once the pass is done
tracked_after = len(gc.get_objects())
print("After GC:", tracked_after)

Before GC: 213715
After GC: 213564


In [15]:
import torch
print('CUDA:',torch.version.cuda)

cudnn = torch.backends.cudnn.version()
cudnn_major = cudnn // 1000
cudnn = cudnn % 1000
cudnn_minor = cudnn // 100
cudnn_patch = cudnn % 100
print( 'cuDNN:', '.'.join([str(cudnn_major),str(cudnn_minor),str(cudnn_patch)]) )

CUDA: 12.1
cuDNN: 8.9.0


In [None]:
!pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.6.* cuml-cu12==24.6.*

In [17]:
!pip uninstall -y cupy cupy-cuda12x
!pip install cupy-cuda12x

Found existing installation: cupy 13.2.0
Uninstalling cupy-13.2.0:
  Successfully uninstalled cupy-13.2.0
Found existing installation: cupy-cuda12x 13.2.0
Uninstalling cupy-cuda12x-13.2.0:
  Successfully uninstalled cupy-cuda12x-13.2.0
Collecting cupy-cuda12x
  Using cached cupy_cuda12x-13.2.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (2.7 kB)
Using cached cupy_cuda12x-13.2.0-cp310-cp310-manylinux2014_x86_64.whl (89.5 MB)
Installing collected packages: cupy-cuda12x
Successfully installed cupy-cuda12x-13.2.0


In [None]:
import cudf
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.metrics import accuracy_score
from imblearn.under_sampling import EditedNearestNeighbours
from cuml.preprocessing import SMOTE
import pandas as pd

random_state = 777
n_streams = 1

# Build the feature matrix and target as float32
features = train.drop(columns=['Response']).astype('float32')
target = train['Response'].astype('float32')

# Hold out 1% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.01, random_state=random_state)

# Oversample with SMOTE (intended to run on the GPU)
# NOTE(review): this cell has no recorded execution; confirm that
# `cuml.preprocessing` really exports `SMOTE` before relying on it.
smote = SMOTE(random_state=random_state)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Convert the SMOTE output to pandas for the imblearn step
X_train_smote = X_train_smote.to_pandas()
y_train_smote = y_train_smote.to_pandas()

# Clean the oversampled set with Edited Nearest Neighbours (imblearn, CPU)
# NOTE(review): presumably very slow on a training set of this size - verify.
enn = EditedNearestNeighbours()
X_train_resampled, y_train_resampled = enn.fit_resample(X_train_smote, y_train_smote)

# Move the resampled data back into cudf objects
X_train_resampled = cudf.DataFrame.from_pandas(X_train_resampled)
y_train_resampled = cudf.Series(y_train_resampled)

# Train the cuML random forest
model = cuRF(random_state=random_state, n_streams=n_streams)
model.fit(X_train_resampled, y_train_resampled)

# Predict on the validation split and report accuracy
y_pred = model.predict(X_val)
accuracy_all = accuracy_score(y_val, y_pred)
print(f'Model Accuracy with All Variables: {accuracy_all}')

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

random_state = 777

# Seed PyTorch as well: without this, weight initialisation, dropout masks and
# DataLoader shuffling differ between runs even though the split is seeded.
torch.manual_seed(random_state)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Feature matrix and target as float32 arrays (the frame is already normalised)
features = train.drop(columns=['Response']).astype('float32').to_numpy()
target = train['Response'].astype('float32').to_numpy()

# Stratified split so the 1% validation set keeps the class ratio
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.01, random_state=random_state, stratify=target)

# Wrap the arrays in TensorDatasets and DataLoaders
train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))

batch_size = 1024

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 다층 퍼셉트론 모델 정의
class MLP(nn.Module):
    """Three-layer perceptron (input -> 128 -> 64 -> 1) with ReLU and dropout.

    `forward` returns raw logits of shape (batch, 1); no sigmoid is applied
    inside the network.
    """

    def __init__(self, input_dim):
        super().__init__()
        wide, narrow = 128, 64
        # Attribute names define the state_dict keys, so they stay as they are.
        self.fc1 = nn.Linear(input_dim, wide)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(wide, narrow)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(narrow, 1)

    def forward(self, x):
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        return self.fc3(hidden)

# Size the input layer from the width of the training matrix and move the model to the selected device.
input_dim = X_train.shape[1]
model = MLP(input_dim).to(device)

# 가중치 초기화 함수 정의
def init_weights(m):
    """Initialise a linear layer; meant to be used with `model.apply`.

    Linear layers get Xavier-uniform weights and a constant bias of 0.01.
    Any other module is left untouched, and a linear layer created with
    `bias=False` only has its weights initialised.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.01)

model.apply(init_weights)

# Inverse-frequency class weights, kept for reference only.
# NOTE(review): the criterion below is unweighted, so these weights do not
# affect training. The per-sample weight tensor that used to be built here
# (one entry per training row) was never used and has been dropped.
class_counts = torch.bincount(torch.tensor(y_train, dtype=torch.long))
class_weights = 1. / class_counts.float()

# Loss, optimizer and learning-rate schedule
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)  # halve the LR when validation loss plateaus

# Gradient scaler for mixed precision; only enabled when CUDA is present
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# 모델 및 옵티마이저 저장 함수 정의
def save_checkpoint(model, optimizer, scheduler, scaler, epoch, filename="checkpoint.pth.tar"):
    """Write the epoch number and the state dicts of all training objects to `filename`.

    Args:
        model, optimizer, scheduler, scaler: Objects exposing `state_dict()`.
        epoch: Epoch index stored alongside the states.
        filename: Destination path passed to `torch.save`.
    """
    checkpoint = {'epoch': epoch}
    checkpoint['model_state_dict'] = model.state_dict()
    checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    checkpoint['scheduler_state_dict'] = scheduler.state_dict()
    checkpoint['scaler_state_dict'] = scaler.state_dict()

    torch.save(checkpoint, filename)
    print(f"Checkpoint saved at epoch {epoch}")

# 모델 및 옵티마이저 로드 함수 정의
def load_checkpoint(filename, model, optimizer, scheduler, scaler):
    """Restore model, optimizer, scheduler and scaler state from a checkpoint.

    The stored tensors are mapped onto the device the model currently lives
    on, so a checkpoint written on a GPU can also be loaded on a CPU-only
    machine.

    Args:
        filename: Path of a file written by `torch.save` with the keys
            `epoch`, `model_state_dict`, `optimizer_state_dict`,
            `scheduler_state_dict` and `scaler_state_dict`.
        model, optimizer, scheduler, scaler: Objects exposing
            `load_state_dict()`.

    Returns:
        The epoch number stored in the checkpoint.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    checkpoint = torch.load(filename, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    scaler.load_state_dict(checkpoint['scaler_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded from epoch {epoch}")
    return epoch

# 학습 함수 정의
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, scaler, num_epochs=50, patience=5, save_interval=10,
                best_filename="checkpoint.pth.tar", periodic_filename="checkpoint_last.pth.tar"):
    """Train `model` with mixed precision, LR scheduling and early stopping.

    Each epoch runs one pass over `train_loader`, evaluates on `val_loader`,
    steps `scheduler` with the validation loss, and stops once the
    validation loss has not improved for `patience` consecutive epochs.

    Two checkpoint files are written through `save_checkpoint`:

    * `best_filename` - whenever the validation loss improves.
    * `periodic_filename` - every `save_interval` epochs. It used to share
      the best-model filename, so a periodic save could silently replace the
      best weights with worse ones.

    Args:
        model: Network returning logits of shape (batch, 1).
        train_loader, val_loader: Loaders yielding `(inputs, labels)`.
        criterion: Loss taking `(logits, labels)`, both of shape (batch,).
        optimizer, scheduler, scaler: Training state objects.
        num_epochs: Maximum number of epochs.
        patience: Early-stopping patience in epochs.
        save_interval: Period of the "latest state" checkpoint in epochs.
        best_filename: Path of the best-model checkpoint.
        periodic_filename: Path of the periodic checkpoint.
    """
    # Run on whatever device the model already lives on.
    run_device = next(model.parameters()).device
    use_amp = run_device.type == 'cuda'

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False):
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(inputs)
                # squeeze(-1) keeps the batch axis even for a batch of one,
                # so the logits always match the label shape.
                loss = criterion(outputs.squeeze(-1), labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

        # Validation pass
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(run_device), labels.to(run_device)
                with torch.cuda.amp.autocast(enabled=use_amp):
                    outputs = model(inputs)
                    loss = criterion(outputs.squeeze(-1), labels)
                val_loss += loss.item() * inputs.size(0)
        val_loss /= len(val_loader.dataset)
        print(f'Validation Loss: {val_loss:.4f}')

        # Let the scheduler react to the validation loss
        scheduler.step(val_loss)

        # Early-stopping bookkeeping; the best model goes to its own file
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            save_checkpoint(model, optimizer, scheduler, scaler, epoch, filename=best_filename)
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("조기 종료(Early Stopping) 조건에 도달했습니다.")
            break

        # Periodic snapshot of the latest state, kept apart from the best model
        if epoch % save_interval == 0:
            save_checkpoint(model, optimizer, scheduler, scaler, epoch, filename=periodic_filename)

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, scaler, num_epochs=50)

# Predict on the validation split
model.eval()
y_pred = []
with torch.no_grad():
    for inputs, _ in val_loader:
        inputs = inputs.to(device)
        with torch.cuda.amp.autocast():  # mixed precision
            outputs = model(inputs)
        # The network returns logits, so convert them to probabilities before
        # thresholding. view(-1) keeps a 1-D result even for a batch of one.
        probabilities = torch.sigmoid(outputs.float()).view(-1)
        y_pred.extend(probabilities.cpu().numpy())

# Accuracy at a probability threshold of 0.5
y_pred = (np.array(y_pred) > 0.5).astype(int)
accuracy_all = np.mean(y_pred == y_val)
print(f'Model Accuracy with All Variables: {accuracy_all}')

Epoch 1/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 1/50, Loss: 0.3073
Validation Loss: 0.2686


Epoch 2/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 2/50, Loss: 0.2714
Validation Loss: 0.2651


Epoch 3/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 3/50, Loss: 0.2686
Validation Loss: 0.2643


Epoch 4/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 4/50, Loss: 0.2676
Validation Loss: 0.2639


Epoch 5/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 5/50, Loss: 0.2672
Validation Loss: 0.2638


Epoch 6/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 6/50, Loss: 0.2669
Validation Loss: 0.2637


Epoch 7/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 7/50, Loss: 0.2667
Validation Loss: 0.2635


Epoch 8/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 8/50, Loss: 0.2665
Validation Loss: 0.2635


Epoch 9/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 9/50, Loss: 0.2664
Validation Loss: 0.2634


Epoch 10/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 10/50, Loss: 0.2663
Validation Loss: 0.2634


Epoch 11/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 11/50, Loss: 0.2662
Validation Loss: 0.2632


Epoch 12/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 12/50, Loss: 0.2662
Validation Loss: 0.2632


Epoch 13/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 13/50, Loss: 0.2661
Validation Loss: 0.2632


Epoch 14/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 14/50, Loss: 0.2661
Validation Loss: 0.2632


Epoch 15/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 15/50, Loss: 0.2660
Validation Loss: 0.2630


Epoch 16/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 16/50, Loss: 0.2660
Validation Loss: 0.2630


Epoch 17/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 17/50, Loss: 0.2659
Validation Loss: 0.2630


Epoch 18/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 18/50, Loss: 0.2658
Validation Loss: 0.2628


Epoch 19/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 19/50, Loss: 0.2658
Validation Loss: 0.2629


Epoch 20/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 20/50, Loss: 0.2657
Validation Loss: 0.2628


Epoch 21/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 21/50, Loss: 0.2657
Validation Loss: 0.2628


Epoch 22/50:   0%|          | 0/11123 [00:00<?, ?it/s]

Epoch 22/50, Loss: 0.2657
Validation Loss: 0.2628


Epoch 23/50:   0%|          | 0/11123 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [28]:
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
test = pd.read_csv("/kaggle/input/playground-series-s4e7/test.csv")

In [29]:
# Preview the first rows of the raw test data.
test.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [30]:
# Show the (mean, std) pair stored for each normalised training column.
stats

{'Age': (38.38356336199905, 14.993458508381062),
 'Annual_Premium': (30461.370410588694, 16454.74520506136),
 'Policy_Sales_Channel': (112.42544188954903, 54.03570777686181),
 'Vintage': (163.89774388042275, 79.97953110341109),
 'Region_Risk': (0.5026797515262756, 0.12233599691893907)}

In [32]:
from pandas.api.types import CategoricalDtype

# Columns with fewer than 10 distinct values; the first column (id) is skipped.
less = []
for i in test.columns[1:]:
    if test[i].nunique() < 10:
        less.append(i)

print('Column have LESS than 10 unique values: ', less)
print('Column have MORE than 10 unique values: ', [i for i in test.columns if i not in less])

# Store the low-cardinality columns as pandas categoricals.
for i in less:
    test[i] = test[i].astype('category')

# Vehicle_Age gets an explicit, ordered category list (youngest to oldest).
new_categories = ['< 1 Year', '1-2 Year', '> 2 Years']
new_dtype = CategoricalDtype(categories=new_categories, ordered=True)
test['Vehicle_Age'] = test['Vehicle_Age'].astype(new_dtype)

# NOTE(review): Region_Risk is recomputed from the test rows here, whereas the
# training frame used its own rows - confirm this difference is intended.
test = derived_variables(test)

# Flag columns become strings so they are excluded from scaling
column_to_str = ['Driving_License', 'Vehicle_Damage']

for col in column_to_str:
    test[col] = test[col].astype(str)

# Numeric columns that are candidates for scaling
numeric_cols = test.select_dtypes(include=['int64', 'float64', 'int32']).columns

# Scale with the mean / std learned on the training data
for col in numeric_cols:
    if col in stats:
        mean, std = stats[col]
        test[col] = (test[col] - mean) / std
    else:
        print(f"Warning: '{col}' does not have corresponding stats. Skipping normalization for this column.")

test = one_hot_encode(test)
test = convert_bool_to_numeric(test)

# The model consumes features by position, so the test frame must have exactly
# the training feature columns in the same order. One-hot encoding can differ
# between frames when a category is absent from one of them; report any
# difference, then align (columns missing from test are filled with 0).
feature_columns = [name for name in train.columns if name != 'Response']
missing_in_test = [name for name in feature_columns if name not in test.columns]
unexpected_in_test = [name for name in test.columns if name not in feature_columns]
if missing_in_test or unexpected_in_test:
    print(f"Warning: test/train feature mismatch. Missing in test: {missing_in_test}; not used in training: {unexpected_in_test}")
test = test.reindex(columns=feature_columns, fill_value=0)

Column have LESS than 10 unique values:  ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage']
Column have MORE than 10 unique values:  ['id', 'Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']


In [33]:
import gc

# Number of objects tracked by the garbage collector before the pass
tracked_before = len(gc.get_objects())
print("Before GC:", tracked_before)

# Run a full collection
gc.collect()

# Number of tracked objects once the pass is done
tracked_after = len(gc.get_objects())
print("After GC:", tracked_after)

Before GC: 901848
After GC: 899920


In [34]:
# Preview the preprocessed test features.
test.head()

Unnamed: 0,Age,Annual_Premium,Policy_Sales_Channel,Vintage,Insured_Vintage,Region_Risk,Gender_Male,Driving_License_1,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_1
0,-1.226106,-1.691389,0.880428,0.801483,0,0.082865,0,1,1,0,0
1,0.57468,0.426724,0.214202,-0.511353,0,1.291573,1,1,0,0,1
2,0.57468,-1.691389,-1.599414,1.339121,0,0.150175,1,1,0,0,1
3,-1.092714,-0.362167,0.732378,-0.611378,115,0.082865,0,1,1,0,0
4,0.841463,0.222041,0.214202,-0.198773,0,0.930258,1,1,0,0,0


In [36]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_model(filename, model, optimizer=None, scheduler=None, scaler=None):
    """Load trained weights (and optionally training state) from a checkpoint.

    The stored tensors are mapped onto `device`, so a checkpoint written on
    a GPU can also be loaded on a CPU-only machine. The model is moved to
    `device` afterwards.

    Args:
        filename: Path of a checkpoint holding `model_state_dict` and, for
            the optional objects, `optimizer_state_dict`,
            `scheduler_state_dict` and `scaler_state_dict`.
        model: Module whose weights are restored.
        optimizer, scheduler, scaler: Restored only when provided.
    """
    checkpoint = torch.load(filename, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler is not None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    if scaler is not None:
        scaler.load_state_dict(checkpoint['scaler_state_dict'])
    model.to(device)
    print("Model loaded")

# 다층 퍼셉트론 모델 정의 (학습 코드와 동일)
class MLP(nn.Module):
    """Three-layer perceptron (input -> 128 -> 64 -> 1) with ReLU and dropout.

    `forward` returns raw logits of shape (batch, 1); no sigmoid is applied
    inside the network.
    """

    def __init__(self, input_dim):
        super().__init__()
        wide, narrow = 128, 64
        # Attribute names define the state_dict keys, so they stay as they are.
        self.fc1 = nn.Linear(input_dim, wide)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(wide, narrow)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(narrow, 1)

    def forward(self, x):
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        return self.fc3(hidden)

# Build the network with the test feature width and load the trained weights
input_dim = test.shape[1]
model = MLP(input_dim).to(device)
load_model('checkpoint.pth.tar', model)

# Test features as a float32 tensor (the frame is assumed to be normalised already)
test_features = test.astype('float32').to_numpy()
test_tensor = torch.tensor(test_features)

# Run inference in chunks so the whole test set and its activations never have
# to sit on the device at once.
inference_batch_size = 65536
model.eval()
probability_chunks = []
with torch.no_grad():
    for chunk in torch.split(test_tensor, inference_batch_size):
        chunk = chunk.to(device)
        with torch.cuda.amp.autocast():  # mixed precision
            chunk_outputs = model(chunk)
        # Sigmoid in float32 so the probabilities are not rounded to half precision
        probability_chunks.append(torch.sigmoid(chunk_outputs.float()).cpu().numpy())
test_probabilities = np.concatenate(probability_chunks)

# Binarise at a probability threshold of 0.5
# NOTE(review): the recorded run printed only 1s although roughly 12% of the
# training rows are positive - verify that the test preprocessing matches the
# training preprocessing before trusting these predictions.
test_predictions = (test_probabilities > 0.5).astype(int)

print(test_predictions)

[[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [1]]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Collect validation probabilities and labels batch by batch
model.eval()
pred_batches = []
true_batches = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        with torch.cuda.amp.autocast():
            outputs = model(inputs)
        pred_batches.append(torch.sigmoid(outputs).cpu().numpy())
        true_batches.append(labels.cpu().numpy())

# Binarise the probabilities at 0.5
y_pred = (np.concatenate(pred_batches) > 0.5).astype(int)
y_true = np.concatenate(true_batches)

# Classification metrics on the validation split
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

In [63]:
# Number of training rows per target class.
train['Response'].value_counts()

Response
0    10089739
1     1415059
Name: count, dtype: int64

In [None]:
# Number of test rows per predicted class.
pd.DataFrame(test_predictions).value_counts()