<a href="https://www.kaggle.com/code/akscent/transformer-classifer?scriptVersionId=150690653" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Put the dataset's helper-code directory on the import path so the three
# local modules below can be imported.
# NOTE(review): FiveDataset and ModelForClassification, used later in this
# notebook, presumably come from these star imports — confirm.
sys.path.insert(1, '/kaggle/input/ods-huawei/transformer/')
from dataset import *
from model import *
from trainer import Trainer

import torch
from torch.utils.data import Dataset
from typing import Dict
import json
from numpy import asarray
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, AdamW, SGD, Rprop, Adadelta, Adamax, SparseAdam, NAdam, RAdam
from tqdm.notebook import tqdm
from textblob import TextBlob

# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(42)

<torch._C.Generator at 0x7a12ea222d10>

# Data loading

In [None]:
# Read both training tables, then place the `rate` column of
# feature_train.csv in front of the columns of new_train.csv
# (rows are matched on the index).
f_train = pd.read_csv('/kaggle/input/ods-huawei/feature_train.csv')
train = pd.read_csv('/kaggle/input/ods-huawei/new_train.csv')
train_data = pd.concat(
    [f_train['rate'], train],
    axis='columns',
)

# Train/validation split

In [None]:
# Hold out 25% of the rows for validation; the split is shuffled with a
# fixed seed and stratified on `rate`.
train_split, val_split = train_test_split(
    train_data,
    test_size=0.25,
    shuffle=True,
    stratify=train_data['rate'],
    random_state=42,
)

In [None]:
import matplotlib.pyplot as plt

# Overlay the `rate` histograms of both splits on one figure.
plt.figure(figsize=(10, 5))
for part, part_label in ((train_split, 'Train Split'), (val_split, 'Validation Split')):
    plt.hist(part['rate'], bins=10, alpha=0.5, label=part_label)

plt.title('Histogram of Rates for Train and Validation Splits')
plt.xlabel('Rate')
plt.ylabel('Frequency')
plt.legend()

plt.show()

# Loading the pretrained tokenizer

In [None]:
# Load the tokenizer published with the rubert-tiny2 checkpoint.
# NOTE(review): `truncation` is normally a per-call tokenizer argument —
# confirm that passing it at load time has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(
    "cointegrated/rubert-tiny2",
    do_lower_case=True,
    truncation=True,
)

# Creating train dataset

In [None]:
# MAX_LEN is handed to FiveDataset (presumably the maximum token length —
# confirm against the dataset module); BATCH_SIZE is used by both loaders.
MAX_LEN = 128
BATCH_SIZE = 64

train_dataset = FiveDataset(train_split, tokenizer, MAX_LEN)
val_dataset = FiveDataset(val_split, tokenizer, MAX_LEN)

# The two loader configurations differ only in shuffling: on for training,
# off for evaluation.
train_params = dict(batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

train_dataloader = DataLoader(train_dataset, **train_params)
val_dataloader = DataLoader(val_dataset, **test_params)

# Loading pretrained model from Huggingface

In [None]:
# One output class per distinct `rate` value found in the training split.
config = dict(
    num_classes=np.unique(train_split['rate']).size,
    dropout_rate=0.1,
)
model = ModelForClassification("cointegrated/rubert-tiny2", config=config)

# Fitting the model

In [None]:
# Settings passed to the project's Trainer; the device falls back to CPU
# when CUDA is not available.
trainer_config = dict(
    lr=3e-4,
    n_epochs=2,
    weight_decay=1e-6,
    batch_size=BATCH_SIZE,
    device="cuda" if torch.cuda.is_available() else "cpu",
    seed=42,
)
t = Trainer(trainer_config)

# Fit the model, handing over the training and validation loaders.
t.fit(model, train_dataloader, val_dataloader)

# Save model

In [None]:
# Save through the trainer to "tf_model.ckpt" (a path relative to the
# working directory).
t.save("tf_model.ckpt")

# Load Model

In [None]:
# t = Trainer.load("best_baseline_model.ckpt")

# Get testset predictions


In [None]:
# Read the cleaned test CSV and wrap it in a loader that uses the same
# dataset class, tokenizer and MAX_LEN as training; `test_params` keeps
# shuffling off.
test_data = pd.read_csv('/kaggle/input/ods-huawei/test_cleaned.csv')
test_dataset = FiveDataset(test_data, tokenizer, MAX_LEN)
test_dataloader = DataLoader(test_dataset, **test_params)

In [None]:
# Run the trainer's prediction over the test loader.
# NOTE(review): the result is presumably one row of per-class scores per
# sample — confirm against Trainer.predict.
predictions = t.predict(test_dataloader)

In [None]:
# Take the index of the largest score in each prediction and add one, so
# the resulting class labels start at 1 rather than 0.
predicted_classes = [np.argmax(probabilities) + 1 for probabilities in predictions]

# Create submission


In [None]:

# Load the submission template and fill its `rate` column with the
# predicted classes.
sample_submission = pd.read_csv(
    '/kaggle/input/ods-huawei/sample_submission.csv'
).assign(rate=predicted_classes)
# sample_submission.rate = le.inverse_transform(sample_submission.rate)
sample_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv("submission.csv", index=False)

# Validation confusion matrix

In [None]:
# Run the trainer's prediction over the validation loader (not shuffled).
predictions_val = t.predict(val_dataloader)

In [None]:
# Take the index of the largest score in each validation prediction and add
# one, so the resulting class labels start at 1 rather than 0.
predicted_classes_val = [np.argmax(probabilities) + 1 for probabilities in predictions_val]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a LabelEncoder on the `rate` column of train.csv; the column is
# replaced by its encoded values and `le` is kept for mapping codes back.
PATH = "/kaggle/input/ods-huawei/"
le = LabelEncoder()
temp_data = pd.read_csv(os.path.join(PATH, "train.csv"))
temp_data["rate"] = le.fit_transform(temp_data["rate"])

def evaluate_classification_metrics(y_true, y_pred, model_name):
    """Print a classification report and plot the confusion matrix.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned element-wise with ``y_true``.
        model_name: Name shown in the header of the printed report.

    The heatmap rows are the true classes and the columns are the predicted
    classes. Both axes are ticked with the actual label values rather than
    positional indices.
    """
    # Fix the (sorted) label order once so the matrix rows/columns and the
    # tick labels drawn on the heatmap are guaranteed to agree.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(f"Classification Report for {model_name}:\n", classification_report(y_true, y_pred))
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        xticklabels=labels,
        yticklabels=labels,
    )
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# Ground truth goes first and predictions second, matching the function's
# (y_true, y_pred, model_name) signature; the reverse order transposes the
# confusion matrix and swaps precision with recall in the report.
evaluate_classification_metrics(le.inverse_transform(val_split['rate']), predicted_classes_val, "val dataset")

# Train probabilities as features

In [None]:
# Score the full training table in its original row order (the unshuffled
# `test_params` are reused) and store the five per-class outputs as columns.
# Note: this rebinds `train_dataset` / `train_dataloader`, so the shuffled
# training loader is no longer available after this cell.
train_dataset = FiveDataset(train_data, tokenizer, MAX_LEN)
train_dataloader = DataLoader(train_dataset, **test_params)
predictions = t.predict(train_dataloader)
train_proba = pd.DataFrame(
    data=predictions,
    columns=['1 star', '2 stars', '3 stars', '4 stars', '5 stars'],
)
train_proba.to_csv("train_proba_tf.csv", index=False)
print(train_proba)

# Test proba as features

In [None]:
# Score the test table in its original row order and store the five
# per-class outputs as columns.
# NOTE(review): this reads test.csv, whereas the submission predictions were
# made from test_cleaned.csv — confirm which file is intended here.
test_data = pd.read_csv('/kaggle/input/ods-huawei/test.csv')
test_dataset = FiveDataset(test_data, tokenizer, MAX_LEN)
test_dataloader = DataLoader(test_dataset, **test_params)
predictions = t.predict(test_dataloader)
test_proba = pd.DataFrame(
    data=predictions,
    columns=['1 star', '2 stars', '3 stars', '4 stars', '5 stars'],
)
test_proba.to_csv("test_proba_tf.csv", index=False)
print(test_proba)