In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import gc

# Load the dataset
df = pd.read_csv("/kaggle/input/checkworthiness/checkworthiness_labeled.csv")
df_cleaned = df.dropna()  # Remove rows with any missing values
df = df_cleaned

# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input texts
tokenized_texts = tokenizer(df['Text'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Convert labels to tensor
label_map = {'Yes': 1, 'No': 0}  # Define mapping for categories to integers
labels = torch.tensor(df['Category'].map(label_map).tolist())

class BertCNNClassifier(nn.Module):
    def __init__(self, num_classes, dropout_prob=0.1, bert_model_name='bert-base-uncased', cnn_out_channels=128, cnn_kernel_sizes=(2, 3, 4), dropout_p=0.1):
        super(BertCNNClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        # CNN layers with batch normalization and dropout
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=cnn_out_channels, kernel_size=k),
                nn.BatchNorm1d(cnn_out_channels),
                nn.ReLU(),
                nn.Dropout(dropout_p)
            )
            for k in cnn_kernel_sizes
        ])
        
        # Fully connected layer with dropout
        self.fc = nn.Sequential(
            nn.Linear(len(cnn_kernel_sizes) * cnn_out_channels, num_classes),
            nn.Dropout(dropout_prob)
        )
        
    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]  # Output of BERT last layer
        
        # Permute to have the shape (batch_size, hidden_size, sequence_length)
        bert_output = bert_output.permute(0, 2, 1)
        
        # Apply CNNs and pool over time
        conv_outputs = [conv(bert_output) for conv in self.convs]
        pooled_outputs = [torch.max(conv_output, dim=2)[0] for conv_output in conv_outputs]
        
        # Concatenate pooled outputs
        cat_output = torch.cat(pooled_outputs, dim=1)
        
        # Fully connected layer
        output = self.fc(cat_output)
        
        return output

# Initialize the model and move it to GPU if available
num_classes = 2  # Assuming binary classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertCNNClassifier(num_classes=num_classes).to(device)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(tokenized_texts['input_ids'], labels, test_size=0.1, random_state=42)
train_masks, val_masks, _, _ = train_test_split(tokenized_texts['attention_mask'], tokenized_texts['attention_mask'], test_size=0.1, random_state=42)

# Convert to PyTorch tensors and move to GPU
train_texts = train_texts.to(device)
val_texts = val_texts.to(device)
train_masks = train_masks.to(device)
val_masks = val_masks.to(device)
train_labels = train_labels.to(device)
val_labels = val_labels.to(device)

# DataLoaders
batch_size = 8
train_dataset = TensorDataset(train_texts, train_masks, train_labels)
val_dataset = TensorDataset(val_texts, val_masks, val_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Loss function and optimizer with weight decay
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-5)

# Accuracy Functions
def get_accuracy_per_batch(oglabels, predlabels):
    correct = (predlabels == oglabels).sum().item()
    total = len(oglabels)
    accuracy = correct / total
    return accuracy

def get_total_accuracy(acc_list):
    return sum(acc_list) / len(acc_list)

# Training function
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    total_preds = []
    accuracy = 0
    total = len(train_loader)
    acc_list = []
    for i, batch in enumerate(train_loader):
        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss / (total * batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')
        # Push the batch to GPU
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()
        # Clear previously calculated gradients
        optimizer.zero_grad()
        # Get model predictions for the current batch
        preds = model(sent_id.long(), mask)
        predicted_labels = torch.argmax(preds, dim=1)
        accuracy = get_accuracy_per_batch(labels, predicted_labels)
        acc_list.append(accuracy)
        # Compute the loss between actual and predicted values
        loss = criterion(preds, labels)
        # Add on to the total loss
        total_loss += float(loss.item())
        # Backward pass to calculate the gradients
        loss.backward()
        # Clip the gradients to prevent the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters
        optimizer.step()
        # Model predictions are stored on GPU. So, push it to CPU and append the model predictions
        total_preds.append(preds.detach().cpu().numpy())
        
    # Compute the training loss of the epoch
    avg_loss = total_loss / (len(train_loader) * batch_size)
    # Average accuracy
    total_accuracy = get_total_accuracy(acc_list)
    # Reshape the predictions
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds, total_accuracy

# Evaluation function
def evaluate():
    print("\n\nEvaluating...")
    model.eval()
    total_loss, total_accuracy = 0, 0
    accuracy = 0
    total_preds = []
    acc_list = []
    total = len(val_loader)
    for i, batch in enumerate(val_loader):
        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss / (total * batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')
        # Push the batch to GPU
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()
        # Deactivate autograd
        with torch.no_grad():
            # Model predictions
            preds = model(sent_id, mask)
            # Compute the validation loss between actual and predicted values
            loss = criterion(preds, labels)
            total_loss += float(loss.item())
            total_preds.append(preds.detach().cpu().numpy())
            predicted_labels = torch.argmax(preds, dim=1)
            accuracy = get_accuracy_per_batch(labels, predicted_labels)
            acc_list.append(accuracy)
    # Compute the validation loss of the epoch
    avg_loss = total_loss / (len(val_loader) * batch_size)
    # Average accuracy
    total_accuracy = get_total_accuracy(acc_list)
    # Reshape the predictions
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds, total_accuracy

print(device)

# Training and Validation loop
best_accuracy = 0.0  
best_model_state = None  

# Define the number of epochs
epochs = 6
current = 1

# Training and Validation loop
while current <= epochs:
    print(f'\nEpoch {current} / {epochs}:')

    # Train model
    train_loss, _, train_acc = train()

    # Evaluate model
    valid_loss, _, valid_acc = evaluate()

    # Check if the current epoch's accuracy is the best so far
    if valid_acc > best_accuracy:
        best_accuracy = valid_acc
        best_model_state = model.state_dict()

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')
    print(f'\n\nTraining Accuracy: {train_acc:.3f}')
    print(f'Validation Accuracy: {valid_acc:.3f}')

    current = current + 1

# Save the model with the best accuracy
if best_model_state is not None:
    torch.save(best_model_state, 'bert_cnn_model.pth')

# Get predictions for test data
gc.collect()
torch.cuda.empty_cache()


cuda

Epoch 1 / 5:
Batch 1012/1012 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.05, accuracy=0.875

Evaluating...
Batch 113/113 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.04, accuracy=1.05

Training Loss: 0.046
Validation Loss: 0.037


Training Accuracy: 0.878
Validation Accuracy: 0.918

Epoch 2 / 5:
Batch 1012/1012 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.03, accuracy=1.05

Evaluating...
Batch 113/113 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.05, accuracy=0.875

Training Loss: 0.030
Validation Loss: 0.047


Training Accuracy: 0.943
Validation Accuracy: 0.918

Epoch 3 / 5:
Batch 1012/1012 |██████████████████████████████████████████████████████████████

In [25]:
from sklearn.metrics import classification_report, accuracy_score

# Load the best model for evaluation
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# Evaluate the model
model.eval()
with torch.no_grad():
    preds = model(val_texts.to(device), val_masks.to(device))
    preds = preds.cpu().numpy()

print("Performance:")
# Model's performance
preds = np.argmax(preds, axis=1)
print('Classification Report:')
print(classification_report(val_labels.cpu(), preds))

print("Accuracy:", accuracy_score(val_labels.cpu(), preds))


Performance:
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       699
           1       0.86      0.75      0.80       201

    accuracy                           0.92       900
   macro avg       0.90      0.86      0.88       900
weighted avg       0.92      0.92      0.92       900

Accuracy: 0.9177777777777778


In [34]:
test_df = pd.read_csv("/kaggle/input/checkworthiness/checkworthiness_leaderboard.csv")

tokenized_texts = tokenizer(test_df['Text'].tolist(), padding=True, truncation=True, return_tensors="pt")

test_texts, test_masks = tokenized_texts['input_ids'], tokenized_texts['attention_mask']
test_texts = test_texts.to(device)
test_masks = test_masks.to(device)

ids = df['ID'].tolist()

# Load the best model for evaluation
if best_model_state is not None:
    model.load_state_dict(best_model_state)

with torch.no_grad():
    preds = model(test_texts, test_masks)
    preds = preds.detach().cpu().numpy()
preds = np.argmax(preds, axis=1)

print(preds)
# Convert predictions to labels
labels = []
for i in range(len(preds)):
    if preds[i] > 0.5:
        labels.append("Yes")
    else:
        labels.append("No")
# labels = ['Yes' if pred > 0.5 else 'No' for pred in preds]

print(len(labels))

print(labels)
print(len(ids))

# Ensure that both arrays have the same length
min_len = min(len(ids), len(labels))
ids = ids[:min_len]
labels = labels[:min_len]




# Create a DataFrame with IDs and predictions
result_df = pd.DataFrame({'ID': ids, 'Category': labels})

# Merge with the test_df on ID column
# final_result_df = pd.merge(test_df, result_df, on='ID')

# Print the final DataFrame
print(result_df)


[0 0 0 ... 0 0 0]
1467
['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', '

In [36]:
result_df.to_csv("/kaggle/working/submission.csv",index=False)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load the dataset
df = pd.read_csv("/kaggle/input/checkworthiness/checkworthiness_labeled.csv")
df_cleaned = df.dropna()  # Remove rows with any missing values
df = df_cleaned

# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input texts
tokenized_texts = tokenizer(df['Text'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Convert labels to tensor
label_map = {'Yes': 1, 'No': 0}  # Define mapping for categories to integers
labels = torch.tensor(df['Category'].map(label_map).tolist())

class BertCNNClassifier(nn.Module):
    def __init__(self, num_classes, dropout_prob=0.1, bert_model_name='bert-base-uncased', cnn_out_channels=128, cnn_kernel_sizes=(2, 3, 4), dropout_p=0.1):
        super(BertCNNClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        # CNN layers with batch normalization and dropout
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=self.bert.config.hidden_size, out_channels=cnn_out_channels, kernel_size=k),
                nn.BatchNorm1d(cnn_out_channels),
                nn.ReLU(),
                nn.Dropout(dropout_p)
            )
            for k in cnn_kernel_sizes
        ])
        
        # Fully connected layer with dropout
        self.fc = nn.Sequential(
            nn.Linear(len(cnn_kernel_sizes) * cnn_out_channels, num_classes),
            nn.Dropout(dropout_prob)
        )
        
    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]  # Output of BERT last layer
        
        # Permute to have the shape (batch_size, hidden_size, sequence_length)
        bert_output = bert_output.permute(0, 2, 1)
        
        # Apply CNNs and pool over time
        conv_outputs = [conv(bert_output) for conv in self.convs]
        pooled_outputs = [torch.max(conv_output, dim=2)[0] for conv_output in conv_outputs]
        
        # Concatenate pooled outputs
        cat_output = torch.cat(pooled_outputs, dim=1)
        
        # Fully connected layer
        output = self.fc(cat_output)
        
        return output

# Initialize the model and move it to GPU if available
num_classes = 2  # Assuming binary classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertCNNClassifier(num_classes=num_classes).to(device)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(tokenized_texts['input_ids'], labels, test_size=0.1, random_state=42)
train_masks, val_masks, _, _ = train_test_split(tokenized_texts['attention_mask'], tokenized_texts['attention_mask'], test_size=0.1, random_state=42)

# Convert to PyTorch tensors and move to GPU
train_texts = train_texts.to(device)
val_texts = val_texts.to(device)
train_masks = train_masks.to(device)
val_masks = val_masks.to(device)
train_labels = train_labels.to(device)
val_labels = val_labels.to(device)

# DataLoaders
batch_size = 8
train_dataset = TensorDataset(train_texts, train_masks, train_labels)
val_dataset = TensorDataset(val_texts, val_masks, val_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Loss function and optimizer with weight decay
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-5)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    tqdm_train_loader = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for batch in tqdm_train_loader:
        input_ids, attention_mask, labels = [item.to(device) for item in batch]  # Move data to GPU
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        tqdm_train_loader.set_postfix({'loss': running_loss / len(tqdm_train_loader)})
    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

    # Validation loop
    model.eval()
    val_running_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [item.to(device) for item in batch]  # Move data to GPU
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            val_running_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)
        val_accuracy = val_correct / val_total
        val_epoch_loss = val_running_loss / len(val_loader)
        print(f"Validation Loss: {val_epoch_loss}, Accuracy: {val_accuracy}")

# Testing
# Assuming you have a separate test set, follow a similar procedure as the validation loop
# Evaluate the model on the test set using accuracy or other appropriate metrics


In [None]:
test_df = pd.read_csv("/kaggle/input/checkworthiness/checkworthiness_leaderboard.csv")

tokenized_texts = tokenizer(test_df['Text'].tolist(), padding=True, truncation=True, return_tensors="pt")

test_texts, test_masks = tokenized_texts['input_ids'], tokenized_texts['attention_mask']
test_texts = test_texts.to(device)
test_masks = test_mask.to(device)



# DataLoaders
batch_size = 8
train_dataset = TensorDataset(train_texts, train_masks, train_labels)
val_dataset = TensorDataset(val_texts, val_masks, val_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
