<!-- # This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session -->

# ***AI TEXT DETECTION*** 

## Let us load and visualize the dataset 

In [None]:
import pandas as pd

dataset = pd.read_csv("../input/llm-detect-ai-generated-text/train_essays.csv")

In [None]:
dataset.head() #here 0 ---> human and 1--> generated 

In [None]:
#check the shape of the dataset for the number of entries
dataset.shape 

In [None]:
#We have 1378 entries

#The 'id' and 'prompt_id' columns are not needed --> drop both of them in place
dataset.drop(columns=['id', 'prompt_id'], inplace=True)

In [None]:
# With the number of entries known, look at how the two classes are distributed
import matplotlib.pyplot as plt

# count the rows per class (smallest class first) and draw the counts as bars
class_counts = dataset['generated'].value_counts(ascending=True)
class_counts.plot.bar()
plt.title("Data distribution")
plt.show()

Here we can see that there is far more human-written text than AI-generated text, 
which means that the dataset is imbalanced, 
so we need the external data that we downloaded earlier.

In [None]:
# Let's load the external dataset that we have downloaded
external_dataset = pd.read_csv("../input/ai-text-detection-dataset/external_dataset.csv")
external_dataset.head()

In [None]:
#Let's check the number of entries in the external dataset 
external_dataset.shape

In [None]:
#Let's combine both the datasets 
complete_df = pd.concat([dataset,external_dataset])

In [None]:
complete_df

In [None]:
#Keep only the first occurrence of every text, renumber the rows from 0,
#and show the resulting number of entries
complete_df = complete_df.drop_duplicates(subset=['text']).reset_index(drop=True)
complete_df.shape

In [None]:
#Now, let us see the distribution of the new data obtained 
complete_df['generated'].value_counts(ascending=True).plot.bar()
plt.title("Data distribution")
plt.show()

Even now the data is somewhat imbalanced, but this is ok: 
we have twice as much human-written text as AI-generated text.

In [None]:
#Add a column holding the number of whitespace-separated words of each entry
words_of_each_entry = complete_df['text'].str.split()
complete_df['Words per para'] = words_of_each_entry.map(len)
complete_df.head()

In [None]:
#Let us visualize this newly generated column using a box-plot 
complete_df.boxplot(figsize=(7,7), column='Words per para', by='generated',showfliers=False, color='blue')

From the above boxplot, it can be clearly seen that the median 'Words per para' values of the two categories are very close. Also, most of the data lies between 300 and 500 words.

In [None]:
#Let us visualize this newly generated column using another box-plot 
complete_df.boxplot(figsize=(10,15), column='Words per para',showfliers=True, color='blue')

Let us remove the outliers, i.e., the entries with more than 800 words per para.

In [None]:
complete_df = complete_df[complete_df['Words per para'] <= 800]

In [None]:
#Now, let us see the distribution of the new data obtained 
complete_df['generated'].value_counts(ascending=True).plot.bar()
plt.title("Data distribution")
plt.show()

In [None]:
#The 'Words per para' column has served its purpose, so continue without it
complete_df = complete_df.drop(columns=['Words per para'])

In [None]:
# Rename the column 'generated' to 'label'
complete_df = complete_df.rename(columns={'generated': 'label'})

## Preprocessing of the data(to feed to the model) 

### Converting the above datasets into a hugging face dataset object 

In [None]:
#import the required libraries
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

#split the data into training and validation dataset
train_dataset, valid_dataset = train_test_split(complete_df, test_size = 0.30, random_state = 10) 

In [None]:
train_dataset.shape, valid_dataset.shape 

In [None]:
#Convert the train_dataset and the valid_dataset into Dataset objects.
#After the outlier filtering and the random split both frames carry a
#non-default pandas index; preserve_index=False stops from_pandas from storing
#that index as an extra '__index_level_0__' column next to the real columns.
train_data = Dataset.from_pandas(train_dataset, preserve_index=False)
valid_data = Dataset.from_pandas(valid_dataset, preserve_index=False)

In [None]:
#Create a DatasetDict
comp_data= DatasetDict({"train": train_data, "valid": valid_data})

print(comp_data)

### Conversion of the text to tokens 

In [None]:
# We will be using a sub-word tokenizer to tokenize the text that we have using the tokenizer for the roberta-base
from transformers import AutoTokenizer
# model_checkpoint = "roberta-base"
model_checkpoint = '../input/roberta-pretrained-model-with-classification-head/roberta_base_without_classification_head/content/roberta_base'

#load the tokenizer from the model checkpoint
text_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
#Let us check the maximum content size of this tokenizer(the maximum no. of tokens per entry)
text_tokenizer.model_max_length

In [None]:
#Let us check an entry using this
#(.iloc picks the first row by position: complete_df was filtered after its index
#was last reset, so a row labelled 0 is not guaranteed to exist)
complete_df['text'].iloc[0] #the number of words in this entry were 584

In [None]:
#tokenize the first remaining entry; .iloc selects it by position, which still
#works if the row labelled 0 was removed by the outlier filter
encoded_text = text_tokenizer(complete_df['text'].iloc[0])

In [None]:
len(encoded_text['input_ids']) #the number of tokens has exceeded the number of maximum tokens --> this means we will have to truncate those tokens

In [None]:
#define a function to apply tokenization on all the entries
def tokenize_text(input_entry):
    """Run the tokenizer over the 'text' field of ``input_entry``.

    Truncation and padding are both switched on, so over-long entries are
    cut down and shorter ones are padded.
    """
    texts = input_entry['text']
    return text_tokenizer(texts, truncation=True, padding=True)

In [None]:
#Apply to all the dataset as a single batch since the batch_size is given as none
comp_tokenized_data = comp_data.map(tokenize_text , batched = True , batch_size = None)

In [None]:
comp_tokenized_data

In [None]:
comp_tokenized_data['train'].column_names

In [None]:
text_tokenizer.model_input_names

## Loading and training the model

### Loading a pre-trained model

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

#AutoModelForSequenceClassification is used because the task is classification:
#it automatically puts a classification head on top of the pretrained model,
#and that head starts out with random weights

#number of labels for the classification head (here we have 2 labels)
num_labels = 2

#prefer the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

#load the model from the checkpoint, then move it to the chosen device
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model = model.to(device)

In [None]:
#define the metrics function to compute the metrics
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return the accuracy and the weighted F1 score of a set of predictions.

    The predicted class of each row is taken as the arg-max over the last
    axis of ``pred.predictions`` and compared against ``pred.label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    true_classes = pred.label_ids
    return {
        "accuracy": accuracy_score(true_classes, predicted_classes),
        "f1": f1_score(true_classes, predicted_classes, average="weighted"),
    }

### Defining the training arguments and the training object

In [None]:
from transformers import TrainingArguments

#batch size used for both training and evaluation
batch_size = 16

#define the logging steps so that we can define when to save the checkpoints
# logging_steps = (len(comp_tokenized_data['train'])) // batch_size

#name used as the output directory of the run
model_name = "LLM_AI_TEXT_DETECTOR_ROBERTA"

#collect the training arguments
training_arguments = TrainingArguments(
    output_dir=model_name,
    #optimisation settings
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #evaluation, checkpointing and logging settings
    evaluation_strategy="epoch",
    save_strategy='no',
    logging_steps=4000,
    log_level='error',
    report_to="none",
    disable_tqdm=False,
    push_to_hub=False,
)


In [None]:
from transformers import Trainer

trainer = Trainer(model= model, args = training_arguments,
                  compute_metrics=compute_metrics,
                  train_dataset= comp_tokenized_data['train'],
                  eval_dataset = comp_tokenized_data['valid'],
                  tokenizer=text_tokenizer)

In [None]:
trainer.train()

### Doing some inference 

In [None]:
pred_output_valid = trainer.predict(comp_tokenized_data['valid'])

In [None]:
import numpy as np

#predicted class of each validation row = the column holding the largest score
y_pred_valid = pred_output_valid.predictions.argmax(axis=1)
y_pred_valid

In [None]:
#Let us create a confusion matrix for the inferences

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot the confusion matrix of ``y_true`` against ``y_preds``.

    The matrix is computed with ``normalize="true"`` and drawn on a 6x6 inch
    figure; ``labels`` are passed on as the display labels of the classes.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    _figure, axes = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm, display_labels=labels
    ).plot(cmap="Blues", values_format=".2f", ax=axes, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

In [None]:
y_valid = comp_tokenized_data['valid']['label']

In [None]:
#Label 0 is human-written and label 1 is AI-generated, and confusion_matrix
#orders the classes ascending, so the display labels must follow that order
plot_confusion_matrix(y_pred_valid, y_valid, ['Human', 'AI'])

## Load the test dataset and make the predictions 

In [None]:
test_dataset = pd.read_csv("../input/llm-detect-ai-generated-text/test_essays.csv")

In [None]:
test_dataset

In [None]:
#remove the prompt_id column from the dataset 
del test_dataset['prompt_id']

In [None]:
#Build the Dataset object for the test essays from a frame without the 'id' column
#(drop returns a new frame, so test_dataset itself keeps its 'id' column)
test_dataset_tokenize = Dataset.from_pandas(test_dataset.drop(columns=['id']))

In [None]:
test_dataset_tokenize 

In [None]:
#Apply to all the dataset as a single batch since the batch_size is given as none
tokenized_test_data = test_dataset_tokenize.map(tokenize_text , batched = True , batch_size = None)

In [None]:
tokenized_test_data

In [None]:
pred_output_test = trainer.predict(tokenized_test_data)

In [None]:
pred_output_test.predictions

In [None]:
import torch
import torch.nn.functional as F

#wrap the raw test predictions in a tensor and apply softmax across each row
test_scores = torch.from_numpy(pred_output_test.predictions)
probabilities = test_scores.softmax(dim=1)

In [None]:
#convert back to numpy 
probabilities = probabilities.numpy()

In [None]:
ai_generated_probabilities = probabilities[:, 1].tolist()
ai_generated_probabilities

In [None]:
#store the probabilities in a new 'generated' column, then discard the 'text' column
test_dataset['generated'] = ai_generated_probabilities
test_dataset.drop(columns=['text'], inplace=True)

In [None]:
test_dataset

In [None]:
#create the submission file
test_dataset.to_csv('submission.csv', index=False)