In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd

# Load the combined essay dataset; later cells read its 'text' and 'model' columns.
df = pd.read_csv('concatenated.csv')

In [2]:
df

Unnamed: 0,id,prompt_id,text,generated,model,kaggle_repo
0,d429f032,0,Advantages of Limiting Car Usage \n\nLimiting ...,1,gpt-3.5-turbo,1
1,1ce279be,0,Advantages of Limiting Car Usage\n\nLimiting c...,1,gpt-3.5-turbo,1
2,c9595213,0,Limiting car usage has numerous advantages tha...,1,gpt-3.5-turbo,1
3,f2266d87,0,The passages provided discuss the advantages o...,1,gpt-3.5-turbo,1
4,eeace4bd,0,Title: The Advantages of Limiting Car Usage\n\...,1,gpt-3.5-turbo,1
...,...,...,...,...,...,...
54686,df8bf6e6-eca4-417c-bce2-b3b90d782d93,-1,I believe using cellphones in class for educat...,0,human,9
54687,502c11ac-7d0a-4b11-8572-5f7317364bc7,-1,"Working alone, students do not have to argue w...",0,human,9
54688,993f33f3-0cfa-4b5a-b382-bdf7f82e2666,-1,"""A problem is a chance for you to do your best...",0,human,9
54689,0f9f0196-4828-4122-b0cb-aea2373066fb,-1,Many people disagree with Albert Schweitzer's ...,0,human,9


In [3]:
# Spot-check the source model recorded for the row labelled 6002
df.loc[6002, "model"]

'gpt-3.5-turbo'

In [4]:
# Distinct model names in order of first appearance; a name's position is later used as its class id
unique_models = df["model"].unique().tolist()

In [5]:
unique_models

['gpt-3.5-turbo',
 'gpt-4',
 'unknown',
 'falcon-180b',
 'llama-70b',
 'llama-falcon',
 'palm-2',
 'claude',
 'human',
 'mistral']

In [6]:
def create_label(category):
    """Translate a model name into its integer class id.

    The id is the position of ``category`` inside ``unique_models``;
    ``list.index`` raises ``ValueError`` for a name that is not in that list.
    """
    class_id = unique_models.index(category)
    return class_id

# One integer label per row, derived from the row's 'model' value
df['label'] = df['model'].map(create_label)

In [7]:
df

Unnamed: 0,id,prompt_id,text,generated,model,kaggle_repo,label
0,d429f032,0,Advantages of Limiting Car Usage \n\nLimiting ...,1,gpt-3.5-turbo,1,0
1,1ce279be,0,Advantages of Limiting Car Usage\n\nLimiting c...,1,gpt-3.5-turbo,1,0
2,c9595213,0,Limiting car usage has numerous advantages tha...,1,gpt-3.5-turbo,1,0
3,f2266d87,0,The passages provided discuss the advantages o...,1,gpt-3.5-turbo,1,0
4,eeace4bd,0,Title: The Advantages of Limiting Car Usage\n\...,1,gpt-3.5-turbo,1,0
...,...,...,...,...,...,...,...
54686,df8bf6e6-eca4-417c-bce2-b3b90d782d93,-1,I believe using cellphones in class for educat...,0,human,9,8
54687,502c11ac-7d0a-4b11-8572-5f7317364bc7,-1,"Working alone, students do not have to argue w...",0,human,9,8
54688,993f33f3-0cfa-4b5a-b382-bdf7f82e2666,-1,"""A problem is a chance for you to do your best...",0,human,9,8
54689,0f9f0196-4828-4122-b0cb-aea2373066fb,-1,Many people disagree with Albert Schweitzer's ...,0,human,9,8


In [8]:
# Same row as before: its model name...
df.loc[6002, "model"]

'gpt-3.5-turbo'

In [9]:
# ...and the integer label that create_label assigned to it
df.loc[6002, "label"]

0

In [10]:
df["label"].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# Number of distinct values in 'model' (missing values counted too); used as the classifier's output size
number_of_labels = df["model"].nunique(dropna=False)

In [12]:
number_of_labels

10

In [13]:
import torch
# Ask PyTorch to release cached GPU memory it is not using
# (presumably to reclaim memory left over from earlier runs in this kernel — confirm).
torch.cuda.empty_cache()

In [14]:
# Discard every row that has a missing value in any column
df = df.dropna(axis="index", how="any")

In [15]:
# Random 50/50 split: frac=0.5 samples half of the rows for training (seeded so the split is repeatable)
train_df = df.sample(frac=0.5, random_state=25)
# Every row that was not sampled into the training set becomes the test set
test_df = df.drop(train_df.index)
# Dropping by index label only partitions the frame when the labels are unique; fail loudly otherwise
assert len(train_df) + len(test_df) == len(df), "train/test split does not partition df"
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df)
})

In [16]:
# Load the tokenizer for the checkpoint named by `model_name`;
# the same name is reused below when the classification model is loaded.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
# Look at the raw text of the row labelled 0
df.loc[0, 'text']

'Advantages of Limiting Car Usage \n\nLimiting car usage can have a number of advantages for individuals and the community as a whole. The passage set provides evidence of this trend happening in different parts of the world, including Germany, France, and Colombia. In these places, efforts have been made to reduce the dependence on cars and promote alternative modes of transportation. Limiting car usage can lead to a cleaner environment, healthier individuals, and better community planning.\n\nOne of the main advantages of limiting car usage is the reduction of greenhouse gas emissions and air pollution. According to the passages, passenger cars are responsible for a significant percentage of greenhouse gas emissions in Europe and some car-intensive areas in the United States. By reducing the number of cars on the road, we can significantly decrease the amount of pollution being released into the environment. This is particularly important in densely populated areas like cities, where

In [18]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Calls the notebook-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True`` and returns whatever the tokenizer returns.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Apply the tokenizer to both splits, handing examples to the function in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/27346 [00:00<?, ? examples/s]

Map:   0%|          | 0/27345 [00:00<?, ? examples/s]

In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
def compute_metrics(p):
    """Score an evaluation pass by plain accuracy.

    ``p`` must expose ``predictions`` (per-class scores, classes on the last
    axis) and ``label_ids`` (the true class ids). Returns
    ``{"accuracy": value}``, where ``value`` is the fraction of rows whose
    highest-scoring class equals the true label.
    """
    predicted_classes = p.predictions.argmax(axis=-1)
    true_classes = p.label_ids

    # .item() turns the numpy count into a plain Python int before dividing
    hits = (predicted_classes == true_classes).sum().item()
    return {"accuracy": hits / len(true_classes)}



In [20]:
# Load pre-trained BERT with a classification head sized to the number of model labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=number_of_labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    # Mixed precision is requested only when a CUDA device is present, so the
    # notebook still starts on a machine without one (`torch` is imported in
    # an earlier cell).
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics  # reports accuracy at each evaluation
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.107,0.175911,0.953886


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3419, training_loss=0.2311067327347092, metrics={'train_runtime': 1331.8662, 'train_samples_per_second': 20.532, 'train_steps_per_second': 2.567, 'total_flos': 7195551730397184.0, 'train_loss': 0.2311067327347092, 'epoch': 1.0})

In [22]:
# Evaluate with the Trainer configured above (its eval_dataset is the tokenized test split)
# and print the resulting metrics dict.
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.17591069638729095, 'eval_accuracy': 0.9538855366611813, 'eval_runtime': 292.3087, 'eval_samples_per_second': 93.548, 'eval_steps_per_second': 11.697, 'epoch': 1.0}


In [23]:
# Save the fine-tuned model and its tokenizer into the same directory,
# so both can be reloaded later from `model_path` alone.
model_path = "./trained_model_DAIGT"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./trained_model_DAIGT/tokenizer_config.json',
 './trained_model_DAIGT/special_tokens_map.json',
 './trained_model_DAIGT/vocab.txt',
 './trained_model_DAIGT/added_tokens.json',
 './trained_model_DAIGT/tokenizer.json')

In [24]:
from transformers import pipeline

# Reload the saved sequence-classification model and tokenizer as a text-classification pipeline.
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)


In [25]:
# Keep only the first 1000 held-out rows (by position) for the per-row pipeline check below
test_df = test_df.iloc[:1000]

In [26]:
from tqdm import tqdm
count = 0

# Score the reloaded pipeline on the held-out rows, one text at a time
for row in tqdm(test_df.itertuples(), total=len(test_df)):
    # Character-level cut; presumably keeps inputs within the model's token limit — TODO confirm
    message = row.text[0:2300]
    label = row.label
    result = classifier(message)
    # Pipeline labels look like 'LABEL_<id>'. Parse everything after the last
    # underscore: reading only the final character would misparse ids >= 10
    # (e.g. 'LABEL_10' would become 0).
    predicted = int(result[0]['label'].rsplit('_', 1)[-1])
    if predicted == int(label):
        count += 1

# Fraction of rows whose predicted class id matches the stored label
print(count / len(test_df))

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:07<00:00,  5.33it/s]

0.939





In [27]:
# Try the pipeline on an ad-hoc sentence. The numeric suffix of the returned
# label is a class id, i.e. an index into `unique_models`.
result = classifier("Hii how are you I've just wrote to you to get my id")
print(result)

[{'label': 'LABEL_8', 'score': 0.8691005706787109}]


In [28]:
df

Unnamed: 0,id,prompt_id,text,generated,model,kaggle_repo,label
0,d429f032,0,Advantages of Limiting Car Usage \n\nLimiting ...,1,gpt-3.5-turbo,1,0
1,1ce279be,0,Advantages of Limiting Car Usage\n\nLimiting c...,1,gpt-3.5-turbo,1,0
2,c9595213,0,Limiting car usage has numerous advantages tha...,1,gpt-3.5-turbo,1,0
3,f2266d87,0,The passages provided discuss the advantages o...,1,gpt-3.5-turbo,1,0
4,eeace4bd,0,Title: The Advantages of Limiting Car Usage\n\...,1,gpt-3.5-turbo,1,0
...,...,...,...,...,...,...,...
54686,df8bf6e6-eca4-417c-bce2-b3b90d782d93,-1,I believe using cellphones in class for educat...,0,human,9,8
54687,502c11ac-7d0a-4b11-8572-5f7317364bc7,-1,"Working alone, students do not have to argue w...",0,human,9,8
54688,993f33f3-0cfa-4b5a-b382-bdf7f82e2666,-1,"""A problem is a chance for you to do your best...",0,human,9,8
54689,0f9f0196-4828-4122-b0cb-aea2373066fb,-1,Many people disagree with Albert Schweitzer's ...,0,human,9,8
