### Cleaning the merged dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection/data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,texts
0,0,1,"Ah, so I have been told ;)"
1,1,0,just honest
2,2,0,i'm trying to get into med school
3,3,1,The rhythm of your heart is music to my ears.
4,4,0,"hi, not bad, how about yours?"


In [4]:
# removing the 'Unnamed: 0' column
df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
# checking for missing values and removing them if any
df.isnull().sum()

label    0
texts    0
dtype: int64

In [6]:
len(df)

5093

In [7]:
# removing duplicate rows
df.drop_duplicates(inplace=True)

In [8]:
# text normalization
df['texts'] = df['texts'].str.lower().str.strip()

In [9]:
df.head()

Unnamed: 0,label,texts
0,1,"ah, so i have been told ;)"
1,0,just honest
2,0,i'm trying to get into med school
3,1,the rhythm of your heart is music to my ears.
4,0,"hi, not bad, how about yours?"


In [10]:
len(df)

3247

In [11]:
pip install transformers torch



In [12]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [13]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and test sets (80% training, 20% test)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_data.shape, test_data.shape

((2597, 2), (650, 2))

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(list(train_data['texts'].values), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(list(test_data['texts'].values), truncation=True, padding=True, max_length=256)

In [15]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Convert labels to integers
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['label'])
test_labels = label_encoder.transform(test_data['label'])

train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [16]:
!pip install accelerate -U



In [17]:
from sklearn.metrics import accuracy_score
import numpy as np


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

In [None]:
# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection',  # Ensure this is mentioned only once
    output_dir='/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection',
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection')
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4233,0.418497,0.832308
2,0.3331,0.503615,0.821538
3,0.2173,0.593997,0.832308
4,0.1391,0.819027,0.843077
5,0.0762,0.812765,0.846154
6,0.0333,1.004047,0.849231
7,0.0315,1.046281,0.838462
8,0.0212,1.131542,0.847692
9,0.0124,1.130602,0.850769
10,0.0363,1.145568,0.850769


('/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/flirting detection/text_detection/added_tokens.json')