# FlanT5 Seq2Seq for classification (Failed :< )

## Dependencies

In [None]:
from tqdm.notebook import tqdm
from IPython import display

import numpy as np
import pandas as pd
import math

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
pip install transformers datasets accelerate peft

In [None]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration, DataCollatorForSeq2Seq, AdamW

from peft import get_peft_model, LoraConfig, TaskType

## Data

In [None]:
lora_config = LoraConfig(
    task_type= TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence language modeling
    inference_mode = False,  # We're in training mode
    r= 4,
    target_modules = ['q','v'],
    bias = 'none',
    lora_alpha= 4,
    lora_dropout= 0.1
)

In [None]:
model_name = 'google/flan-t5-small'
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = get_peft_model(model, lora_config)

In [None]:
# Load the labelled reviews and drop every row that has a missing field.
train = pd.read_csv('train.csv').dropna()

# Hold out the last 10% of the rows (file order, no shuffling) for validation.
split_idx = int(len(train) * (9 / 10))

# .copy() makes each split an independent frame, so the column assignments
# performed on them afterwards write to real data instead of to a slice of
# `train` (which is what triggers pandas' SettingWithCopyWarning).
train_data = train.iloc[:split_idx].copy()
val_data = train.iloc[split_idx:].copy()

In [None]:
# Ratings are spelled out as words so the seq2seq model can emit them as text.
id_to_text = dict(enumerate(
    ("one", "two", "three", "four", "five",
     "six", "seven", "eight", "nine", "ten"),
    start=1,
))

# Reverse lookup, derived from the forward table so the two cannot drift apart.
text_to_id = {word: rating for rating, word in id_to_text.items()}


def id2label(id):
  """Return the English word for an integer rating; KeyError if unknown."""
  return id_to_text[id]


def label2id(label):
  """Return the integer rating for a rating word; KeyError if unknown."""
  return text_to_id[label]


# Prompt = 'How you rate this movie review?' + Review + label_to_text[label]

In [None]:
def review_prompt(text):
  """Prefix a raw review with the instruction prompt fed to the model.

  Args:
    text: The review text.

  Returns:
    The instruction followed by the review text.
  """
  # The instruction is read by the language model, so it is kept grammatical
  # (the earlier wording was "How do your rate ... between one to ten").
  prompt = 'How do you rate this movie review, as an integer between one and ten?  Review: '
  return prompt + text

In [None]:
train_data['Rating'] = train_data['Rating'].apply(id2label)
val_data['Rating'] = val_data['Rating'].apply(id2label)

In [None]:
train_data['Review'] = train_data['Review'].apply(review_prompt)
val_data['Review'] = val_data['Review'].apply(review_prompt)

In [None]:
train_data.head(3)

In [None]:
#Create a class for review data

class ReviewDataset(Dataset):
  """Torch dataset yielding a tokenized review together with its rating word.

  `data` is indexed with `.iloc` and must provide 'Review' and 'Rating'
  columns; both values are passed to `tokenizer.encode_plus` as text.
  """

  def __init__(self, data, tokenizer, max_len):
    self.data, self.tokenizer, self.max_len = data, tokenizer, max_len
    self.id_to_text = id_to_text

  def __len__(self):
    return len(self.data)

  def _encode(self, text, max_length):
    # Pad or truncate to exactly `max_length` tokens, returned as 'pt' tensors.
    return self.tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

  def __getitem__(self, idx):
    record = self.data.iloc[idx]
    review_enc = self._encode(record['Review'], self.max_len)
    # The target is a single rating word, so only three tokens are allotted.
    label_enc = self._encode(record['Rating'], 3)

    sample = dict(**review_enc)
    sample['labels'] = label_enc.input_ids
    return sample

In [None]:
max_len = 64
batch_size = 64

train_dataset = ReviewDataset(train_data, tokenizer, max_len)
val_dataset = ReviewDataset(val_data, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## train

In [None]:
# Hyper Parameters

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 64
learning_rate = 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr= learning_rate)
num_epochs = 8

In [None]:
Best_model = None
Loss = 0
Accuracy = 0
R2 = 0

In [None]:
from sklearn.metrics import r2_score, accuracy_score

# Map text labels back to their integer representations
text_to_label = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

In [None]:
def valid_loop():
  """Evaluate the notebook-level `model` on `val_loader`.

  Returns:
    A dict with
      'Loss': mean of the per-batch losses,
      'Accuracy': exact-match rate between decoded prediction and label text,
      'R2': R² on the integer ratings, where text that is not a known rating
        word is mapped to 0.
  """
  # Remember the current mode so that evaluating in the middle of training
  # does not silently leave the model in eval mode (dropout disabled).
  was_training = model.training
  model.eval()

  # Run on whatever device the model currently lives on.
  model_device = next(model.parameters()).device

  total_val_loss = 0
  all_preds = []
  all_labels = []

  try:
    with torch.no_grad():
      for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].squeeze(1).to(model_device)
        attention_mask = batch['attention_mask'].squeeze(1).to(model_device)
        label_ids = batch['labels'].squeeze(1)

        # Padded target positions must not count towards the loss; -100 is
        # the index ignored by the model's cross-entropy loss.
        loss_labels = label_ids.masked_fill(
            label_ids == tokenizer.pad_token_id, -100).to(model_device)

        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=loss_labels)
        total_val_loss += outputs.loss.item()

        # Generate predictions (decoding the output)
        generated_tokens = model.generate(input_ids=input_ids,
                                          attention_mask=attention_mask)

        # Decode the generated tokens and the (unmasked) reference labels
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        # Strip so that stray whitespace is not counted as a mismatch.
        all_preds.extend(pred.strip() for pred in decoded_preds)
        all_labels.extend(label.strip() for label in decoded_labels)
  finally:
    model.train(was_training)

  avg_val_loss = total_val_loss / len(val_loader)
  accuracy = accuracy_score(all_labels, all_preds)

  int_preds = [text_to_label.get(pred, 0) for pred in all_preds]
  int_labels = [text_to_label.get(label, 0) for label in all_labels]
  r2 = r2_score(int_labels, int_preds)

  return {
      'Loss': avg_val_loss,
      'Accuracy': accuracy,
      'R2': r2
  }

In [None]:
import copy

from tqdm import tqdm

# Train on whatever device the model currently lives on.
model_device = next(model.parameters()).device

for epoch in range(2):
  # Re-enter training mode at the start of every epoch: valid_loop() calls
  # model.eval(), which would otherwise disable dropout for later epochs.
  model.train()
  total_loss = 0

  for batch_num, batch in enumerate(tqdm(train_loader), start=1):
    input_ids = batch['input_ids'].squeeze(1).to(model_device)
    attention_mask = batch['attention_mask'].squeeze(1).to(model_device)
    label_ids = batch['labels'].squeeze(1)

    # Padded target positions must not count towards the loss; -100 is the
    # index ignored by the model's cross-entropy loss.
    label_ids = label_ids.masked_fill(
        label_ids == tokenizer.pad_token_id, -100).to(model_device)

    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=label_ids)
    loss = outputs.loss

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    total_loss += loss.item()
    if batch_num % 100 == 0:
      print(loss.item())

  avg_loss = total_loss / len(train_loader)
  print('avg_loss is: ', avg_loss)

  valid_results = valid_loop()
  valid_loss = valid_results['Loss']
  valid_accuracy = valid_results['Accuracy']
  valid_r2 = valid_results['R2']

  # R2 can be negative, so the first evaluated epoch always becomes the
  # baseline instead of being compared against the initial placeholder.
  if Best_model is None or valid_r2 > R2:
    # Snapshot the weights; a plain `Best_model = model` would only alias the
    # object that keeps being trained.
    Best_model = copy.deepcopy(model)
    Loss = valid_loss
    R2 = valid_r2
    Accuracy = valid_accuracy
    print('R2: ', R2)
    print('Accuracy: ', Accuracy)
    print('Loss: ', Loss)

 44%|████▍     | 69/156 [10:51<13:41,  9.45s/it]


KeyboardInterrupt: 

In [None]:
print(R2)

In [None]:
 torch.save(model.state_dict(), 'Flan_review.pt')

In [None]:
valid_r2

-0.2688368096805218

# English grading

## classics (worked :)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
path = save_path = '/content/drive/My Drive/nlp datasets/' + 'english grader.csv'
data = pd.read_csv(path)
data.head(3)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5


In [None]:
X = data['full_text']
y = data.drop(['text_id', 'full_text'], axis=1)

In [None]:
max_len = max(len(text) for text in X)
max_len

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import accuracy_score


# Step 1: Split the raw essays first, so the held-out fold stays unseen by
# every fitted component (same seed and size as before, hence the same rows).
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Fit TF-IDF on the training essays only. Fitting it on the whole
# corpus would leak test-set vocabulary and document frequencies into the
# features and inflate the reported scores.
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Step 3: Initialize Ridge regression model
ridge = Ridge()

# Step 4: Train multi-output regressor (one Ridge per grading criterion)
model = MultiOutputRegressor(ridge)
model.fit(X_train, y_train)

# Step 5: Predict on test data
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# Report R² separately for each grading criterion; column k of `y_pred`
# corresponds to the k-th column of `y`.
metrics = list(y)
for column_index, metric in enumerate(metrics):
  r2_metric = r2_score(list(y_test[metric]), list(y_pred[:, column_index]))
  print(f'r2 for {metric} is {r2_metric}')

r2 for cohesion is 0.2931921418355595
r2 for syntax is 0.3184661680473183
r2 for vocabulary is 0.3091955566881247
r2 for phraseology is 0.34664712243375584
r2 for grammar is 0.30031660842011776
r2 for conventions is 0.30194042378922625


In [None]:
X = data['full_text']
y = data.drop(['text_id', 'full_text'], axis=1)

## Bart (Failed :<)

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
path = save_path = '/content/drive/My Drive/nlp datasets/' + 'english grader.csv'
data = pd.read_csv(path)

Mounted at /content/drive


In [None]:
X = data['full_text']
y = data.drop(['text_id', 'full_text'], axis=1)

In [None]:
# Seq2Seq task: the input is the essay text and the target is the sequence of grades

In [None]:
from tqdm.notebook import tqdm
from IPython import display

import numpy as np
import pandas as pd
import math

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

!pip install transformers datasets accelerate peft

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
from transformers import  BartTokenizerFast, BartForConditionalGeneration, AdamW

from peft import get_peft_model, LoraConfig, TaskType

In [None]:
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]



In [None]:
# Column name -> list of grades for that criterion.
scores = {column: list(y[column]) for column in y}
# One list of grade strings per essay, in the column order of `y`.
grade_texts = [[str(value) for value in row] for row in zip(*scores.values())]

In [None]:
grade_texts = [' '.join(grade) for grade in grade_texts]

In [None]:
texts = list(X)

In [None]:
tokenizer(grade_texts[0], max_length= 20, padding='max_length', truncation=True, return_tensors="pt")

{'input_ids': tensor([[  0, 246,   4, 245, 155,   4, 245, 155,   4, 288, 155,   4, 288, 204,
           4, 288, 155,   4, 288,   2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Function to tokenize the input text and target grades
def preprocess_data(texts, grade_texts, tokenizer, max_input_length=256, max_target_length=20):
    """Tokenize the input texts and the target grade strings.

    Both are padded/truncated to a fixed length and requested as "pt" tensors.

    Returns:
        (inputs, targets): the tokenizer outputs for `texts` and `grade_texts`.
    """
    shared_kwargs = dict(padding='max_length', truncation=True, return_tensors="pt")
    inputs = tokenizer(texts, max_length=max_input_length, **shared_kwargs)
    targets = tokenizer(grade_texts, max_length=max_target_length, **shared_kwargs)
    return inputs, targets


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(texts, grade_texts, test_size=0.15, random_state=42)

train_inputs, train_targets = preprocess_data(X_train, y_train, tokenizer)
val_inputs, val_targets = preprocess_data(X_val, y_val, tokenizer)


In [None]:
from transformers import BartForConditionalGeneration
import torch

# Load pre-trained BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [None]:
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    learning_rate = 1e-4,
    output_dir='./results',          # Output directory
    num_train_epochs=2,              # Number of training epochs
    per_device_train_batch_size=16,   # Batch size for training
    per_device_eval_batch_size=16,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
)

# Prepare the datasets (PyTorch Dataset format)
class CustomDataset(torch.utils.data.Dataset):
    """Pairs tokenized inputs with tokenized targets for seq2seq training.

    Args:
        encodings: Mapping of field name -> per-example sequences (tensors or
            nested lists); every field is returned for the requested index.
        labels: Mapping whose 'input_ids' entry holds the target token ids.
        pad_token_id: Optional. When given, target positions equal to this id
            are replaced by -100 in the returned 'labels'. Defaults to None,
            which returns the target ids unchanged.
    """

    def __init__(self, encodings, labels, pad_token_id=None):
        self.encodings = encodings
        self.labels = labels
        self.pad_token_id = pad_token_id

    @staticmethod
    def _copy_as_tensor(value):
        # torch.tensor(existing_tensor) emits a UserWarning; as_tensor accepts
        # tensors and lists alike, and clone().detach() still returns a copy.
        return torch.as_tensor(value).clone().detach()

    def __getitem__(self, idx):
        item = {key: self._copy_as_tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = self._copy_as_tensor(self.labels['input_ids'][idx])  # Target labels for generation
        if self.pad_token_id is not None:
            # Operates on the copy, so the stored targets stay untouched.
            label_ids[label_ids == self.pad_token_id] = -100
        item['labels'] = label_ids
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

# Prepare datasets for training
train_dataset = CustomDataset(train_inputs, train_targets)
val_dataset = CustomDataset(val_inputs, val_targets)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Target labels for generation


Step,Training Loss


Step,Training Loss


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=416, training_loss=0.6010048206035907, metrics={'train_runtime': 13930.3035, 'train_samples_per_second': 0.477, 'train_steps_per_second': 0.03, 'total_flos': 1013381993594880.0, 'train_loss': 0.6010048206035907, 'epoch': 2.0})

In [None]:
from sklearn.metrics import r2_score

# Function to decode the true grades (tokenizer ids) back into float numbers
def decode_grades(tokenizer, labels):
    """Turn tokenized grade targets back into lists of floats.

    Each entry of labels['input_ids'] is decoded (special tokens skipped),
    split on whitespace, and every piece is parsed with float().

    Returns:
        One list of floats per entry of labels['input_ids'].
    """
    return [
        [float(piece) for piece in tokenizer.decode(label_ids, skip_special_tokens=True).split()]
        for label_ids in labels['input_ids']
    ]

# Function to generate predictions and calculate R² score
def calculate_r2_score(model, val_inputs, val_targets, tokenizer, device):
    """Generate grade predictions for every validation input.

    Despite the name, no R² is computed here: the function returns the raw
    material for it.

    Returns:
        (true_grades, predictions). `predictions` has one list of floats per
        validation input. `true_grades` comes from decode_grades() with its
        final entry dropped, so it is one element shorter than `predictions`.
    """
    model.eval()  # Set model to evaluation mode
    predictions = []
    # NOTE(review): the slice discards the last validation sample; callers
    # have to trim `predictions` accordingly before comparing the two lists.
    true_grades = decode_grades(tokenizer, val_targets)[:-1]

    for ids_row, mask_row in zip(val_inputs['input_ids'], val_inputs['attention_mask']):
        # Generate one example at a time: add a batch dimension and move to `device`.
        generated = model.generate(
            ids_row.unsqueeze(0).to(device),
            attention_mask=mask_row.unsqueeze(0).to(device),
            max_length=20,
            num_beams=4,
        )
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

        # Whitespace-separated grades -> list of floats
        predictions.append([float(piece) for piece in decoded.split()])

    return true_grades, predictions


# Call the function to calculate the R² score on the validation set
true, pred = calculate_r2_score(model, val_inputs, val_targets, tokenizer, device)

In [None]:
true_first = [i[5] for i in true]
pred_first = [i[5] for i in pred]

r2_score(true_first, pred_first[:-1])

-1.7936721171331507

## Bert-small (worked better :)

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
path = save_path = '/content/drive/My Drive/nlp datasets/' + 'english grader.csv'
data = pd.read_csv(path)

Mounted at /content/drive


In [None]:
texts = data['full_text']
y = data.drop(['text_id', 'full_text'], axis=1)

In [None]:
# Column name -> list of grades for that criterion.
scores = {column: list(y[column]) for column in y}
# One list of float grades per essay, in the column order of `y`.
grades = [[float(value) for value in row] for row in zip(*scores.values())]

In [None]:
texts = list(texts)

In [None]:
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import numpy as np


# Load BERT-small tokenizer
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-small')


# Split data into training and test sets
texts_train, texts_test, grades_train, grades_test = train_test_split(texts, grades, test_size=0.2, random_state=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]



In [None]:
from torch.utils.data import Dataset, DataLoader

# Custom dataset for text and grades
class TextGradeDataset(Dataset):
    """Dataset of (tokenized text, grade vector) examples.

    `texts` and `grades` are indexed by integer position; `tokenizer` is
    called on one text at a time.
    """

    def __init__(self, texts, grades, tokenizer, max_length=512):
        self.texts, self.grades = texts, grades
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text, grade = self.texts[idx], self.grades[idx]

        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes the batch dimension of the single encoded text.
        sample = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        sample['grades'] = torch.tensor(grade, dtype=torch.float)
        return sample

# Instantiate the dataset
train_dataset = TextGradeDataset(texts_train, grades_train, tokenizer)
test_dataset = TextGradeDataset(texts_test, grades_test, tokenizer)

In [None]:
# Create a DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch.nn as nn
from transformers import BertModel

# Define the custom grading model based on BERT-small
class GradingModel(nn.Module):
    """BERT encoder followed by a linear head that regresses the grades.

    Args:
        model_name: Checkpoint passed to BertModel.from_pretrained.
        num_outputs: Number of values predicted per text (one per criterion).
    """

    def __init__(self, model_name='prajjwal1/bert-small', num_outputs=6):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Take the width from the loaded encoder instead of hard-coding it, so
        # the head stays valid if a different checkpoint is used.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_outputs)

    def forward(self, input_ids, attention_mask):
        """Return the predicted grades, one row per input sequence."""
        # Get outputs from BERT model
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Extract the [CLS] token's embedding (output from the first token in the sequence)
        cls_output = output.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

        # Pass [CLS] token's output through the linear layer
        return self.fc(cls_output)

model = GradingModel()

# Define optimizer and custom loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Custom weighted MSE loss function
# For example, if "syntax" and "phraseology" are harder, we can assign higher weights to their errors
task_weights = torch.tensor([1.0, 1.5, 1.0, 1.5, 1.0, 1.0])  # Higher weights for harder tasks (syntax, phraseology)

def weighted_mse_loss(predictions, targets):
    """Mean of the squared errors, each scaled by its criterion's weight.

    Args:
        predictions: Tensor whose last dimension matches `task_weights`.
        targets: Tensor broadcastable against `predictions`.

    Returns:
        A scalar tensor.
    """
    # `task_weights` is created on the CPU; move it next to the predictions so
    # the product also works when the model runs on a GPU.
    weights = task_weights.to(device=predictions.device, dtype=predictions.dtype)
    return torch.mean(weights * (predictions - targets) ** 2)

In [None]:
from sklearn.metrics import r2_score

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop with batches
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        grades = batch['grades'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask)

        # Calculate loss
        loss = weighted_mse_loss(outputs, grades)
        total_loss += loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

    # Evaluation on test data
    model.eval()
    predictions_list = []
    actual_list = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            grades = batch['grades'].to(device)

            # Forward pass
            predictions = model(input_ids, attention_mask)

            # Move predictions and grades to CPU
            predictions = predictions.cpu().numpy()
            grades = grades.cpu().numpy()

            # Store results for evaluation
            predictions_list.append(predictions)
            actual_list.append(grades)

    # Calculate R² score for each criterion
    predictions_all = np.vstack(predictions_list)
    actual_all = np.vstack(actual_list)
    r2_scores = [r2_score(actual_all[:, i], predictions_all[:, i]) for i in range(6)]
    print(f"R² Scores: {r2_scores}")

# Duplicated

## Sentence-Bert

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
path = save_path = '/content/drive/My Drive/nlp datasets/' + 'quora_duplicated.csv'
data = pd.read_csv(path)

Mounted at /content/drive


In [None]:
q1 = list(data['question1'])[:14000]
q2 = list(data['question2'])[:14000]
labels = list(data['is_duplicate'])[:14000]

In [None]:
labels_0 = [i==0 for i in labels]
sum(labels_0)/len(labels)

0.627

In [None]:
!pip install transformers datasets sentence-transformers torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = 'sentence-transformers/all-MiniLM-L6-v2'  # Choose your preferred sentence-bert model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np


# Split data into training and test sets
q1_train, q1_test, q2_train, q2_test, labels_train, labels_test = train_test_split(q1, q2, labels, test_size=0.2, random_state=42)

In [None]:
from torch.utils.data import Dataset, DataLoader

# Custom dataset for questions and labels
class QuestionsDataset(Dataset):
    """Dataset of question pairs with an integer label per pair.

    The two questions are passed to the tokenizer together, as a pair.
    """

    def __init__(self, q1, q2, labels, tokenizer, max_length=70):
        self.q1, self.q2 = q1, q2
        self.label = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.q1)

    def __getitem__(self, idx):
        first, second, target = self.q1[idx], self.q2[idx], self.label[idx]

        encoded = self.tokenizer(
            first,
            second,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes the batch dimension of the single encoded pair.
        sample = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Instantiate the dataset
train_dataset = QuestionsDataset(q1_train, q2_train, labels_train, tokenizer)
test_dataset = QuestionsDataset(q1_test, q2_test, labels_test, tokenizer)

In [None]:
# Create a DataLoader
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
from sklearn.metrics import accuracy_score
import tqdm
from tqdm import tqdm
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop with batches
epochs = 1

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask, labels=labels)

        # Calculate loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

100%|██████████| 175/175 [17:46<00:00,  6.09s/it]

Epoch 1/1, Loss: 0.5392871872016362





In [None]:
val_loader = test_loader

In [None]:
def eval():
    """Run one pass over `val_loader` and print the mean loss and accuracy.

    Uses the notebook-level `model`, `val_loader`, `device` and `epoch`.
    Nothing is returned; the summary line is printed.
    """
    # Validation loop
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss_sum += outputs.loss.item()

            # Predicted class = position of the largest logit in each row.
            val_preds = torch.max(outputs.logits, dim=1).indices
            n_correct += (val_preds == labels).sum().item()
            n_seen += labels.size(0)

    avg_val_loss = loss_sum / len(val_loader)
    val_accuracy = n_correct / n_seen
    print(f"Val Epoch: {epoch+1}, Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

In [None]:
eval() # for 14000

100%|██████████| 44/44 [01:26<00:00,  1.96s/it]

Val Epoch: 1, Loss: 0.4326, Accuracy: 0.7879





In [None]:
eval() # third

100%|██████████| 44/44 [00:45<00:00,  1.03s/it]

Val Epoch: 1, Loss: 0.4136, Accuracy: 0.8171





# Spooky authors

## classics (worked, 0.68 for rf and svm takes way longer but reached 0.79)

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
path = save_path = '/content/drive/My Drive/nlp datasets/' + 'author identification.csv'
data = pd.read_csv(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
texts = list(data['text'])
labels = list(data['author'])

In [None]:
lens = [len(text.split(' ')) for text in texts]
print('mean: ', sum(lens)/len(lens))
print('max: ', max(lens))
print('min: ', min(lens))

mean:  26.730476530977068
max:  861
min:  2


In [None]:
authors = data['author'].unique()
authors

array(['EAP', 'HPL', 'MWS'], dtype=object)

In [None]:
label_dict = {authors[i]: i for i in range(len(authors))}
label_dict

{'EAP': 0, 'HPL': 1, 'MWS': 2}

In [None]:
labels = [label_dict[i] for i in labels]

In [None]:
labels0 = [i==1 for i in labels]
print(sum(labels0)/len(labels))

0.2878083661065427


Approach:
* Extract tf-idf features
* Extract additional features such as:
proportion of different grammatical terms
& mean and variance of sentence lengths
* Train an svm using the feature vectors

In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features= 5000)  # Adjust max_features as needed

tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Install spaCy and download the language model if needed
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import numpy as np

nlp = spacy.load('en_core_web_sm')
pos_tags = ['NOUN', 'VERB', 'ADJ', 'ADV']  # Add more tags as needed

def get_pos_proportion(text, pos_tags):
    """Return, for each tag in `pos_tags`, the share of tokens carrying it.

    Args:
        text: Text parsed with the notebook-level `nlp` pipeline.
        pos_tags: Part-of-speech tags to measure, in output order.

    Returns:
        A list with one proportion per entry of `pos_tags`; all zeros when
        the parsed text contains no tokens.
    """
    doc = nlp(text)
    total_tokens = len(doc)

    # A text without tokens would otherwise divide by zero; a zero vector
    # keeps the feature matrix rectangular.
    if total_tokens == 0:
        return [0.0] * len(pos_tags)

    pos_counts = dict.fromkeys(pos_tags, 0)
    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1

    return [pos_counts[tag] / total_tokens for tag in pos_tags]

# POS features for train and test sets
pos_train = np.array([get_pos_proportion(text, pos_tags) for text in X_train])
pos_test = np.array([get_pos_proportion(text, pos_tags) for text in X_test])

In [None]:
import re
import numpy as np

def get_sentence_length_features(text):
    """Return [mean, variance] of the sentence lengths (in words) of `text`.

    Sentences are the non-blank pieces obtained by splitting on '.', '!' and
    '?'. A text without any such piece yields [0, 0].
    """
    word_counts = [
        len(piece.split())
        for piece in re.split(r'[.!?]', text)
        if piece.strip()
    ]

    if not word_counts:
        return [0, 0]

    return [np.mean(word_counts), np.var(word_counts)]

# Sentence length features for train and test sets
length_train = np.array([get_sentence_length_features(text) for text in X_train])
length_test = np.array([get_sentence_length_features(text) for text in X_test])

In [None]:
from scipy.sparse import hstack

combined_train = hstack([tfidf_train, pos_train, length_train])

# For testing
combined_test = hstack([tfidf_test, pos_test, length_test])

In [None]:
combined_train

<15663x5006 sparse matrix of type '<class 'numpy.float64'>'
	with 372705 stored elements in COOrdinate format>

In [None]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', class_weight='balanced')
svm_classifier.fit(combined_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
y_pred =  svm_classifier.predict(combined_test)
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  0.7997957099080695


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees on the same combined features; random_state is fixed for repeatable runs.
rf = RandomForestClassifier(n_estimators= 200, random_state=42)
rf.fit(combined_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Score the random forest on the held-out split.
y_pred = rf.predict(combined_test)
# scikit-learn's signature is accuracy_score(y_true, y_pred): true labels go first.
print('Accuracy: ', accuracy_score(y_test, y_pred))

# Movie review

## Random forest: 0.65; random forest + sentiment_score -> 0.72

In [None]:
!pip install transformers
!pip install sentence_transformers

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset path below can be read (Colab-only API).
drive.mount('/content/drive')
import pandas as pd

Mounted at /content/drive


In [None]:
# Location of the movie-review CSV on the mounted Drive.
# `save_path` is bound to the same string as `path`; it is not used in the cells visible here.
path = save_path = '/content/drive/My Drive/nlp datasets/' + 'movie review.csv'
data = pd.read_csv(path)

In [None]:
# Load sentence transformer for embeddings
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load pre-trained sentiment analysis model
# NOTE(review): no model name is passed, so the library picks its default checkpoint
# for this task; consider pinning one explicitly for reproducibility.
sentiment_model = pipeline('sentiment-analysis')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
def summarizer(review):
  """Shorten a review to its first two and last two '.'-separated pieces.

  Args:
    review: Review text.

  Returns:
    The first two and last two pieces re-joined with '.'. Reviews with four
    or fewer pieces are returned unchanged, because the head and tail slices
    would otherwise overlap and repeat the same text.
  """
  sentences = review.split('.')
  if len(sentences) <= 4:
    # Head and tail already cover (or overlap on) the whole review.
    return review
  return '.'.join(sentences[:2] + sentences[-2:])

# Print the row count before and after dropping rows with missing values.
print(len(data['Review']))
data = data.dropna()
print(len(data['Review']))
# Add a shortened version of every review, produced by summarizer().
data['summarized'] = data['Review'].apply(summarizer)
# Display the first row as a quick sanity check.
data.head(1)

11060
11054


Unnamed: 0,Movie_ID,Review,Rating,summarized
0,tt0108052,This is a film that is made by a filmmaker at ...,6.0,This is a film that is made by a filmmaker at ...


In [None]:
# Function to compute sentiment score for a review
def get_sentiment_score(review):
    """Return a signed sentiment score for ``review``.

    The module-level ``sentiment_model`` is run on the first 512 characters
    of the text (a character slice, not a token limit). The score of the
    first result is returned as-is for a 'POSITIVE' label and negated for
    any other label.
    """
    prediction = sentiment_model(review[:512])[0]
    confidence = prediction['score']
    if prediction['label'] == 'POSITIVE':
        return confidence
    return -confidence


In [None]:
# Inputs are the review texts, targets are the ratings, both as plain Python lists.
X = list(data['Review'])
y = list(data['Rating'])

In [None]:
# Split reviews, ratings and summarized reviews in one call so the three stay aligned;
# 20% is held out and the seed is fixed for repeatable splits.
X_train, X_test, y_train, y_test, X_train_sum, X_test_sum = train_test_split(X, y, list(data['summarized']), test_size=0.2, random_state=42)

In [None]:
# TF-IDF over unigrams and bigrams, English stop words removed, capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=5000)
# Fit on the training reviews only, then apply the fitted vocabulary to the test reviews.
# .toarray() turns the sparse result into a dense NumPy array (needed for np.hstack later).
X_train_tfidf =  tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf  = tfidf_vectorizer.transform(X_test).toarray()

In [None]:
# Signed sentiment score per review, computed on the summarized text rather than the full review.
X_train_sentiment = np.array([get_sentiment_score(review) for review in X_train_sum])
X_test_sentiment = np.array([get_sentiment_score(review) for review in X_test_sum])

In [None]:
# Append the sentiment score as one extra column after the TF-IDF columns
# ([:, np.newaxis] reshapes the 1-D score vector into a single column).
X_train_features = np.hstack((X_train_tfidf, X_train_sentiment[:, np.newaxis]))
X_test_features = np.hstack((X_test_tfidf, X_test_sentiment[:, np.newaxis]))

In [None]:
# modeling
# Random forest regressor on TF-IDF + sentiment features.
# NOTE: Ridge and GradientBoostingRegressor are imported but not used in this cell.
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train_features, y_train)

from sklearn.metrics import r2_score
# NOTE: despite the "_ridge" names, these hold the random forest's predictions and R2 score.
y_pred_ridge = rf_model.predict(X_test_features)


ridge_score = r2_score(y_test, y_pred_ridge)
print('R2 Score on validation data:', ridge_score)

R2 Score on validation data: 0.7255827736826441


In [None]:
# Baseline for comparison: the same random forest settings, trained on the TF-IDF
# features alone (no sentiment column).
rf_model_ = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model_.fit(X_train_tfidf, y_train)

from sklearn.metrics import r2_score
# NOTE: despite the "_ridge" names, these hold the random forest's predictions and R2 score.
y_pred_ridge = rf_model_.predict(X_test_tfidf)


ridge_score = r2_score(y_test, y_pred_ridge)
print('R2 Score on validation data:', ridge_score)

R2 Score on validation data: 0.6634899974100852


In [None]:
# Sentence embeddings for each split. Reviews are cut to their first 512 characters,
# then encoded in one batched encode() call per split instead of one call per review,
# which lets the model batch the work internally.
X_train_embedding = np.asarray(embedding_model.encode([review[:512] for review in X_train]))
X_test_embedding = np.asarray(embedding_model.encode([review[:512] for review in X_test]))

In [None]:
# Rebuild the feature matrices with three groups of columns, in this order:
# TF-IDF, the single sentiment-score column, then the sentence-embedding columns.
# This rebinds X_train_features / X_test_features from the earlier cell.
X_train_features = np.hstack((X_train_tfidf, X_train_sentiment[:, np.newaxis], X_train_embedding))
X_test_features = np.hstack((X_test_tfidf, X_test_sentiment[:, np.newaxis], X_test_embedding))

In [None]:
# Define SVR model
# The pipeline standardizes every feature column before the linear-kernel SVR sees it.
svr_model = make_pipeline(StandardScaler(), SVR(kernel='linear', C=1.0, epsilon=0.1))

# Train the SVR model
svr_model.fit(X_train_features, y_train)

# Predict on the test set
y_pred = svr_model.predict(X_test_features)

# Evaluate the model (true ratings first, predictions second)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

In [None]:
# modeling
# Random forest regressor on the full feature matrix (TF-IDF + sentiment + embeddings).
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train_features, y_train)

from sklearn.metrics import r2_score
# The "_ridge" names are kept so other cells that read them keep working;
# the values come from the random forest above, not from a Ridge model.
y_pred_ridge = rf_model.predict(X_test_features)


ridge_score = r2_score(y_test, y_pred_ridge)
# The message now names the model that was actually fitted.
print('R2 Score with Random Forest on validation data:', ridge_score)