In [1]:
import numpy as np 
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer
from torch.utils.data import DataLoader, TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


# BERT Pre-trained Model

In [2]:
# define BERT model
# Load the tokenizer and the bare encoder (BertModel, i.e. no classification head)
# for the 'bert-base-uncased' checkpoint. The encoder is later wrapped by a custom
# classifier head; the tokenizer is reused for both training and inference.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [3]:
# Load the labelled sentence/tense table.
# NOTE(review): this is an absolute path on the author's machine, so the notebook
# cannot run elsewhere without editing it — consider a configurable DATA_DIR.
# encoding='latin-1' presumably matches how the CSV was exported — TODO confirm.
data = pd.read_csv(r"C:\Users\rysha\Downloads\tense.csv", encoding='latin-1')


# Preprocessing Data

In [4]:
# Map each tense name to its integer class id. The id is simply the position of
# the name in the tuple below, so the order of the entries is significant.
tense_labels = {
    tense_name: class_id
    for class_id, tense_name in enumerate((
        'present',
        'future',
        'past',
        'present perfect continuous',
        'future perfect',
        'past perfect',
        'future continuous',
        'past perfect continuous',
        'present continuous',
        'past continuous',
        'future perfect continuous',
        'present perfect',
    ))
}

In [5]:
# Diagnostic pass over the raw column: print one line for every row whose tense
# value has no entry in tense_labels (values are compared exactly as loaded).
for item in data["tense"]:
    if item in tense_labels:
        continue
    print(f"Tense value '{item}' not found in tense_labels dictionary.")

In [6]:
def clean_and_map_tense(tense):
    """Normalize a raw tense cell to a key of ``tense_labels``.

    The value is lower-cased and its whitespace is normalized (leading/trailing
    whitespace removed, internal runs collapsed to a single space) before the
    lookup, so entries such as ``" Past "`` are recognized rather than dropped.

    Args:
        tense: Raw cell value from the ``tense`` column. May be a string, NaN,
            ``None`` or any other object.

    Returns:
        The normalized tense name if it is a key of ``tense_labels``; otherwise
        ``None`` (including for missing or non-string values).
    """
    # Non-string cells (NaN, None, numbers) cannot name a tense; rejecting them
    # here also avoids calling string methods on them.
    if not isinstance(tense, str):
        return None
    normalized = " ".join(tense.lower().split())
    return normalized if normalized in tense_labels else None

# Clean and map tenses
# Overwrites the "tense" column in place with the normalized name, or None when
# the value is not a known tense.
data["tense"] = data["tense"].apply(clean_and_map_tense)

# Remove rows with None (unrecognized tenses)
# NOTE: dropna() is called without `subset`, so a row is dropped when ANY column
# is missing, not only when the tense is unrecognized.
data = data.dropna()

# Bare last expression: renders the cleaned frame as the cell output.
data

Unnamed: 0,sentence,tense
0,I am eating breakfast,present
1,She will go to the park,future
2,They played soccer yesterday,past
3,I will be going to the concert,future
4,She is eating lunch now,present
...,...,...
3111,The kids aren't playing in the yard,present continuous
3112,They weren't talking on the phone when I called,past continuous
3113,I won't go to the store after work,future
3114,She doesn't study French every evening,present


In [7]:
# Sanity check: after cleaning, every remaining tense must be a key of tense_labels.
# The previous version tried to `del tense_labels[...]` for values that are, by
# construction, NOT keys of the dict, so any leftover value crashed with a bare
# KeyError. Fail with an explicit message instead; tense_labels is left untouched.
unrecognized_tenses = sorted(
    {item for item in data["tense"] if item not in tense_labels},
    key=str,  # tolerate a mix of strings and NaN/None when ordering the report
)
if unrecognized_tenses:
    raise ValueError(
        f"Found tense values with no entry in tense_labels: {unrecognized_tenses}"
    )

In [8]:
# Count the missing entries per column and print them under a heading.
missing_values = data.isna().sum()
print("Nan Data:", missing_values, sep="\n")

Nan Data:
sentence                                         0
tense                                            0
dtype: int64


In [9]:
# Show the exact column labels. NOTE: per the recorded output, the sentence
# column's name carries trailing spaces, so data["sentence"] would raise KeyError.
print(data.columns)


Index(['sentence                                     ', 'tense'], dtype='object')


# Train/test data split

In [10]:
# Hold out 20% of the cleaned rows as the test set; random_state pins the
# shuffle so the same rows land in each split on every run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

def prepare_input_data(data):
    """Tokenize the sentence column of ``data`` into padded PyTorch tensors.

    The sentence column is located by comparing column names after stripping
    surrounding whitespace, so both ``"sentence"`` and the whitespace-padded
    header found in the source CSV are accepted.

    Args:
        data: DataFrame containing a column whose stripped name is ``"sentence"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of
        sentences (called with ``padding=True, truncation=True,
        return_tensors="pt"``).

    Raises:
        KeyError: If no column is named ``"sentence"`` after stripping.
    """
    matching_columns = [col for col in data.columns if str(col).strip() == "sentence"]
    if not matching_columns:
        raise KeyError(
            "No 'sentence' column found (column names are compared after stripping whitespace); "
            f"available columns: {list(data.columns)}"
        )
    sentences = data[matching_columns[0]].tolist()
    encoded_data = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    return encoded_data

In [11]:
# Tokenize each split, then translate its tense names into integer class ids.
train_inputs, test_inputs = map(prepare_input_data, (train_data, test_data))

train_labels = torch.tensor([tense_labels[tense_name] for tense_name in train_data["tense"]])
test_labels = torch.tensor([tense_labels[tense_name] for tense_name in test_data["tense"]])


In [12]:
class TenseClassifier(nn.Module):
    """Linear classification head on top of an encoder's first-token embedding.

    The wrapped encoder must expose ``config.hidden_size`` and return an object
    with a ``last_hidden_state`` attribute when called.
    """

    def __init__(self, bert_model, num_classes):
        """Store the encoder and build the activation and output layers.

        Args:
            bert_model: Encoder module whose ``config.hidden_size`` sets the
                input width of the output layer.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.bert = bert_model
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of token ids and attention masks."""
        encoder_output = self.bert(input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis is used as the sentence summary.
        first_token_state = encoder_output.last_hidden_state[:, 0, :]
        activated = self.relu2(self.relu1(first_token_state))
        return self.fc(activated)

# Seed torch before any random state is consumed: the classifier head is randomly
# initialised here, and batch shuffling and dropout draw from the same generator
# later. Without a seed, reruns give different losses and accuracy.
torch.manual_seed(42)

# Derive the number of output logits from the label map instead of hard-coding 12,
# so the head always covers the largest class id the labels can contain.
num_classes = max(tense_labels.values()) + 1
model = TenseClassifier(bert_model, num_classes)

# Cross-entropy over the logits; Adam updates every parameter of the wrapper
# (encoder and head) at the same learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [13]:
# Mini-batch size used by the training DataLoader
batch_size = 32

# Bundle token ids, attention masks and labels row-wise, then serve them in
# shuffled mini-batches
train_dataset = TensorDataset(
    *(train_inputs[field] for field in ('input_ids', 'attention_mask')),
    train_labels,
)
train_data_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)

# Train model

In [14]:
num_epochs = 10
for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_data_loader:
        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses accumulated during this epoch.
    average_loss = total_loss / len(train_data_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}')

Epoch 1/10, Average Loss: 1.3887068445865924
Epoch 2/10, Average Loss: 0.33066233839744175
Epoch 3/10, Average Loss: 0.1689595144528609
Epoch 4/10, Average Loss: 0.12296348222746299
Epoch 5/10, Average Loss: 0.09925693841889882
Epoch 6/10, Average Loss: 0.0810499073794255
Epoch 7/10, Average Loss: 0.0661535226763823
Epoch 8/10, Average Loss: 0.06313326601416637
Epoch 9/10, Average Loss: 0.057093651129458196
Epoch 10/10, Average Loss: 0.0495814727858091


In [15]:
# Evaluation
# Score the held-out split in mini-batches rather than pushing the entire test set
# through BERT in a single forward pass, which needs memory proportional to the
# whole split. Predictions are concatenated in the original row order (no shuffle),
# so they line up with test_labels.
model.eval()
eval_loader = DataLoader(
    TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask']),
    batch_size=batch_size,
)
batch_predictions = []
with torch.no_grad():
    for input_ids, attention_mask in eval_loader:
        logits = model(input_ids, attention_mask)
        batch_predictions.append(torch.argmax(logits, dim=1))
predicted_labels = torch.cat(batch_predictions)
accuracy = accuracy_score(test_labels, predicted_labels)
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

Accuracy on test set: 96.96%


In [16]:
# Save model state_dict
# Only the parameter tensors are written, not the class itself: to reload, build a
# TenseClassifier with the same encoder and num_classes and call load_state_dict.
# The file name is relative, so it lands in the current working directory.
torch.save(model.state_dict(), 'tense_classifier_model.pth')

# Save tokenizer
# NOTE(review): 'path_to_save_tokenizer' is used literally as the output directory
# name (see the recorded output); it looks like a placeholder — confirm.
tokenizer.save_pretrained('path_to_save_tokenizer')

('path_to_save_tokenizer\\tokenizer_config.json',
 'path_to_save_tokenizer\\special_tokens_map.json',
 'path_to_save_tokenizer\\vocab.txt',
 'path_to_save_tokenizer\\added_tokens.json')

In [17]:
def predict_tense(sentence, model, tokenizer, tense_labels):
    """Predict the tense name of a single sentence.

    The model is switched to evaluation mode for the forward pass, so layers
    that behave differently during training (e.g. dropout) cannot make the
    prediction non-deterministic. Its previous mode is restored afterwards.

    Args:
        sentence: The sentence to classify.
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits with one row for the sentence.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        tense_labels: Mapping from tense name to integer class id.

    Returns:
        The first tense name in ``tense_labels`` whose id equals the argmax of
        the logits.

    Raises:
        IndexError: If the predicted class id has no entry in ``tense_labels``.
    """
    # tokenizer
    encoded_sentence = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(encoded_sentence['input_ids'], encoded_sentence['attention_mask'])
            predicted_label = torch.argmax(logits, dim=1).item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    # Reverse lookup id -> name; stop at the first match.
    for tense_name, label_id in tense_labels.items():
        if label_id == predicted_label:
            return tense_name
    # IndexError is kept for backward compatibility with the previous `[...][0]` lookup.
    raise IndexError(f"Predicted class id {predicted_label} has no entry in tense_labels")

# Try the trained classifier on a single hand-written sentence.
sentence_to_predict = "i will go to school"
predicted_tense = predict_tense(
    sentence=sentence_to_predict,
    model=model,
    tokenizer=tokenizer,
    tense_labels=tense_labels,
)
print(f"The predicted tense for the sentence is: {predicted_tense}")

The predicted tense for the sentence is: future


In [18]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download NLTK resources
# Fetches the data packages used by word_tokenize ('punkt') and pos_tag
# ('averaged_perceptron_tagger'). Per the recorded output, this only reports
# "already up-to-date" when the packages are already installed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def convert_tense_with_pos(original_tense, target_tense, sentence):
    """Rewrite ``sentence`` from ``original_tense`` to ``target_tense`` using POS tags.

    Args:
        original_tense: Tense name the sentence is currently in.
        target_tense: Tense name to convert to.
        sentence: Text to convert.

    Returns:
        The re-joined, space-separated tokens of the sentence, or ``None`` when
        the ``(original_tense, target_tense)`` pair has no entry in the rule table.

    NOTE(review): only the present -> future pair actually changes any token
    (every token tagged ``VB*`` gets ``"will "`` prefixed). The other listed pairs
    return the tokens re-joined unchanged, and the values stored in the rule
    table are not used.
    """
    # Pairs for which a conversion is attempted; add more rules as needed.
    supported_conversions = {
        ('present', 'future'): 'present continuous',
        ('future', 'past'): 'past perfect',
        ('past', 'present'): 'present perfect',
    }

    # Guard clause: no rule for this pair means nothing to convert.
    if (original_tense, target_tense) not in supported_conversions:
        return None

    tagged_tokens = pos_tag(word_tokenize(sentence))

    # Example conversion: "eat" -> "will eat"
    prepend_will = original_tense == 'present' and target_tense == 'future'
    rewritten_tokens = [
        "will " + token if tag.startswith('VB') and prepend_will else token
        for token, tag in tagged_tokens
    ]

    return ' '.join(rewritten_tokens)

def predict_and_convert_tense_with_pos(sentence, model, tokenizer, tense_labels, target_tense):
    """Classify the tense of ``sentence``, then convert it to ``target_tense``.

    Returns whatever ``convert_tense_with_pos`` returns for the predicted tense,
    the target tense and the sentence.
    """
    source_tense = predict_tense(sentence, model, tokenizer, tense_labels)
    return convert_tense_with_pos(source_tense, target_tense, sentence)

# Example usage
# NOTE(review): the recorded run prints "None" here. The sentence was classified
# as 'future', and the converter has no rule for the ('future', 'future perfect')
# pair, so it returns None instead of a converted sentence.
target_tense = 'future perfect'
converted_sentence = predict_and_convert_tense_with_pos(sentence_to_predict, model, tokenizer, tense_labels, target_tense)
print(f"The converted sentence with POS tagging is: {converted_sentence}")


The converted sentence with POS tagging is: None


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rysha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rysha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [21]:
import spacy
from pattern.en import conjugate, lemma, lexeme

ModuleNotFoundError: No module named 'pattern'

In [20]:
pip install spacy

Collecting spacy
  Downloading spacy-3.7.4-cp38-cp38-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp38-cp38-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp38-cp38-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp38-cp38-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp38-cp38-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp38-cp38-win_amd64.whl.metad

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.11.0 which is incompatible.
