In [None]:
import pandas as pd

# Load the date pairs from the working directory and drop rows whose
# `informal_date` value has already been seen.
df = (
    pd.read_csv("date.csv")
    .drop_duplicates(subset="informal_date")
)
# Bare last expression: the notebook shows the (rows, columns) tuple.
df.shape

In [None]:
!pip install tensorboard
!pip install comet-ml
!pip install datasets

# Normalize and split data

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Choose the compute device once: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [32]:
import pandas as pd

# Read the date pairs from the absolute path below and drop rows whose
# `informal_date` value has already been seen. This rebinds `df`.
df = (
    pd.read_csv("/content/Text2Date/date.csv")
    .drop_duplicates(subset="informal_date")
)

# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
import re
from sklearn.model_selection import train_test_split

# Character translation table, built once when the cell runs:
#   * Persian digits (U+06F0-U+06F9) and Arabic-Indic digits (U+0660-U+0669)
#     both map to ASCII digits, so either keyboard layout yields the same text;
#   * the Arabic forms of yeh and kaf map to their Persian forms.
_NORMALIZE_TABLE = str.maketrans({
    **{chr(0x06F0 + digit): str(digit) for digit in range(10)},
    **{chr(0x0660 + digit): str(digit) for digit in range(10)},
    'ي': 'ی',
    'ك': 'ک',
})


def normalize_text(text):
    """Return `text` with digits, yeh/kaf variants and whitespace normalized.

    Steps, in order:
      1. Persian and Arabic-Indic digits are converted to ASCII digits.
      2. Arabic yeh and kaf are converted to Persian yeh and kaf.
      3. Leading/trailing whitespace is stripped and every internal run of
         whitespace is collapsed to a single space.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string. Applying the function twice gives the same
        result as applying it once.
    """
    # None of the replacement characters is itself a key of the table, so a
    # single translate pass is equivalent to chained `str.replace` calls.
    text = text.translate(_NORMALIZE_TABLE)

    # Remove extra whitespace
    return re.sub(r'\s+', ' ', text.strip())

# Pass every value of both text columns through `normalize_text` and write
# the cleaned values back onto `df`.
df['formal_date'] = df['formal_date'].map(normalize_text)
df['informal_date'] = df['informal_date'].map(normalize_text)

# Bare last expression: show the normalized frame as the cell output.
df

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

# Wrap the normalized frame in a Hugging Face `Dataset`.
# `drop_duplicates` was applied to `df` without resetting the index, so the
# pandas index may no longer be a plain range. With the default
# `preserve_index=None` such an index is stored as an extra
# `__index_level_0__` column; `preserve_index=False` keeps only the real
# data columns.
data = pd.DataFrame(df)
dataset = Dataset.from_pandas(data, preserve_index=False)
dataset

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
# 2. Tokenization
# Load the tokenizer published with the 't5-small' checkpoint; the same
# checkpoint name is used for the model, so vocabulary and weights match.
# NOTE(review): the inputs in this notebook are Persian text. Confirm that
# this tokenizer's vocabulary covers Persian characters and does not map
# them to the unknown token -- TODO verify on a few rows.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess_function(examples):
    """Tokenize one batch of (informal, formal) date pairs for seq2seq training.

    Args:
        examples: a batch mapping with 'informal_date' and 'formal_date'
            entries, each a list of strings (the shape `Dataset.map` passes
            when called with `batched=True`).

    Returns:
        The tokenizer output for the prompts ("convert date: " + informal
        date), padded/truncated to 32 tokens, with an added 'labels' entry
        holding the target token ids. Padding positions in 'labels' are set
        to -100 so that the cross-entropy loss ignores them; any code that
        compares labels against predictions must skip the -100 positions.
    """
    inputs = ["convert date: " + text for text in examples['informal_date']]
    targets = examples['formal_date']
    model_inputs = tokenizer(inputs, max_length=32, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=32, truncation=True, padding='max_length')

    # Without this replacement the model is also trained (and scored) on
    # predicting the padding that follows every target sequence.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs

# Tokenize the whole dataset; `batched=True` hands `preprocess_function`
# lists of examples instead of one example at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# 3. Split the Dataset
# 80% train / 20% test. The fixed seed makes the shuffle deterministic, so a
# kernel restart reproduces the same partition and the reported metrics stay
# comparable between runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']


In [None]:
# 4. Model Initialization
# Start from the pretrained 't5-small' checkpoint and fine-tune it.
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# 5. Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    gradient_checkpointing=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # 32 examples * 5 accumulation steps = 160 examples per optimizer step
    # on each device.
    gradient_accumulation_steps=5,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the supported name in every transformers release
    # that also accepts `torch_empty_cache_steps` (used below).
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    learning_rate=1e-4,
    weight_decay=0.01,
    # Keep only the most recent checkpoint on disk.
    save_total_limit=1,
    torch_empty_cache_steps=5,
    # NOTE(review): warmup is counted in optimizer steps. Confirm the run has
    # well over 500 of them, otherwise the learning rate never reaches 1e-4.
    warmup_steps=500,
)
# 6. Trainer Initialization
# Bundle the model, its hyper-parameters, both dataset splits and the
# tokenizer into a single Trainer object.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# 7. Model Training
# Runs the full fine-tuning loop configured in `training_args`.
trainer.train()

# 8. Evaluation
# Score the trained model on the held-out split and print the metrics.
results = trainer.evaluate()
print(results)


In [None]:
def predict_date(informal_date):
    """Convert one informal date string to its formal form with the model.

    Args:
        informal_date: the raw informal date text.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    # Clean the input exactly like the training data was cleaned (digits,
    # yeh/kaf variants, whitespace), so the model sees the same kind of text
    # at inference time as it did during fine-tuning.
    input_text = "convert date: " + normalize_text(informal_date)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

    # Beam search with 4 beams, capped at 32 generated tokens.
    output = model.generate(input_ids, max_length=32, num_beams=4, early_stopping=True)

    # `output` holds one sequence per input; there is a single input here.
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Smoke test: run a few hand-written inputs through `predict_date` and print
# each input next to the model's answer.
test_dates = [
    "12 اردیبهشت 1356",
    "سال چهارم شهریور 1325",
    "1392-05-03",
]
for date in test_dates:
    print(f"تاریخ غیررسمی: {date} -> تاریخ رسمی: {predict_date(date)}")

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Re-run evaluation on the Trainer's eval dataset. This rebinds `results`,
# replacing the value stored by the training cell.
results = trainer.evaluate()
# Bare last expression: the notebook renders the metrics as the cell output.
results

In [None]:

# Token-level accuracy / F1 on a subset of the test split.
# NOTE(review): despite its name, `half_test_size` is a QUARTER of the test
# split (`// 4`). The name is kept so other cells that read it keep working;
# confirm which fraction was intended.
half_test_size = len(test_dataset) // 4
small_test_dataset = test_dataset.select(range(half_test_size))
predictions = trainer.predict(small_test_dataset)

# Presumably `predictions.predictions[0]` holds the per-position vocabulary
# scores (verify against the model's output tuple); argmax over the last axis
# turns them into one predicted token id per position.
pred_labels = predictions.predictions[0].argmax(-1)
true_labels = predictions.label_ids

# Score real target tokens only. Every label sequence is padded to a fixed
# length, so counting padding positions (stored either as the pad token id or
# as the -100 ignore index) would let trivially-correct padding dominate and
# inflate both metrics. Boolean-mask indexing also flattens the arrays.
is_real_token = (true_labels != -100) & (true_labels != tokenizer.pad_token_id)
pred_labels_flat = pred_labels[is_real_token]
true_labels_flat = true_labels[is_real_token]


accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
f1 = f1_score(true_labels_flat, pred_labels_flat, average='weighted')

print(f"accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
