In [42]:
import pandas as pd

df = pd.read_csv("date.csv").drop_duplicates(subset = "informal_date")
df.shape

(9251, 2)

#Normalize and split Data

In [43]:
import re
from sklearn.model_selection import train_test_split

def normalize_text(text):
    # Convert Persian numbers to Latin
    persian_to_latin = {
        '۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4',
        '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9'
    }
    for persian, latin in persian_to_latin.items():
        text = text.replace(persian, latin)
    
    # Normalize Persian characters
    text = text.replace('ي', 'ی').replace('ك', 'ک')
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text.strip())
    # text = f"<start> {text} <ends>"
    return text

df['formal_date'] = df['formal_date'].apply(normalize_text)
df['informal_date'] = df['informal_date'].apply(normalize_text)

df

Unnamed: 0,informal_date,formal_date
0,پنجم اردیبهشت 1402,1402/02/05
1,17 آبان 1395,1395/08/17
2,1 1403 مهر,1403/07/01
3,سال مهر اول 1403,1403/07/01
4,1403-07-01,1403/07/01
...,...,...
29435,پانزدهم آبان 1307,1307/08/15
29436,هجدهم آذر 1306,1306/09/18
29437,بیست و دوم دی 1305,1305/10/22
29439,پنجم اسفند 1303,1303/12/05


In [32]:
train_val , test = train_test_split(df , test_size=0.1 , random_state= 42)
train ,val = train_test_split(train_val,test_size=0.1,random_state=42)

In [33]:
print(f"Training samples: {len(train)}")
print(f"Validation samples: {len(val)}")
print(f"Test samples: {len(test)}")

Training samples: 7492
Validation samples: 833
Test samples: 926


In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

class DateDataset(Dataset):
    def __init__(self, informal_dates, formal_dates, informal_vocab, formal_vocab):
        self.informal_dates = informal_dates
        self.formal_dates = formal_dates
        self.informal_vocab = informal_vocab
        self.formal_vocab = formal_vocab

    def __len__(self):
        return len(self.informal_dates)

    def __getitem__(self, idx):
        informal = [self.informal_vocab.get(token, self.informal_vocab['<UNK>']) for token in self.informal_dates[idx].split()]
        formal = [self.formal_vocab.get(token, self.formal_vocab['<UNK>']) for token in self.formal_dates[idx].split()]
        return torch.tensor(informal), torch.tensor(formal)

def create_vocab(texts):
    vocab = {'<PAD>': 0, '<UNK>': 1, '<start>': 2, '<ends>': 3}
    for text in texts:
        for token in text.split():
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

def collate_fn(batch):
    informal, formal = zip(*batch)
    
    # Find max length in this batch
    max_len = max(max(len(seq) for seq in informal), max(len(seq) for seq in formal))
    
    # Pad sequences to max_len
    informal_padded = [torch.cat([seq, torch.zeros(max_len - len(seq), dtype=torch.long)]) for seq in informal]
    formal_padded = [torch.cat([seq, torch.zeros(max_len - len(seq), dtype=torch.long)]) for seq in formal]
    
    return torch.stack(informal_padded), torch.stack(formal_padded)

class DateConversionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.1):
        super(DateConversionLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)
        predictions = self.fc(outputs)
        return predictions

# Prepare data
informal_vocab = create_vocab(df['informal_date'])
formal_vocab = create_vocab(df['formal_date'])

X_train, X_test, y_train, y_test = train_test_split(df['informal_date'], df['formal_date'], test_size=0.2, random_state=42)

train_dataset = DateDataset(X_train.tolist(), y_train.tolist(), informal_vocab, formal_vocab)
test_dataset = DateDataset(X_test.tolist(), y_test.tolist(), informal_vocab, formal_vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Model parameters
input_size = len(informal_vocab)
output_size = len(formal_vocab)
hidden_size = 256
num_layers = 2
dropout = 0.1
learning_rate = 0.001
num_epochs = 5

# Initialize model
model = DateConversionLSTM(input_size, hidden_size, output_size, num_layers, dropout)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 is the pad_token_id
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for informal, formal in train_loader:
        informal, formal = informal.to(device), formal.to(device)
        
        optimizer.zero_grad()
        output = model(informal)
        loss = criterion(output.contiguous().view(-1, output.size(-1)), formal.contiguous().view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

print("Training complete!")

# Evaluation
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for informal, formal in test_loader:
        informal, formal = informal.to(device), formal.to(device)
        output = model(informal)
        _, predicted = torch.max(output, dim=2)
        correct += (predicted == formal).sum().item()
        total += formal.numel()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

# Function to convert a single date
def convert_date(model, input_text, informal_vocab, formal_vocab):
    model.eval()
    input_tokens = [informal_vocab.get(token, informal_vocab['<UNK>']) for token in input_text.split()]
    input_tensor = torch.tensor([input_tokens]).to(device)
    
    with torch.no_grad():
        output = model(input_tensor)
    
    _, predicted = torch.max(output, dim=2)
    predicted_tokens = predicted[0].tolist()
    
    rev_formal_vocab = {v: k for k, v in formal_vocab.items()}
    converted_date = ' '.join([rev_formal_vocab[token] for token in predicted_tokens if token != 0])
    return converted_date


Epoch 1/5, Loss: 8.6027
Epoch 2/5, Loss: 8.0952
Epoch 3/5, Loss: 7.4294
Epoch 4/5, Loss: 6.9368
Epoch 5/5, Loss: 6.5909
Training complete!
Test Accuracy: 0.0000
Input: <start> 1 1403 مهر <ends>
Converted: 1403/01/01 1396/06/01 1400/03/01 1400/10/01 1400/03/01


In [1]:
# Example usage
input_date = "<start> 23 1385 شهریور <ends>"
converted_date = convert_date(model, input_date, informal_vocab, formal_vocab)
print(f"Input: {input_date}")
print(f"Converted: {converted_date}")

NameError: name 'convert_date' is not defined