<a href="https://colab.research.google.com/github/beastboy-93/projects/blob/main/language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import pandas as pd
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from indicnlp.tokenize import indic_tokenize
import pickle

# Mount Google Drive at /content/drive. The default dataset directory and the
# model/tokenizer output paths used further down all live under this mount
# point, so a failed mount is reported and the script stops here.
try:
    drive.mount('/content/drive')
except Exception as e:
    # Report the failure, then terminate instead of continuing without data.
    print(f"Error mounting Google Drive: {str(e)}")
    exit()

# Dataset filename -> ISO language code, one entry per supported language.
# Insertion order is kept: datasets are loaded by iterating this mapping.
language_mappings = dict((
    ("Hindi.txt", "hi"),
    ("English.txt", "en"),
    ("Kannada.txt", "kn"),
    ("Malayalam.txt", "ml"),
    ("Odia.txt", "or"),
    ("Urdu.txt", "ur"),
    ("Arabic.txt", "ar"),
    ("Tamil.txt", "ta"),
    ("Telugu.txt", "te"),
    ("Bengali.txt", "bn"),
))

# Function to verify dataset integrity
# Language code -> (script name shown in the warning, sample letters of that
# script). Only languages listed here get a script sanity check.
_SCRIPT_SAMPLES = {
    "ur": ("Urdu", "ابپتثجچحخدذرزژسشصضطظعغفقکلمنوهیے"),
    "ar": ("Arabic", "ابتثجحخدذرزسشصضطظعغفقكلمنهوي"),
    "hi": ("Hindi", "अआइईउऊएऐओऔकखगघचछजझटठडढणतथदधनपफबभमयरलवशषसह"),
}


def verify_dataset(file_path, lang_code):
    """Run cheap sanity checks on one language file.

    The file passes when it can be read as UTF-8, contains at least 1000
    non-blank lines and, for languages with a sample alphabet (Urdu, Arabic,
    Hindi), at least one of the first 100 lines contains a letter of that
    alphabet.

    Args:
        file_path: Path of the text file, one sentence per line.
        lang_code: Language code the file is expected to contain.

    Returns:
        True when every check passes. False otherwise, including when the
        file cannot be opened or decoded; the reason is printed.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            stripped = (raw.strip() for raw in f)
            lines = [sentence for sentence in stripped if sentence]

        if len(lines) == 0:
            print(f"Warning: {file_path} is empty")
            return False

        if len(lines) < 1000:
            print(f"Warning: {file_path} has only {len(lines)} sentences (expected ~10,000)")
            return False

        # Script check: look for any expected letter in the first 100 lines.
        sample = _SCRIPT_SAMPLES.get(lang_code)
        if sample is not None:
            script_name, alphabet = sample
            letters = set(alphabet)
            found = any(ch in letters for line in lines[:100] for ch in line)
            if not found:
                print(f"Warning: {file_path} lacks {script_name} characters")
                return False

        print(f"Verified: {file_path} looks good ({len(lines)} sentences)")
        return True
    except Exception as e:
        print(f"Error verifying {file_path}: {str(e)}")
        return False

# Function to clean and tokenize text
def clean_text(text, lang_code):
    """Strip symbols/digits from *text* and return it as space-separated characters.

    English (``lang_code == "en"``) is lowercased first and reduced to the
    letters a-z plus the space character. Every other language keeps its
    alphabetic characters, its combining marks and the danda ("।"); spaces,
    digits and punctuation are dropped.

    Args:
        text: One raw sentence.
        lang_code: Language code of the sentence; only "en" is special-cased.

    Returns:
        The kept characters joined by single spaces, with leading/trailing
        whitespace stripped. Empty string when nothing survives cleaning.
    """
    # Local import keeps this function self-contained; the module is cached
    # after the first call.
    import unicodedata

    if lang_code == "en":
        allowed = set("abcdefghijklmnopqrstuvwxyz ")
        # Lowercase before filtering: the allowed set is lowercase-only, so
        # without this every capital letter would be silently discarded.
        kept = [c for c in text.lower() if c in allowed]
    else:
        # str.isalpha() is False for combining marks (Unicode category M*),
        # which would strip Indic vowel signs and Arabic diacritics. Keep
        # them explicitly so the script-specific characters survive.
        kept = [
            c for c in text
            if c == "।" or c.isalpha() or unicodedata.category(c).startswith("M")
        ]

    # Character-level tokens, separated by spaces.
    return ' '.join(kept).strip()

# Combine datasets into a single DataFrame
def combine_datasets(input_dir="/content/drive/MyDrive/Languages"):
    """Load, verify and clean every language file into one DataFrame.

    Each file named in ``language_mappings`` is looked up under *input_dir*.
    Files that are missing, fail ``verify_dataset``, cannot be read, or end up
    empty after ``clean_text`` are skipped with a printed message.

    Args:
        input_dir: Directory that holds the per-language text files.

    Returns:
        A DataFrame with columns ``text`` (cleaned sentence) and ``language``
        (language code), or None when no file produced any data.
    """
    frames = []
    for filename, lang_code in language_mappings.items():
        file_path = os.path.join(input_dir, filename)

        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found")
        elif not verify_dataset(file_path, lang_code):
            print(f"Skipping {file_path} due to verification failure")
        else:
            try:
                with open(file_path, "r", encoding="utf-8") as handle:
                    cleaned = (
                        clean_text(raw.strip(), lang_code)
                        for raw in handle
                        if raw.strip()
                    )
                    # Drop sentences that cleaning reduced to nothing.
                    texts = list(filter(None, cleaned))
                if texts:
                    frames.append(pd.DataFrame({"text": texts, "language": lang_code}))
                else:
                    print(f"Warning: {file_path} produced no valid texts after cleaning")
            except Exception as e:
                print(f"Error reading {file_path}: {str(e)}")

    if len(frames) == 0:
        print("Error: No valid datasets found")
        return None

    combined = pd.concat(frames, ignore_index=True)
    print(f"Combined dataset: {len(combined)} sentences")
    return combined

# Preprocess data for deep learning
def preprocess_data(df, max_len=100, vocab_size=5000):
    """Turn the combined DataFrame into model-ready arrays.

    Args:
        df: DataFrame with a ``text`` column and a ``language`` column.
        max_len: Length every sequence is padded or truncated to (at the end).
        vocab_size: ``num_words`` limit passed to the character tokenizer.

    Returns:
        A tuple ``(padded_sequences, labels, tokenizer, label_map)`` where
        ``labels`` is one-hot encoded and ``label_map`` maps each language
        code to its class index, in order of first appearance in *df*.
    """
    texts = df["text"]
    languages = df["language"]

    # Character-level vocabulary fitted on the whole corpus.
    tokenizer = Tokenizer(num_words=vocab_size, char_level=True, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)

    # Integer-encode, then pad/truncate at the end of each sequence.
    padded_sequences = pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )

    # Class indices follow the order in which languages first appear.
    codes = languages.unique()
    label_map = dict(zip(codes, range(len(codes))))
    class_ids = languages.map(label_map).values
    labels = to_categorical(class_ids)

    return padded_sequences, labels, tokenizer, label_map

# Build CNN model
def build_model(vocab_size=5000, max_len=100, num_classes=10):
    """Build and compile the character-level CNN language classifier.

    Layers: Embedding(128) -> Conv1D(128 filters, width 5, ReLU) ->
    GlobalMaxPooling1D -> Dense(128, ReLU) -> Dropout(0.5) ->
    Dense(num_classes, softmax). Compiled with Adam and categorical
    cross-entropy, tracking accuracy.

    Args:
        vocab_size: Size of the embedding table (number of token ids).
        max_len: Length of each input sequence.
        num_classes: Number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Imported locally so this function does not depend on the file-level
    # import list being extended.
    from tensorflow.keras.layers import Input

    model = Sequential([
        # Declare the input shape with an explicit Input layer: the
        # Embedding `input_length` argument is deprecated in Keras 3.
        Input(shape=(max_len,)),
        Embedding(vocab_size, 128),
        Conv1D(128, 5, activation="relu"),
        GlobalMaxPooling1D(),
        Dense(128, activation="relu"),
        Dropout(0.5),
        Dense(num_classes, activation="softmax")
    ])
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Train and evaluate model
def train_model(output_model_path="/content/drive/MyDrive/best_language_model.keras"):
    """Train the classifier end to end and report test-set metrics.

    Steps: load the combined dataset, preprocess it, split 80/10/10 into
    train/validation/test, train with checkpointing and early stopping on
    validation accuracy, pickle the tokenizer and label map to Drive, then
    print accuracy and a classification report for the test split.

    Args:
        output_model_path: Where the best checkpoint (by validation accuracy)
            is saved.

    Returns:
        ``(model, tokenizer, label_map)``, or ``(None, None, None)`` when no
        dataset could be loaded.
    """
    df = combine_datasets()
    if df is None:
        return None, None, None

    X, y, tokenizer, label_map = preprocess_data(df)

    # Hold out 20%, then halve it into validation and test sets.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    model = build_model(num_classes=y.shape[1])

    # Both callbacks watch validation accuracy: one persists the best model,
    # the other stops after 3 epochs without improvement and restores the
    # best weights.
    training_callbacks = [
        ModelCheckpoint(
            output_model_path,
            monitor="val_accuracy",
            save_best_only=True,
            mode="max",
            verbose=1,
        ),
        EarlyStopping(
            monitor="val_accuracy",
            patience=3,
            mode="max",
            restore_best_weights=True,
            verbose=1,
        ),
    ]

    model.fit(
        X_train, y_train,
        epochs=20,
        batch_size=64,
        validation_data=(X_val, y_val),
        callbacks=training_callbacks,
    )

    # Persist the preprocessing artifacts: tokenizer first, then label map.
    artifacts = (
        ("/content/drive/MyDrive/tokenizer.pkl", tokenizer),
        ("/content/drive/MyDrive/label_map.pkl", label_map),
    )
    for artifact_path, artifact in artifacts:
        with open(artifact_path, "wb") as out:
            pickle.dump(artifact, out)

    # Class indices for predictions and ground truth on the test split.
    predicted_ids = np.argmax(model.predict(X_test), axis=1)
    actual_ids = np.argmax(y_test, axis=1)

    # Translate class indices back to language codes for readable reports.
    index_to_lang = dict((idx, lang) for lang, idx in label_map.items())
    y_pred_labels = [index_to_lang[i] for i in predicted_ids]
    y_test_labels = [index_to_lang[i] for i in actual_ids]

    print(f"Test Accuracy: {accuracy_score(y_test_labels, y_pred_labels):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_labels, y_pred_labels))

    return model, tokenizer, label_map

# Main execution
if __name__ == "__main__":
    # Train the model; train_model() returns (None, None, None) when no
    # dataset could be loaded.
    model, tokenizer, label_map = train_model()
    if model is None:
        print("Failed to train model")
        exit()

    # A few hand-written sample sentences, one per script of interest.
    test_sentences = [
        "سلم صمو ہے۔",  # Urdu
        "قال سمو هو.",   # Arabic
        "कगणि समो है।",  # Hindi
        "salu is bret."   # English
    ]

    # Encode the samples with the fitted tokenizer. maxlen=100 matches the
    # default max_len used by preprocess_data() and build_model().
    # NOTE(review): training texts are passed through clean_text() in
    # combine_datasets(), but these sentences are tokenized raw. Confirm
    # whether inference should apply the same cleaning.
    test_sequences = tokenizer.texts_to_sequences(test_sentences)
    test_padded = pad_sequences(test_sequences, maxlen=100, padding="post", truncating="post")

    # Predict and take the highest-scoring class for each sentence.
    predictions = model.predict(test_padded)
    pred_classes = np.argmax(predictions, axis=1)
    # Invert label_map (language code -> index) to decode class indices.
    reverse_label_map = {idx: lang for lang, idx in label_map.items()}
    pred_labels = [reverse_label_map[p] for p in pred_classes]

    print("\nPredictions:")
    for sentence, pred in zip(test_sentences, pred_labels):
        print(f"Sentence: {sentence} -> Predicted Language: {pred}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Verified: /content/drive/MyDrive/Languages/Hindi.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/English.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Kannada.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Malayalam.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Odia.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Urdu.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Arabic.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Tamil.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Telugu.txt looks good (10000 sentences)
Verified: /content/drive/MyDrive/Languages/Bengali.txt looks good (10000 sentences)
Combined dataset: 100000 sentences
Epoch 



[1m1244/1250[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9234 - loss: 0.3034
Epoch 1: val_accuracy improved from -inf to 1.00000, saving model to /content/drive/MyDrive/best_language_model.keras
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9237 - loss: 0.3021 - val_accuracy: 1.0000 - val_loss: 5.0407e-07
Epoch 2/20
[1m1246/1250[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 1.7481e-04
Epoch 2: val_accuracy did not improve from 1.00000
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 1.0000 - loss: 1.7458e-04 - val_accuracy: 1.0000 - val_loss: 1.6105e-08
Epoch 3/20
[1m1240/1250[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 4.6115e-05
Epoch 3: val_accuracy did not improve from 1.00000
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 1.0000 - 