<a href="https://colab.research.google.com/github/Yashwithareddy01/yashwitha/blob/main/kannada_disamb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import json

# Ensure NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
def load_dataset():
    file_path = '/content/drive/MyDrive/Datasets/Dataset_kannada_english.json'
    with open(file_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    data = []
    for item in dataset:
        word = item['word']
        translation = item['translation']
        senses = item['senses']
        meaning = item['disambiguation'].split(' or ')  # Split multiple meanings
        data.append({'Word': word, 'Translation': translation, 'Senses': senses, 'Meanings': meaning})

    return pd.DataFrame(data)

# Preprocessing Function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Model Training
def train_model(dataset):
    dataset['Preprocessed'] = dataset['Meanings'].apply(lambda meanings: ' '.join(meanings))
    X = dataset['Preprocessed']
    y = dataset['Senses']

    # Vectorization using TF-IDF
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Using unigrams and bigrams
    X = vectorizer.fit_transform(X)

    # Handle Class Imbalance
    smote = SMOTE()
    X, y = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Using RandomForestClassifier for better accuracy
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Hyperparameter tuning
    parameters = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }

    grid_search = GridSearchCV(model, parameters, cv=5)
    grid_search.fit(X_train, y_train)

    # Test model accuracy
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy after tuning: {accuracy:.2f}")
    return grid_search, vectorizer

# Measure Performance
def measure_performance(dataset, model, vectorizer):
    test_data = dataset.sample(frac=0.2, random_state=42)
    test_X = vectorizer.transform(test_data['Preprocessed'])
    test_y = test_data['Senses']
    predicted_y = model.predict(test_X)
    report = classification_report(test_y, predicted_y)
    accuracy = accuracy_score(test_y, predicted_y)
    return report, accuracy

# Main function
def main():
    dataset = load_dataset()
    model, vectorizer = train_model(dataset)
    report, accuracy = measure_performance(dataset, model, vectorizer)
    print("Classification Report:\n", report)
    print("Accuracy:", accuracy)

    # Get user input
    text = input("Enter a sentence for disambiguation: ")
    word_senses = predict_sense(text, dataset, model, vectorizer)

    print("Disambiguated Word Senses:")
    for word, meaning in word_senses.items():
        print(f"Word: {word}, Meaning: {meaning}")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model accuracy after tuning: 0.95
Classification Report:
               precision    recall  f1-score   support

           2       0.95      0.98      0.97        59
           3       0.83      0.62      0.71         8
           4       1.00      1.00      1.00         1

    accuracy                           0.94        68
   macro avg       0.93      0.87      0.89        68
weighted avg       0.94      0.94      0.94        68

Accuracy: 0.9411764705882353
Enter a sentence for disambiguation: Nadi
Disambiguated Word Senses:
Word: nadi, Meaning: vein
