In [None]:
!pip install sklearn-crfsuite

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
from collections import Counter

# Location of the language-detection CSV (has at least 'Text' and 'Language' columns,
# both of which are read further down in this script).
DATASET_URL = 'https://raw.githubusercontent.com/YashiGarg016/Language-Detection/refs/heads/main/Language%20Detection.csv'

# Download the dataset and coerce the 'Text' column to str in one step,
# so every entry can later be tokenized with str.split().
df = pd.read_csv(DATASET_URL).astype({'Text': str})

# Feature extraction function (word-based instead of character-based)
def word2features(sentence, i):
    """Build the feature dictionary for the word at position ``i``.

    Args:
        sentence: Sequence of word strings.
        i: Index of the word to describe.

    Returns:
        A dict holding a constant bias, the lowercased word, its last three
        and last two characters, and its upper/title/digit flags. The same
        lowercase/title/upper information is added for the previous and next
        words when they exist; otherwise ``'BOS'`` (no previous word) and/or
        ``'EOS'`` (no next word) is set to ``True``.
    """
    token = sentence[i]

    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left-hand context, or a beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sentence[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right-hand context, or an end-of-sentence marker.
    if i >= len(sentence) - 1:
        feats['EOS'] = True
    else:
        right = sentence[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats

def sent2features(sentence):
    """Return one ``word2features`` dict per word of ``sentence``, in order.

    An empty sentence yields an empty list.
    """
    return [word2features(sentence, position) for position, _ in enumerate(sentence)]

def sent2labels(language, sentence):
    """Return a label list that tags every word of ``sentence`` with ``language``.

    The result has exactly ``len(sentence)`` entries, all equal to ``language``.
    """
    return [language for _ in range(len(sentence))]

# Whitespace-tokenize every text into a list of words
df['Tokenized_Text'] = df['Text'].apply(str.split)

# Build the CRF inputs: one feature sequence per sentence, and a label
# sequence of the same length that repeats the sentence's language.
X = list(map(sent2features, df['Tokenized_Text']))
y = [
    sent2labels(lang, tokens)
    for tokens, lang in zip(df['Tokenized_Text'], df['Language'])
]

# Hold out 20% of the sentences for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Configure and train the CRF model
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=50,  # Reduced to speed up training
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Score the held-out sentences and print the per-label report
y_pred = crf.predict(X_test)
print(flat_classification_report(y_test, y_pred))

# Function for language prediction
def predict_language(text):
    """Predict the language of ``text`` by majority vote over its words.

    The text is split on whitespace, each word is labelled by the trained
    ``crf`` model, and the label assigned to the most words is returned.

    Args:
        text: The input string to classify.

    Returns:
        The most frequent predicted label, or ``None`` when ``text`` is empty
        or contains only whitespace (there are no words to vote on).
    """
    tokenized_text = text.split()  # Tokenize input into words
    if not tokenized_text:
        # Without this guard, Counter(...).most_common(1) would be an empty
        # list and indexing it with [0][0] would raise IndexError.
        return None

    features = sent2features(tokenized_text)
    predicted_labels = crf.predict_single(features)  # One label per word

    # Majority voting across the per-word predictions
    most_common_lang, _count = Counter(predicted_labels).most_common(1)[0]
    return most_common_lang

# Example usage of the prediction function:
# prompt for a line of text, classify it with predict_language (majority vote
# over per-word CRF labels), and print the resulting label.
user_input = input("Enter text to classify: ")
predicted_lang = predict_language(user_input)
print(f"Predicted language: {predicted_lang}")