In [2]:
!pip install -U scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the labelled language-detection dataset (CSV) from the provided URL.
url = "https://raw.githubusercontent.com/YashiGarg016/Language-Detection/refs/heads/main/Language%20Detection.csv"
try:
    df = pd.read_csv(url)
except Exception as e:
    # Nothing below can run without the data, so stop here.
    # SystemExit is raised directly instead of calling exit(): exit() is a
    # helper installed by the `site` module for interactive use and ends the
    # process with status 0. Raising SystemExit with a message writes the
    # message to stderr and yields a non-zero exit status, so callers can
    # tell that the load failed.
    raise SystemExit(f"Error loading data: {e}") from e


# Pull the feature (raw text) and target (language label) columns
X, y = df["Text"], df["Language"]

# Reserve 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features: learn the vocabulary on the training text only,
# then reuse that same vocabulary to encode the held-out text
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Maximum Entropy classifier (Logistic Regression); max_iter is raised to 1000 to help convergence
classifier = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score the held-out set and print the per-language metrics
y_pred = classifier.predict(X_test_vec)
print(classification_report(y_true=y_test, y_pred=y_pred))


def predict_language(text):
    """Return the language label predicted for a single piece of text.

    The text is encoded with the module-level ``vectorizer`` and classified
    with the module-level ``classifier``; both must already be fitted.

    Args:
        text: The input string whose language should be predicted.

    Returns:
        The first (and only) element of the classifier's prediction for
        ``text``.
    """
    # The vectorizer works on a collection of documents, so wrap the single
    # input in a list and take the one prediction that comes back.
    features = vectorizer.transform([text])
    return classifier.predict(features)[0]

# Example usage: run the predictor on one English and one French sample
for user_input in ("This is a test sentence.", "Ceci est une phrase de test."):
    predicted_lang = predict_language(user_input)
    print(f"The predicted language for '{user_input}' is: {predicted_lang}")

              precision    recall  f1-score   support

      Arabic       1.00      0.92      0.96       106
      Danish       0.96      0.89      0.92        73
       Dutch       0.99      0.94      0.96       111
     English       0.98      0.99      0.98       291
      French       1.00      0.96      0.98       219
      German       0.99      0.96      0.97        93
       Greek       1.00      0.94      0.97        68
       Hindi       1.00      0.80      0.89        10
     Italian       0.98      0.92      0.95       145
     Kannada       1.00      1.00      1.00        66
   Malayalam       1.00      0.97      0.98       121
  Portugeese       0.99      0.94      0.96       144
     Russian       0.71      0.99      0.83       136
     Spanish       0.91      0.96      0.93       160
    Sweedish       0.98      0.96      0.97       133
       Tamil       1.00      0.98      0.99        87
     Turkish       1.00      0.93      0.97       105

    accuracy              