<a href="https://colab.research.google.com/github/aaryapandya12/E-commerce/blob/main/language_detection_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import joblib
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Load the corpus and normalise the text in one chain so the cell is
# idempotent: re-running it always starts again from the CSV on disk.
#   * dropna: a missing 'Text' would pass through the .str accessor as NaN and
#     then be rejected by the vectorizer; a missing 'Language' has no usable
#     class. Such rows are discarded before any processing.
#   * assign: lower-cases the text and removes every character that is neither
#     a regex word character nor whitespace.
# NOTE(review): in Python's `re`, combining marks (e.g. Indic vowel signs,
# Arabic diacritics) are presumably not matched by \w, so this pattern strips
# them together with punctuation. Left unchanged because any inference code
# must apply the exact same cleaning -- confirm before altering the pattern.
df = (
    pd.read_csv('Language Detection.csv')
    .dropna(subset=['Text', 'Language'])
    .assign(
        Text=lambda frame: frame['Text']
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
    )
)

# Features are the cleaned sentences; targets are the language names.
X = df['Text']
y = df['Language']

# Encode language names as integer class ids; the fitted encoder is kept so
# the ids can be mapped back to names for reporting.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% for evaluation. Stratifying on the encoded labels keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

def make_char_tfidf():
    """Return a fresh, unfitted character n-gram TF-IDF vectorizer.

    Both ensemble members use the same featurisation (character 1- to 4-grams,
    n-grams seen in fewer than 3 documents dropped, vocabulary capped at
    10000 features). Building it in one place keeps the two configurations
    from drifting apart; each call returns a new instance so the two
    pipelines never share a fitted vocabulary.
    """
    return TfidfVectorizer(
        analyzer='char',
        ngram_range=(1, 4),
        min_df=3,
        max_features=10000
    )


# Member 1: Multinomial Naive Bayes on the TF-IDF features.
pipeline = Pipeline([
    ('tfidf', make_char_tfidf()),
    ('clf', MultinomialNB(
        alpha=0.1,
        fit_prior=True
    ))
])

# Member 2: logistic regression. `multi_class='multinomial'` is intentionally
# not passed: the parameter is deprecated in recent scikit-learn releases, and
# with the lbfgs solver on a multiclass target the multinomial formulation is
# already what gets used, so the fitted model is unchanged.
logreg = LogisticRegression(
    max_iter=1000,
    C=1.0,
    solver='lbfgs'
)

# Soft voting averages the members' predicted class probabilities, weighted
# 0.6 (Naive Bayes) to 0.4 (logistic regression).
ensemble = VotingClassifier(
    estimators=[
        ('nb', pipeline),
        ('lr', Pipeline([
            ('tfidf', make_char_tfidf()),
            ('clf', logreg)
        ]))
    ],
    voting='soft',
    weights=[0.6, 0.4]
)

# Fit both ensemble members on the training split.
print("Training model...")
ensemble.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = ensemble.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
# `labels` is passed explicitly so that `target_names` always lines up with
# the encoder's classes, even if some class happens to be absent from both
# y_test and y_pred; without it the report would reject the mismatched
# target_names length.
print("\nClassification Report:\n",
      classification_report(
          y_test, y_pred,
          labels=list(range(len(label_encoder.classes_))),
          target_names=label_encoder.classes_))

# Persist both artifacts: the model predicts integer class ids, so the fitted
# encoder is needed alongside it to turn predictions back into language names.
joblib.dump(ensemble, 'language_detection_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

Training model...





Accuracy: 0.9888781431334622

Classification Report:
               precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       107
      Danish       0.99      0.97      0.98        86
       Dutch       0.98      0.97      0.98       109
     English       0.98      0.99      0.99       277
      French       0.98      0.99      0.98       203
      German       1.00      0.99      0.99        94
       Greek       1.00      1.00      1.00        73
       Hindi       1.00      1.00      1.00        12
     Italian       0.99      0.98      0.98       140
     Kannada       1.00      1.00      1.00        74
   Malayalam       1.00      1.00      1.00       119
  Portugeese       0.98      0.99      0.98       148
     Russian       1.00      0.99      0.99       138
     Spanish       0.99      0.98      0.99       164
    Sweedish       0.99      0.99      0.99       135
       Tamil       1.00      1.00      1.00        94
     Turkish       0.99   

['label_encoder.pkl']