## Imports

In [1]:
# Install dependencies
## pandas: for data preprocessing
!pip install pandas
## scikit-learn: for model training
!pip install scikit-learn

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/56/1b/4ae75a5f50e4c703a1b21f1b8a95b039040f8f53f9767816d87b6c5fd2bb/pandas-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata
  Downloading pandas-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (18 kB)
Collecting numpy>=1.23.2 (from pandas)
  Obtaining dependency information for numpy>=1.23.2 from https://files.pythonhosted.org/packages/a9/84/baf694be765d68c73f0f8a9d52151c339aed5f2d64205824a6f29021170c/numpy-1.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K    

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import joblib

## Data Exploration

In [3]:
data = pd.read_csv("./language_detection.csv")
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
data["Language"].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

### Correct spelling mistakes

In [5]:
replacements = {
    "Sweedish": "Swedish",
    "Portugeese": "Portuguese"
}
for old, new in replacements.items():
    print("Renaming language", old, "to", new, "...")
    data.loc[data["Language"] == old, "Language"] = new

data["Language"].value_counts()

Renaming language Sweedish to Swedish ...
Renaming language Portugeese to Portuguese ...


Language
English       1385
French        1014
Spanish        819
Portuguese     739
Italian        698
Russian        692
Swedish        676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

## Model

### Training

In [6]:
def train_model(data):
    x = np.array(data["Text"])
    y = np.array(data["Language"])
    cv = CountVectorizer()
    X = cv.fit_transform(x)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    print(f"# DATASET\nDataset size: {X.shape[0]} items, \nTrain size: {X_train.shape[0]} items, \nTest size: {X_test.shape[0]} items\n")
    model = MultinomialNB()
    model.fit(X_train, y_train)

    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    print(f"# ACCURACY\nTrain accuracy: {train_accuracy:.2%}\nTest accuracy: {test_accuracy:.2%}")
    return model, cv

model, cv = train_model(data)

# DATASET
Dataset size: 10337 items, 
Train size: 8269 items, 
Test size: 2068 items

# ACCURACY
Train accuracy: 99.12%
Test accuracy: 98.40%


### Saving the model

In [7]:
def save_model(model, vectorizer, filename = "multinomial_language_detector.joblib"):
    full_model = (model, vectorizer)
    paths = joblib.dump(full_model, filename)
    for path in paths:
        print("Saved model to", path)

save_model(model, cv)

Saved model to multinomial_language_detector.joblib


### Loading the model

In [8]:
model, cv = joblib.load("multinomial_language_detector.joblib")
texts = ["Guten Tag"]
vectorized = cv.transform(texts)
prediction = model.predict(vectorized)[0]
print("Prediction:", prediction)

Prediction: German
