In [None]:
#import libraries
import csv
import pickle
import re
import string
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
#reading the sentence dataset from tatoeba
# The file is read as plain tab-separated text without a header row.
# quoting=csv.QUOTE_NONE: a double quote inside a sentence is ordinary text.
#   With the default quoting, a sentence containing an unbalanced '"' can
#   swallow the tab/newline characters that follow and merge several rows.
# keep_default_na=False: stops pandas from converting ordinary text such as
#   the language code 'nan' or a sentence reading 'NA' / 'null' into NaN.
# na_values=['\\N']: only the explicit null marker is treated as missing
#   (assumes the Tatoeba export uses \N for unknown values -- confirm against
#   the dataset description).
sentences = pd.read_csv(
    '/kaggle/input/tatoeba/sentences_detailed.csv',
    sep='\t',
    header=None,
    names=['id', 'lang', 'sentence', 'user', 'date1', 'date2'],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
    na_values=['\\N'],
)

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
sentences

In [None]:
# Print the frame summary produced by DataFrame.info().
sentences.info()

In [None]:
# Show the (rows, columns) shape of the loaded frame.
sentences.shape

In [None]:
# Count how many rows carry each value of the 'lang' column.
sentences['lang'].value_counts()

In [None]:
#data preprocessing
# Build a roughly balanced corpus. Each language contributes a fixed-size
# random sample chosen by how many sentences it has; languages with fewer
# than 100 sentences are left out entirely.
#   >= 200 sentences -> 200 sampled
#   150-199          -> 150 sampled
#   100-149          -> 100 sampled
SAMPLE_TIERS = ((200, 200), (150, 150), (100, 100))  # (minimum available, sample size)

sampled_frames = []
# A single groupby pass replaces one full boolean scan of the frame per
# language. sort=False keeps languages in order of first appearance, and rows
# whose 'lang' is missing are dropped by groupby, so the rows drawn (and their
# order) match what per-language filtering with the same seed would give.
for lang, lang_sentences in sentences.groupby('lang', sort=False):
    for min_available, sample_size in SAMPLE_TIERS:
        if len(lang_sentences) >= min_available:
            sampled_frames.append(
                lang_sentences.sample(sample_size, random_state=42)
            )
            break  # tiers are ordered largest first; take the first match only

# Concatenate once: growing a DataFrame inside the loop re-copies everything
# collected so far on every iteration, and starting from an empty frame
# degrades the column dtypes to object.
if sampled_frames:
    languages = pd.concat(sampled_frames, ignore_index=True)
else:
    languages = pd.DataFrame(columns=sentences.columns)



In [None]:
# Bare expression: the notebook renders the sampled frame as this cell's output.
languages

In [None]:
# Count how many sampled rows each 'lang' value contributes.
languages['lang'].value_counts()

In [None]:
#function for removing symbols and numbers from the sentences
def removeSymbolsAndNumbers(text):
    """Return ``text`` lower-cased with ASCII punctuation and digits removed.

    Args:
        text: The sentence to normalise (a ``str``).

    Returns:
        The lower-cased sentence with every character of
        ``string.punctuation`` and every run of digits (``\\d+``) deleted.
        Whitespace is left untouched, so removed tokens can leave
        consecutive spaces behind.
    """
    # Delete punctuation literally with str.translate. Interpolating
    # string.punctuation into a regex character class makes its "\]" pair
    # parse as an escaped "]", which silently leaves backslashes in the text.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # "@" is already part of string.punctuation, so no separate pass is needed.
    text = re.sub(r'\d+', '', text)
    return text.lower()

In [None]:
# Run the cleaning function over every sampled sentence, then display the result.
X1 = languages['sentence'].map(removeSymbolsAndNumbers)
X1

In [None]:
# Target labels: the 'lang' column of the sampled frame.
y = languages['lang']

In [None]:
#splitting the training and testing data
# No test_size is passed, so scikit-learn's default split proportion applies;
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X1,
    y,
    random_state=42,
)

In [None]:
# Count missing entries in the training features and the training labels.
missing_x, missing_y = (split.isna().sum() for split in (x_train, y_train))

# Report both counts.
print("Missing values in x_train:", missing_x)
print("Missing values in y_train:", missing_y)


In [None]:
# Character-level TF-IDF features over n-grams of length 1 to 3.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))

In [None]:
# Chain the vectorizer and a default LogisticRegression into one estimator,
# so raw text goes in and a language label comes out.
model = pipeline.Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('clf', LogisticRegression()),
])

In [None]:
#training the model
# Fits every pipeline step (vectorizer, then classifier) on the training split.
model.fit(x_train,y_train)

In [None]:
#initializing prediction, accuracy and confusion matrix
# Predict on the held-out split, then score those predictions against the
# true labels.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
#checking the accuracy
# `accuracy` is the accuracy_score of the test-split predictions.
print("Accuracy is :",accuracy)

In [None]:
# Print scikit-learn's per-class text report for the test-split predictions.
print(classification_report(y_test,y_pred))

In [None]:
#plotting the confusion matrix
# Rows of a scikit-learn confusion matrix are the true classes and columns the
# predicted ones. The axes are labelled accordingly so the figure can be read
# on its own. Ticks stay as class indices in the order confusion_matrix used.
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(cm, annot=True, ax=ax)
ax.set(
    title='Confusion matrix',
    xlabel='Predicted language (class index)',
    ylabel='True language (class index)',
)
plt.show()

In [None]:
def predict(text):
    """Print the language label the fitted pipeline assigns to ``text``.

    Args:
        text: Raw sentence to classify (a ``str``).

    Returns:
        None. The predicted label is printed, not returned.
    """
    # The pipeline was fitted on sentences passed through
    # removeSymbolsAndNumbers, so apply the same cleaning here; otherwise
    # punctuation and digits yield n-grams the model never saw in training.
    cleaned = removeSymbolsAndNumbers(text)
    language = model.predict([cleaned])
    print('The Language is in',language[0])

In [None]:
# Spot-check the model on hand-written sentences; the trailing comment on each
# line names the language the author expects.
predict("こんにちは、元気ですか")  # Japanese
predict("hi how are you ")  # English
predict("ஹாய் எப்படி இருக்கிறீர்கள் ")  # Tamil
predict("हैलो, क्या हाल हैं ")  # Hindi
predict("ഹായ്, സുഖമാണോ ")  # Malayalam
predict("Привет, как дела ")  # Russian
predict("hola, cómo estás ")  # Spanish
predict("VÉRIFICATION DU MODÈLE DE DÉTECTION DE LA LANGUE")  # French
predict("توففحص نموذج الكشف عن اللغة")  # Arabic
predict("我們試試看！")  # Mandarin

In [None]:
#to save the model locally
# Serialise the fitted pipeline to the file 'lang_model' in the working
# directory (pickle is already imported in the notebook's first cell).
with open('lang_model', mode='wb') as model_file:
    pickle.dump(model, model_file)