<a href="https://www.kaggle.com/code/anandtalware/language-detection-practice-8-september-2023?scriptVersionId=143200366" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Language Detection 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# nltk imports

In [None]:
import nltk 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer



# sklearn imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from wordcloud import WordCloud

# Naive Bayes- MultinomialNB
from sklearn.naive_bayes import MultinomialNB

# Data fetching in dataframe form

In [None]:
# Load the language-detection CSV into a DataFrame; later cells read its
# 'Text' and 'Language' columns.
data = pd.read_csv('/kaggle/input/language-detection/Language Detection.csv')

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
# Dataset dimensions as loaded, before any cleaning step.
n_records, n_features = data.shape
print('Before any cleaning in dataset: ')
print('Total Records in dataset: ', n_records)
print('Total Features in dataset: ', n_features)

# Check missing values in dataset

In [None]:
data.isnull().sum()

- There are no null values in our dataset.

# Check Duplicated records in dataset

In [None]:
data.duplicated().sum()

- The dataset contains 66 duplicate records, so we drop them.

In [None]:
# dropping duplicates
# Rebind instead of mutating in place, and rebuild a gap-free 0..n-1 index so
# that later label-based lookups (e.g. data['Text'][1]) cannot hit a row label
# that was removed together with a duplicate.
data = data.drop_duplicates().reset_index(drop=True)

# check again: this must now be 0
data.duplicated().sum()

# Normal data information

In [None]:
data.info()

In [None]:
# Dataset dimensions once the duplicate rows have been removed.
n_records, n_features = data.shape
print('After any cleaning in dataset: ')
print('Total Records in dataset: ', n_records)
print('Total Features in dataset: ', n_features)

# EDA before preprocessing

### Value counts of target column

In [None]:
data['Language'].value_counts()

In [None]:
# Count plot for the 'Language' column; passing the column as `y` puts the
# languages on the vertical axis so the bars run horizontally.
sns.countplot(y=data['Language'])
plt.title('Value Counts for languages')

plt.show()

# Data Preprocessing

In [None]:
# Set-up for the text preprocessing function defined below.

import string

# All ASCII punctuation characters, as one string.
punc = string.punctuation

# Translation table mapping every punctuation character to None, so that
# str.translate() deletes those characters.
translator = str.maketrans(dict.fromkeys(punc))

def preprocess_text(text):
    """Normalise one piece of text before vectorisation.

    The text is lower-cased, split into word tokens with ``word_tokenize``,
    stripped of the punctuation characters covered by ``translator`` and
    stemmed with ``PorterStemmer``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces.
    """
    # Tokenise the lower-cased text. (Previously the lower-cased copy was
    # computed but the untouched input was tokenised instead.)
    tokens = word_tokenize(text.lower())

    # Strip punctuation characters from every token, then drop tokens that end
    # up empty. This also removes multi-character punctuation tokens such as
    # "..." which a plain `token not in punc` substring test lets through as
    # empty strings, producing doubled spaces in the joined result.
    stripped_tokens = (token.translate(translator) for token in tokens)
    cleaned_tokens = [token for token in stripped_tokens if token]

    # stemming
    stm = PorterStemmer()
    stemmed_tokens = [stm.stem(token) for token in cleaned_tokens]

    return ' '.join(stemmed_tokens)

# Quick sanity check on the second row. Positional access (.iloc) is used
# because the row labels can have gaps after drop_duplicates(), in which case a
# label lookup such as data['Text'][1] raises KeyError.
print(preprocess_text(data['Text'].iloc[1]))

# Create a new dataset that also contains the preprocessed text

In [None]:
# Take an explicit copy of the two columns: adding a column to a plain
# column-selection of `data` triggers pandas' SettingWithCopyWarning, and the
# copy makes it unambiguous that `data` itself is left untouched.
data1 = data[['Text', 'Language']].copy()
data1['Preprocessed_text'] = data1['Text'].apply(preprocess_text)

In [None]:
data1.head()

In [None]:
# check number of unique languages in dataset
data1['Language'].nunique()

In [None]:
from wordcloud import WordCloud 
# Single WordCloud object (800x500 canvas, white background, minimum font
# size 15) that the word-cloud cells below call generate() on.
wc = WordCloud(width=800, height=500, min_font_size=15, background_color='white')

In [None]:
# Word cloud built from every preprocessed English sentence.
is_english = data1['Language'] == 'English'
english_df = data1.loc[is_english]

# Concatenate all English sentences into one space-separated string.
english_words = english_df['Preprocessed_text'].str.cat(sep=' ')
english_wc = wc.generate(english_words)

plt.figure(figsize=(8, 8))
plt.imshow(english_wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
data1['Language'].unique()

In [None]:
# Draw one word cloud per language found in the 'Language' column.
languages = data1['Language'].unique()
banner = '-' * 10

for lang in languages:
    print()
    print(banner, lang, 'Wordcloud', banner)

    # Rows of the current language, merged into one space-separated string.
    in_lang = data1['Language'] == lang
    lang_df = data1.loc[in_lang]
    lang_words = lang_df['Preprocessed_text'].str.cat(sep=' ')

    lang_wc = wc.generate(lang_words)

    plt.figure(figsize=(8, 8))
    plt.imshow(lang_wc)
    plt.axis('off')
    plt.show()
    print('=' * 60)

In [None]:
data1[data1['Language']=='Hindi']

# Checking word counts in each language

In [None]:
from collections import Counter

# For every language: total number of words and the 15 most frequent ones.
languages = data1['Language'].unique()
for position, lang in enumerate(languages, start=1):
    print()
    lang_text = data1.loc[data1['Language'] == lang, 'Preprocessed_text'].tolist()

    # Flatten the sentences into a single list of whitespace-separated words.
    lang_corpus = [word for sentence in lang_text for word in sentence.split()]

    print(f"{position}) Number of words in {lang} : ", len(lang_corpus))
    print(f'Top 15 words in {lang}:', Counter(lang_corpus).most_common(15))
    print('=' * 20)

# Split data into independent and dependent features

In [None]:
X = data1['Preprocessed_text']
y = data1['Language']

# Train test split

In [None]:
# 80/20 split.
# - random_state pins the shuffle so every "Run All" yields the same split and
#   therefore the same scores.
# - stratify=y keeps the language proportions equal in both splits, so no
#   language can end up missing from the training set (this requires every
#   language to have at least two rows).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Shapes of the full feature column and of each split.
print('shape of X:', X.shape)
print()
print('shape of X_train: ',X_train.shape)
# label fixed: this line reports y_train (it used to read "ty_train")
print('shape of y_train: ',y_train.shape)
print('shape of X_test: ',X_test.shape)
print('shape of y_test: ',y_test.shape)

In [None]:
# Number of distinct languages present in each split.
for split_name, labels in (('y_train', y_train), ('y_test', y_test)):
    print(f'number of unique langues in {split_name}: ', labels.nunique())

# Label encoder for target column

In [None]:
label_encoder = LabelEncoder()
# Fit on the complete label column: the encoder only learns the set of class
# names, and fitting on y_train alone makes transform(y_test) raise
# "previously unseen labels" whenever a language is absent from the training
# split.
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Text Vectorization
## 1. CountVectorization

In [None]:
# countvectorizer
cv = CountVectorizer()
# Keep the sparse matrices the vectoriser returns. Converting a
# (documents x vocabulary) matrix to a dense array with .toarray() costs a
# large amount of memory for no benefit: MultinomialNB accepts sparse input and
# .shape works the same way.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

print('shape of X:', X.shape)
print()
print('shape of X_train_cv: ',X_train_cv.shape)
print('shape of X_test_cv: ',X_test_cv.shape)


## 2. Tf-idf Vectorization

In [None]:
# tf-idf vectorizer, limited to the 2500 highest-frequency terms
tv = TfidfVectorizer(max_features=2500)
# Keep the sparse output rather than densifying it with .toarray():
# MultinomialNB accepts sparse matrices directly and this avoids allocating a
# full dense array.
X_train_tv = tv.fit_transform(X_train)
X_test_tv = tv.transform(X_test)

print('shape of X:', X.shape)
print()
print('shape of X_train_tv: ',X_train_tv.shape)
print('shape of X_test_tv: ',X_test_tv.shape)

# Machine Learning algorithms

In [None]:
# first we check only for Naive Bayes MultinomialNB with countvectorized data
mnb = MultinomialNB()
mnb.fit(X_train_cv, y_train_encoded)
y_pred_mnb_cv = mnb.predict(X_test_cv)

print("accuracy score for mnb: ",accuracy_score(y_test_encoded, y_pred_mnb_cv))
print()
# Macro averaging (unweighted mean of the per-language precisions) is used
# because micro-averaged precision is numerically identical to accuracy for
# single-label multiclass predictions and therefore adds no information.
print("precision score for mnb: ",precision_score(y_test_encoded,y_pred_mnb_cv, average='macro'))

In [None]:
# multinomialNB with tfidf
mnb2 = MultinomialNB()
mnb2.fit(X_train_tv, y_train_encoded)
y_pred_mnb_tv = mnb2.predict(X_test_tv)

print("accuracy score for mnb: ",accuracy_score(y_test_encoded, y_pred_mnb_tv))
print()
# Macro averaging (unweighted mean of the per-language precisions) is used
# because micro-averaged precision is numerically identical to accuracy for
# single-label multiclass predictions and therefore adds no information.
print("precision score for mnb: ",precision_score(y_test_encoded,y_pred_mnb_tv, average='macro'))


### The MultinomialNB model trained on CountVectorizer features gave better accuracy and precision than the TF-IDF variant, so we save it to a .pkl file (using joblib; the pickle module would also work) for later use in a website or app.

In [None]:
import joblib

# The classifier alone cannot serve predictions: new text has to go through the
# same fitted CountVectorizer, and the predicted integers have to be mapped
# back to language names with the fitted LabelEncoder. Persist all three.
# (Inference code must also apply preprocess_text before vectorising.)
joblib.dump(mnb, 'multinomialnb.pkl')
joblib.dump(cv, 'count_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')