# Language Detection NLP model 

In [21]:
import string 
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt 
import seaborn as sms 

In [22]:
# Load the labelled corpus; later cells read its 'Text' column by name.
df = pd.read_csv(filepath_or_buffer='Language Detection.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


## Data preprocessing and exploration

In [23]:
all_punctuation = string.punctuation


def count_punctuation(text):
    """Return how many characters of ``text`` are ASCII punctuation.

    A character counts when it appears in ``all_punctuation``
    (``string.punctuation``); every other character, including
    non-ASCII punctuation marks, is ignored.
    """
    hits = 0
    for symbol in text:
        if symbol in all_punctuation:
            hits += 1
    return hits

# Tally punctuation per row, then add the rows up for a corpus-wide total.
per_row_punctuation = df['Text'].apply(count_punctuation)
total_punctuation_count = per_row_punctuation.sum()

print(total_punctuation_count)

34155


In [24]:
# Deletion table built once so every call strips punctuation in a single pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_pun(text):
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character of ``string.punctuation`` removed and
        the remainder lowercased. Punctuation outside that ASCII set is kept.
    """
    # One translate() pass replaces 32 successive str.replace() scans.
    return text.translate(_PUNCTUATION_TABLE).lower()

# Clean the Text column: punctuation stripped, letters lowercased.
cleaned_text = df['Text'].apply(remove_pun)
df['Text'] = cleaned_text

# Recount punctuation over the cleaned column to confirm the removal worked.
updated_punctuation_count = df['Text'].apply(count_punctuation).sum()

print(updated_punctuation_count)

0


## Train/Test Split of the Dataset

In [25]:
from sklearn.model_selection import train_test_split

# Select features and labels by column name rather than position, so the split
# does not silently pick the wrong columns if the CSV layout changes.
X = df['Text']
Y = df['Language']

# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts; 20% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=.2, random_state=42
)

In [26]:
from sklearn import feature_extraction

# TF-IDF weighting over character unigrams and bigrams.
vec = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
)

In [27]:
from sklearn import pipeline
from sklearn import linear_model

# Two stages: the character TF-IDF vectorizer feeds a logistic-regression classifier.
pipeline_steps = [
    ('vec', vec),
    ('clf', linear_model.LogisticRegression()),
]
model_pipeline = pipeline.Pipeline(pipeline_steps)

In [28]:
# Fit the vectorizer and the classifier together on the training split.
model_pipeline.fit(X=X_train, y=y_train)

In [29]:
# Predicted language labels for the held-out texts.
predict_val = model_pipeline.predict(X=X_test)

In [30]:
from sklearn import metrics

# Share of held-out samples labelled correctly, expressed as a percentage.
accuracy_fraction = metrics.accuracy_score(y_true=y_test, y_pred=predict_val)
accuracy_fraction * 100

98.06576402321083

In [33]:
# Run ad-hoc input through the same cleaning the training text received:
# remove_pun is applied to the data outside the pipeline, so the model itself
# does not strip punctuation from what it is given.
model_pipeline.predict([remove_pun('फातत ोि')])

array(['Hindi'], dtype=object)

In [37]:
import pickle

# Persist the pipeline to disk. The context manager closes the file even if
# pickle.dump raises, so a failed dump cannot leak the open handle.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open('model.pckl', 'wb') as new_file:
    pickle.dump(model_pipeline, new_file)