In [1]:
#importing libraries
import string
import pandas as pd
import numpy as np

In [2]:
#Load the language-detection dataset downloaded from Kaggle
#(https://www.kaggle.com/code/martinkk5575/language-detection/data?select=dataset.csv)
#The CSV is expected to sit next to the notebook.
df = pd.read_csv('dataset.csv')
#Preview the first rows (columns: Text, language)
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [3]:
#Translation table that deletes every character in string.punctuation.
#Built once so that each removeSC call is a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def removeSC(text):
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Only the characters listed in ``string.punctuation`` are removed;
    non-ASCII punctuation (for example the full-width comma) is kept.
    Whitespace is left untouched, so removing a symbol that sits between
    two spaces leaves both spaces in place.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation, converted to lowercase.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

In [4]:
#Example: ASCII punctuation is stripped and the text is lowercased;
#the spaces around the removed symbols are kept.
removeSC(" 'Life'can be refer to as the phenomena of the: ! @ .py ,.;:[]")

' lifecan be refer to as the phenomena of the   py '

In [5]:
#Clean every row of the Text column with removeSC
#(the column is overwritten, so the raw text is no longer available in df).
df["Text"] = df["Text"].apply(removeSC)

In [6]:
#Fix the misspelled class label "Portugese" -> "Portuguese".
#The replacement is scoped to the language column: the label is the only
#thing that should change, and the Text column does not need to be scanned.
df = df.assign(language=df["language"].replace("Portugese", "Portuguese"))

In [7]:
#Sanity check: (rows, columns) of the cleaned frame
df.shape

(22000, 2)

In [8]:
#Class balance: number of samples per language label
#(also confirms that the "Portugese" typo is gone)
df.language.value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portuguese    1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: language, dtype: int64

In [9]:
#Separating the independent variable (text) from the dependent variable (language label).
#Columns are selected by name rather than by position so the selection keeps
#working if the CSV ever gains an extra column or the columns are reordered.
x = df["Text"]
y = df["language"]

In [10]:
#Splitting data into training set (80%) and test set (20%).
#random_state is fixed so the split is the same on every run.
#NOTE(review): the split is not stratified; the classes are balanced in df,
#but per-class counts in the test set may still differ slightly - confirm this is fine.
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = .2, random_state = 0)

In [11]:
#Preview the training texts
xTrain

21831    傳真社發表相關調查報道後，運輸及房屋局局長張炳良否認知情。不過傳真社在月日引述政府消息人士所...
2569     industria una ferrería que solo trabaja en tie...
2976     أسس راميرو تحالف مملكتي نافارا وليون الذي استط...
18102    het olympic park werd na afloop van de spelen ...
5168     wilcoxius truncus is een vliegensoort uit de f...
                               ...                        
13123    يعد يلستون أكبر النظم الايكولوجية والبيئة من ج...
19648    tftp aktarım protokolü olarak genellikle udp k...
9845     اعترض اعضاء مجلس النواب على قرار الحل فتدخل ال...
10799    după  de treceri la periheliu cometa pierde ga...
2732     بربینک کیلیفورنیا کا رقبہ  مربع کیلومیٹر ہے او...
Name: Text, Length: 17600, dtype: object

In [12]:
#Preview the held-out test texts
xTest

19654    numerabantur octo collegia duodeviginti refugi...
7261     încleștarea regilor preia acțiunea de unde a l...
3394     jackson soloist  ağustos  tarihinden beri üret...
21584    el artículo  del código penal de la república ...
4741     alfred frenzel jablonec nad nisou  de setembro...
                               ...                        
2839     mahallede ilköğretim okulu vardır mahallenin i...
14216    作為資本主義的始祖國，英國對上述“固定電價”制度有難以接受的心理障礙，所以從年來苦心積慮地設...
12631    อุทยานแห่งชาติน้ำตกโยง ตั้งอยู่บนคาบสมุทรทำให้...
9978     اب میں تم سے ایک سوال کرتا ہوں کہ کیا تم لوگ ا...
19839    jabal ţār al qidr är ett berg i jordanien det ...
Name: Text, Length: 4400, dtype: object

In [13]:
#Preview the training labels (same index as xTrain)
yTrain

21831     Chinese
2569      Spanish
2976       Arabic
18102       Dutch
5168        Dutch
           ...   
13123      Arabic
19648     Turkish
9845       Arabic
10799    Romanian
2732         Urdu
Name: language, Length: 17600, dtype: object

In [14]:
#Preview the held-out test labels (same index as xTest)
yTest

19654         Latin
7261       Romanian
3394        Turkish
21584       Spanish
4741     Portuguese
            ...    
2839        Turkish
14216       Chinese
12631          Thai
9978           Urdu
19839       Swedish
Name: language, Length: 4400, dtype: object

In [15]:
#Define feature extraction: TF-IDF over single characters
#(analyzer="char" with ngram_range=(1,1) means character unigrams only).
from sklearn import feature_extraction
Vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,1), analyzer="char")

In [16]:
#Pipeline (model) definition
from sklearn import pipeline
from sklearn import linear_model
#max_iter is raised to 10000 iterations
#NOTE(review): presumably so the solver can converge on these features - confirm
Logistic = linear_model.LogisticRegression(max_iter= 10000)
#Chain the feature extractor (step 'vec') and the classifier (step 'clf')
#so that raw text can be passed directly to fit/predict.
Model = pipeline.Pipeline([('vec', Vec), ('clf', Logistic)])

In [17]:
#Fitting the whole pipeline (vectorizer + classifier) to the training data
Model.fit(xTrain, yTrain)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char')),
                ('clf', LogisticRegression(max_iter=10000))])

In [18]:
#Predicting the language label of every text in the test set
yPred = Model.predict(xTest)

In [19]:
#Checking the score on the held-out test set
#(for scikit-learn classifiers, score reports mean accuracy).
#Note that this runs the prediction again instead of reusing yPred.
Model.score(xTest, yTest)

0.9686363636363636

In [20]:
#Side-by-side table of the true labels (Actual) and the model output (Predicted);
#the row index is the one carried by yTest.
test = pd.DataFrame({"Actual": yTest, "Predicted": yPred})
test

Unnamed: 0,Actual,Predicted
19654,Latin,Latin
7261,Romanian,Romanian
3394,Turkish,Turkish
21584,Spanish,Spanish
4741,Portuguese,Portuguese
...,...,...
2839,Turkish,Turkish
14216,Chinese,Chinese
12631,Thai,Thai
9978,Urdu,Urdu


Testing

In [21]:
#Manual check with a Chinese fragment.
#NOTE(review): this text is the beginning of a training row (index 21831 in xTrain),
#so it is not an unseen example.
Model.predict(["傳真社發表相關調查報道後"])

array(['Chinese'], dtype=object)

In [22]:
#Manual check with a new English sentence.
#NOTE(review): the input still contains "!" and is not passed through removeSC,
#unlike the training text - confirm this preprocessing mismatch is acceptable.
Model.predict(["thank you for this fantastic dataset!"])

array(['English'], dtype=object)

In [23]:
#Manual check with a mixed-case sentence that was not cleaned with removeSC.
#NOTE(review): presumably the vectorizer's own lowercasing covers this - confirm.
Model.predict(["Nama aku Michael"])

array(['Indonesian'], dtype=object)

In [24]:
#Manual check with the start of row 0 of the dataset.
#NOTE(review): the row may be part of the training split, so this is not
#guaranteed to be an unseen example.
Model.predict(["klement gottwaldi surnukeha palsameeriti"])

array(['Estonian'], dtype=object)

In [25]:
#Manual check with another mixed-case sentence (not cleaned with removeSC)
Model.predict(["Selamat Malam semuanya"])

array(['Indonesian'], dtype=object)

In [26]:
import pickle

#Persist the fitted pipeline (vectorizer + classifier) to disk.
#The with-block closes the file even if pickle.dump raises, so no handle is leaked.
with open("NLP model.pkl","wb") as new_file:
    pickle.dump(Model,new_file)

In [26]:
#Reload the pickled pipeline from disk to verify the save/load round trip.
#NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open("NLP model.pkl", "rb") as file:
    nlpmodel = pickle.load(file)

In [27]:
#The reloaded model should give the same answer as Model did for this sentence
nlpmodel.predict(["Selamat Malam semuanya"])

array(['Indonesian'], dtype=object)