# **Text Language Identifier**
###**Problem Statement** : To determine in which language a particular text is written.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#importing all the required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
#Reading the CSV file
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP dataset.csv')

In [4]:
#Displaying top 5 rows
df.head()

Unnamed: 0,Text,Language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [5]:
#Displaying the shpae i.e no.of.rows and no.of.columns
df.shape

(22000, 2)

In [6]:
#to check the value count of each language
df['Language'].value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: Language, dtype: int64

In [7]:
#Storing Text column in X variable and Language column in Y variable
X = df['Text']
Y = df['Language']

In [8]:
#Importing LabelEncoder library to encode Y variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(Y)

In [9]:
#Data cleaning and preprocessing
data_list = []
for text in X:
  text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
  text = re.sub(r'[[]]', ' ', text)
  text = text.lower()
  data_list.append(text)

In [10]:
#Vectorizing the text into numerical form with Tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
X = tf.fit_transform(data_list)

In [11]:
print(X)

  (0, 36177)	0.15624603541823798
  (0, 97660)	0.15624603541823798
  (0, 59002)	0.14235402908773076
  (0, 38330)	0.15624603541823798
  (0, 5664)	0.060484346258163214
  (0, 90335)	0.1213362348760253
  (0, 36176)	0.15624603541823798
  (0, 42053)	0.18834225986691777
  (0, 133)	0.22090593953660845
  (0, 22368)	0.2002602023544721
  (0, 45199)	0.13525864049253178
  (0, 51274)	0.1442905517241004
  (0, 97659)	0.15624603541823798
  (0, 48569)	0.15624603541823798
  (0, 98052)	0.10370154988897431
  (0, 54890)	0.15624603541823798
  (0, 83108)	0.05938820154366916
  (0, 93510)	0.13725272850857811
  (0, 118)	0.08720409882997852
  (0, 20129)	0.15624603541823798
  (0, 88842)	0.08398142638013377
  (0, 27232)	0.13184513198187803
  (0, 49938)	0.1124247226389579
  (0, 41921)	0.15624603541823798
  (0, 38021)	0.12569782125101708
  :	:
  (21999, 55959)	0.1931570365935756
  (21999, 28887)	0.1846507124267984
  (21999, 81064)	0.1931570365935756
  (21999, 4546)	0.17010905521579694
  (21999, 33488)	0.16160273104901

In [13]:
#splitting the data of X and Y variable into training data and testing data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2)

##Model-1 Logistic Regression

In [14]:
#Importing 1st classification algorithm & modelling it with training data
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
model1.fit(x_train,y_train)

LogisticRegression()

In [15]:
#predicting 
y1_pred = model1.predict(x_test)

In [16]:
y1_pred

array([ 7, 13, 20, ..., 18, 16, 20])

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y1_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       204
           1       0.54      0.98      0.70       193
           2       1.00      0.97      0.98       224
           3       0.81      0.96      0.88       193
           4       0.98      0.95      0.97       205
           5       0.97      0.97      0.97       179
           6       1.00      0.96      0.98       195
           7       1.00      0.95      0.98       196
           8       0.95      0.59      0.73       221
           9       1.00      0.97      0.98       203
          10       0.97      0.94      0.95       211
          11       1.00      0.96      0.98       226
          12       0.98      0.96      0.97       203
          13       1.00      0.93      0.96       206
          14       1.00      0.99      0.99       198
          15       1.00      0.96      0.98       181
          16       0.96      0.96      0.96       162
          17       1.00    

###Observation : The accuracy we are getting with Logistic Regression is approximately 95%. 

##Model - 2 Decision Tree Classifier

In [20]:
#Importing 2st classification algorithm & modelling it with training data
from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier()
model2.fit(x_train,y_train)

DecisionTreeClassifier()

In [21]:
#predicting
y2_pred = model2.predict(x_test)

In [22]:
y2_pred

array([ 7, 13, 16, ..., 18, 16, 20])

In [23]:
print(classification_report(y_test,y2_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       204
           1       0.73      0.41      0.53       193
           2       0.43      0.92      0.59       224
           3       0.82      0.87      0.85       193
           4       0.78      0.85      0.81       205
           5       0.91      0.89      0.90       179
           6       0.98      0.96      0.97       195
           7       0.95      0.88      0.92       196
           8       0.91      0.45      0.60       221
           9       0.99      0.92      0.95       203
          10       0.84      0.85      0.84       211
          11       1.00      0.98      0.99       226
          12       0.91      0.89      0.90       203
          13       0.99      0.95      0.97       206
          14       0.92      0.90      0.91       198
          15       1.00      0.90      0.95       181
          16       0.80      0.86      0.83       162
          17       0.99    

###Observation : The accuracy we are getting with Decision Tree Classifier is approximately 87%.

##Model - 3 Naive Bayes

In [24]:
#Importing 3rd classification algorithm & modelling it with training data
from sklearn.naive_bayes import MultinomialNB
model3 = MultinomialNB()
model3.fit(x_train, y_train)

MultinomialNB()

In [26]:
#Predicting
y3_pred = model3.predict(x_test)

In [27]:
y3_pred

array([ 7, 13, 20, ..., 18, 16, 20])

In [35]:
print(classification_report(y_test,y3_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       204
           1       0.97      0.48      0.64       193
           2       0.97      0.98      0.98       224
           3       0.64      1.00      0.78       193
           4       0.98      0.95      0.97       205
           5       0.91      0.99      0.95       179
           6       0.99      0.97      0.98       195
           7       0.97      0.96      0.97       196
           8       0.99      0.57      0.72       221
           9       1.00      0.98      0.99       203
          10       0.94      0.93      0.94       211
          11       1.00      1.00      1.00       226
          12       0.98      0.93      0.96       203
          13       0.99      0.95      0.97       206
          14       1.00      0.99      1.00       198
          15       0.99      0.99      0.99       181
          16       0.55      0.98      0.70       162
          17       1.00    

###Observation : The accuracy we are getting with Naive Bayes is approximately 93%

## Model - 4 KNearestNeighbors

In [36]:
#Importing 4th classification algorithm & modelling it with training data
from sklearn.neighbors import KNeighborsClassifier
model4 = KNeighborsClassifier()
model4.fit(x_train,y_train)

KNeighborsClassifier()

In [37]:
#Predicting
y4_pred = model4.predict(x_test)

In [38]:
y4_pred

array([ 7, 13, 20, ..., 18, 16, 20])

In [39]:
print(classification_report(y_test,y4_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       204
           1       0.79      0.69      0.73       193
           2       0.84      1.00      0.91       224
           3       0.76      0.98      0.86       193
           4       0.98      0.93      0.95       205
           5       0.96      0.98      0.97       179
           6       0.99      0.97      0.98       195
           7       0.97      0.96      0.97       196
           8       0.99      0.47      0.64       221
           9       0.72      0.97      0.83       203
          10       0.93      0.93      0.93       211
          11       1.00      1.00      1.00       226
          12       0.98      0.97      0.97       203
          13       0.98      0.95      0.97       206
          14       0.99      0.98      0.99       198
          15       0.99      0.99      0.99       181
          16       0.96      0.96      0.96       162
          17       0.97    

####Observation : The accuracy we are getting with KNearestNeighbor is approximately 94%.

## Model - 5 Random Forest

In [40]:
##Importing 5th classification algorithm & modelling it with training data
from sklearn.ensemble import RandomForestClassifier
model5 = RandomForestClassifier(n_estimators = 70,min_samples_leaf=30,oob_score = True,random_state = 0)
model5.fit(x_train,y_train)

RandomForestClassifier(min_samples_leaf=30, n_estimators=70, oob_score=True,
                       random_state=0)

In [41]:
y5_pred = model5.predict(x_test)

In [42]:
y5_pred

array([ 7, 13, 20, ..., 18, 16, 20])

In [44]:
print(classification_report(y_test,y5_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       204
           1       0.39      0.92      0.54       193
           2       0.97      0.96      0.97       224
           3       0.70      0.93      0.80       193
           4       0.97      0.84      0.90       205
           5       0.87      0.96      0.91       179
           6       1.00      0.96      0.98       195
           7       0.98      0.96      0.97       196
           8       1.00      0.15      0.27       221
           9       1.00      0.82      0.90       203
          10       0.95      0.78      0.85       211
          11       1.00      0.97      0.98       226
          12       0.92      0.90      0.91       203
          13       1.00      0.93      0.96       206
          14       0.95      0.99      0.97       198
          15       1.00      0.89      0.94       181
          16       0.92      0.95      0.94       162
          17       0.99    

###Observation : The accuracy we are getting with Random Forest is approximately 90%.

## Model - 6 SVM (Support Vector Machines)

In [45]:
#Importing 6th classification algorithm & modelling it with training data
from sklearn.svm import SVC
model6 = SVC(kernel = 'linear',C = 0.025,random_state = 0)
model6.fit(x_train,y_train)

SVC(C=0.025, kernel='linear', random_state=0)

In [46]:
y6_pred = model6.predict(x_test)

In [47]:
y6_pred

array([ 7, 15, 20, ..., 18, 16, 20])

In [48]:
print(classification_report(y_test,y6_pred))

              precision    recall  f1-score   support

           0       1.00      0.65      0.79       204
           1       0.00      0.01      0.00       193
           2       1.00      0.82      0.90       224
           3       0.86      0.80      0.83       193
           4       1.00      0.00      0.01       205
           5       1.00      0.82      0.90       179
           6       1.00      0.81      0.90       195
           7       1.00      0.72      0.84       196
           8       0.00      0.00      0.00       221
           9       0.00      0.00      0.00       203
          10       1.00      0.19      0.32       211
          11       1.00      0.86      0.93       226
          12       0.99      0.76      0.86       203
          13       1.00      0.79      0.88       206
          14       1.00      0.67      0.80       198
          15       0.13      1.00      0.23       181
          16       1.00      0.84      0.91       162
          17       1.00    

###Observation : The accuracy we are getting with Support Vector Machine is approximately 58%.

#Conclusion

- Accuracy of Logistic Regression is 95%
- Accuracy of Decision Tree Classifier is 87%
- Accuracy of Naive Bayes is 93%
- Accuracy of K-Nearest Neighbor is 94%
- Accuracy of Random Forest is 90%
- Accuracy of Support vector Machine is 58%

###Cleary, We can tell that Logistic Regression, Naive Bayes, K-Nearest Neighbor and Random Forest these all algorithms are performing well into classifying language based on text.

In [54]:
def predict(text):
  trans = tf.transform([text])

  lang1 = model1.predict(trans)
  lang1 = le.inverse_transform(lang1)
  print("Prediction using Logistic Regression - The langauge is in",lang1[0])

  lang3 = model3.predict(trans)
  lang3 = le.inverse_transform(lang3)
  print("Prediction using Naive - Bayes - The langauge is in",lang3[0])

  lang4 = model4.predict(trans)
  lang4 = le.inverse_transform(lang4)
  print("Prediction using K-Nearest Neighbor - The langauge is in",lang4[0])
  
  lang5 = model5.predict(trans)
  lang5 = le.inverse_transform(lang5)
  print("Prediction using Random Forest - The langauge is in",lang5[0])
  

In [58]:
predict('''A paragraph is a collection of words strung together to make a 
longer unit than a sentence. Several sentences often make to a paragraph. 
There are normally three to eight sentences in a paragraph. Paragraphs can 
start with a five-space indentation or by skipping a line and then starting over.''') #English

Prediction using Logistic Regression - The langauge is in English
Prediction using Naive - Bayes - The langauge is in English
Prediction using K-Nearest Neighbor - The langauge is in English
Prediction using Random Forest - The langauge is in English


In [59]:
predict('''இன்றைய நாகரிக வாழ்வில் விவசாய நிலங்களும் விவசாயிகளும் குறைந்து வருகின்றனர் ,
இதன் காரணமாக குறைந்த இடத்தில அதிக மகசூல் கொடுக்கும் முறைகளை கையாள வேண்டிய நிர்பந்தத்திற்கு
உள்ளாகி உள்ளனர்.இதன் காரணமாக அதிக மகசூல் முறைகளான செயற்கை உரமிடுதல் ,செயற்கை பூச்சி கொல்லிமருந்துகள்
பயன்படுத்துவது சாதாரணமாக ஆகிவிட்டது.செயற்கை ரசாயனங்கள் தங்கள் வேலைகளை செய்வதோடு உணவு விளைபொருள்களிலும்
இணைந்து நம்மை வந்தடைகிறது.''') #Tamil

Prediction using Logistic Regression - The langauge is in Tamil
Prediction using Naive - Bayes - The langauge is in Tamil
Prediction using K-Nearest Neighbor - The langauge is in Tamil
Prediction using Random Forest - The langauge is in Tamil


In [60]:
predict('''有它本来在夏威夷语的含义，但这里指的应该是一个小的知识“块”，而“基”字可以作基石讲，
所以我觉得“基”可以接受。“维”字，本意“系物的大绳” （《高级汉语大词典》），也可以作“网”讲（《集韵》：维,网也），
所以“维”字也是可以接受的。可以参照的一个译法是 ''') #Chinese

Prediction using Logistic Regression - The langauge is in Chinese
Prediction using Naive - Bayes - The langauge is in Chinese
Prediction using K-Nearest Neighbor - The langauge is in Chinese
Prediction using Random Forest - The langauge is in Chinese


In [61]:
predict('''いや、そのような法律はないよ。日本の国語は日本語であるとか、公用語は日本語であるなどと定める条文はないんだ。ただ、法律上「国語」という用例はあるし、
そのような条文は日本語のことを国語といっているのだと理解できるよ。例えば、小学校などで義務教育として行われる普通教育では「読書に親しませ、
生活に必要な国語を正しく理解し、使用する基礎的な能力を養うこと」（学校教育法（昭和22年法律第26号）第21条第５号）が、その教育の目標とされているけれど、
この「国語」は日本語のことでしょう。「国語＝日本語」という一対一対応の関係は、国内の少数言語の話し手や複数の公用語が使用されている国の人から見れば、
自明のこととはいえないのだろうけれど。法律上「日本語」という用語を使うのは、例えば「日本語教育の推進に関する法律」（令和元年法律第48号）のように、外国人が、あるいは外国人も、
用いることを想定しているような場合が多いかもしれないね   ''')   #Japanese

Prediction using Logistic Regression - The langauge is in Japanese
Prediction using Naive - Bayes - The langauge is in Japanese
Prediction using K-Nearest Neighbor - The langauge is in Japanese
Prediction using Random Forest - The langauge is in Chinese


In [62]:
predict(''' Nasafjällsverket har fått dåligt eftermäle eftersom verksamheten under den första perioden bedrevs med tvångsrekryterad arbetskraft. 
Själva gruvbrytningen utfördes till stor del av män från kustsocknarna som uttagits till krigstjänst. Malmtransporterna sköttes av samer med rendragna ackjor.
 De fick ersättning för arbetet, men kunde inte själva välja om de skulle utföra skjutsarna. Renskötseln blev lidande och många samer lämnade regionen, 
 vilket ledde till att de som blev kvar fick tjänstgöra i allt längre perioder och utsattes för allt hårdare press. Många är berättelserna om de hårda 
 bestraffningar och tvångsåtgärder som samerna utsattes för med anledning av Nasafjälls silververk. Det finns dock få belägg för vilka åtgärder som faktiskt 
 vidtogs''')   #Swedish

Prediction using Logistic Regression - The langauge is in Swedish
Prediction using Naive - Bayes - The langauge is in Swedish
Prediction using K-Nearest Neighbor - The langauge is in Swedish
Prediction using Random Forest - The langauge is in Swedish


In [63]:
predict('''Современная католическая экклезиология, основываясь на текстах Нового Завета и раннехристианского Предания, называет преемников апостола 
Петра на римском престоле «наместниками Христа». Католическую претензию на божественную обусловленность первенства Рима отрицают все прочие христианские 
конфессии. В некоторых восточных церквях за папой римским признаётся только почётное право быть «первым среди равных», не считая, что это даёт ему какую-либо
 власть. В других церквях признаётся вытекающая из этого права власть выражать, олицетворять и осуществлять в одном епископе власть всей церкви. 
 Римско-католическая церковь считает примат папы «полной, высшей и всеобщей осуществляемой беспрепятственно властью над всей Церковью». Эта власть также 
 принадлежит коллегии епископов, находящейся в единении с папой.  ''')   #Russian

Prediction using Logistic Regression - The langauge is in Russian
Prediction using Naive - Bayes - The langauge is in Russian
Prediction using K-Nearest Neighbor - The langauge is in Russian
Prediction using Random Forest - The langauge is in Russian


In [64]:
predict('''Primul Mare Premiu al Campionatului Mondial s-a ținut în 1950 la Silverstone. De atunci, 73 de circuite au mai găzduit Mari Premii. 
Mai multe circuite cum ar fi Nürburgring, au găzduit Mari Premii utilizând diferite configurații. Prima cursă la Nürburgring a folosit configurația de 
22,835 km, dar îngrijorările legate de siguranță au rezultat ca Marile Premii recente să folosească un circuit mai scurt, dar mai sigur. 
În primii ani ai campionatului de Formula 1, circuitele se aflau predominant în Europa. Pe măsură ce sportul și-a extins aria de practicare, s-au înmulțit
 și circuitele în întreaga lume. Extinderea în Asia și America este un fenomen mai recent. Din cele 20 de circuite care găzduiesc Mari Premii în 2021,
  aproape jumătate nu au existat pe calendarul F1 înainte de 1999. ''')    #Romanian

Prediction using Logistic Regression - The langauge is in Romanian
Prediction using Naive - Bayes - The langauge is in Romanian
Prediction using K-Nearest Neighbor - The langauge is in Romanian
Prediction using Random Forest - The langauge is in Romanian


In [65]:
predict('''ग्लेशियर नेशनल पार्क अमेरिकी राष्ट्रीय उद्यान है, जो कि कनाडा-संयुक्त राज्य अमेरिका की सीमा पर स्थित है। उद्यान संयुक्त राज्य के उत्तर-पश्चिमी मोंटाना राज्य में स्थित है और कनाडा
 की ओर अल्बर्टा और ब्रिटिश कोलम्बिया प्रांतों से सटा हुआ है। उद्यान दस लाख एकड़ (4,000 किमी2) से अधिक क्षेत्र में फैला हुआ है और इसमें दो पर्वत श्रृंखला (रॉकी पर्वत की उप-श्रेणियाँ),
  130 से अधिक नामित झीलें, 1,000 से अधिक विभिन्न पौधों की प्रजातियां और सैकड़ों वन्यजीवों की प्रजातियां शामिल हैं। इस विशाल प्राचीन पारिस्थितिकी तंत्र को जो कि 16,000 वर्ग मील (41,000 किमी2)
   में शामिल संरक्षित भूमि का भाग है, "क्राउन ऑफ़ द कॉन्टिनेंट इकोसिस्टम" के रूप में संदर्भित किया गया है ''')    #Hindi

Prediction using Logistic Regression - The langauge is in Hindi
Prediction using Naive - Bayes - The langauge is in Hindi
Prediction using K-Nearest Neighbor - The langauge is in Hindi
Prediction using Random Forest - The langauge is in Hindi


In [66]:
predict('''کراچی،         معروف آن لائن انسائیکلوپیڈیا 'وکی پیڈیا' کے پاکستانی صارفین کا ایک اجلاس اتوار کو کراچی میں منعقد ہوا جس میں پاکستان میں وکی پیڈیا کے ذریعے معلومات کے آزادانہ پھیلاو اور اردو و مقامی زبانوں میں معلومات کی زیادہ سے زیادہ فراہمی کے عمل کا جائزہ لیا گیا اور اسے مزید بہتر بنانے کے لیے تجاویز اکٹھی کی گئیں۔ اجلاس میں انگریزی وکی پیڈیا پر پاکستانی صارفین کو متحرک کرنے، مضامین میں تخریب کاری کو روکنے، اردو وکی پیڈیا کی ترویج و تشہیر، آن لائن کمیونٹی کو وکی پیڈیا کی جانب راغب کرنے کے لیے
 اقدامات تجویز کیے گئے اور ان پر عمل کے لیے لائحہ عمل طے کیا گیا۔ اس موقع پر اجلاس کے لیے وکی پیڈیا کے بانی اور وکی میڈیا فاونڈیشن کے سابق چیئرمین جمبو ویلز 
  کا خصوصی پیغام بھی پڑھ کر سنایا گیا جس میں انہوں نے نیک خواہشات کے اظہار کے ساتھ اردو وکیپیڈیا پر خصوصی توجہ رکھنے پر زور دیا۔ اجلاس میں پاکستان کے حوالے سے مضامین تحریر کرنے، موجودہ مضامین میں اضافے، تصاویر کی فراہمی، وکیپیڈیا کی 
  مناسب تشہیر اور انگریزی مضامین کے اردو ترجمے کے لیے کام کرنے
   کا فیصلہ کیا گیا۔ اجلاس میں ثاقب قیوم چودھری، رابعہ ظفر، فہد کیہر، محمد علی مکی، احمد ندیم اعوان اور جمال عبد اللہ عثمان نے شرکت کی۔''')  #Urdu

Prediction using Logistic Regression - The langauge is in Urdu
Prediction using Naive - Bayes - The langauge is in Urdu
Prediction using K-Nearest Neighbor - The langauge is in Urdu
Prediction using Random Forest - The langauge is in Urdu


#**The End**