In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,f1_score,recall_score,precision_score
from sklearn.model_selection import cross_val_score
import os
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
path='D:\Data Science\Interview Assignment/dataset.csv'
df=pd.read_csv(path)
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [4]:
df.shape

(22000, 2)

In [5]:
df.isnull().sum()

Text        0
language    0
dtype: int64

In [6]:
df["language"].value_counts()

Pushto        1000
French        1000
Persian       1000
Latin         1000
Russian       1000
Estonian      1000
Japanese      1000
Hindi         1000
Swedish       1000
Indonesian    1000
Romanian      1000
English       1000
Spanish       1000
Thai          1000
Chinese       1000
Turkish       1000
Urdu          1000
Tamil         1000
Portugese     1000
Arabic        1000
Korean        1000
Dutch         1000
Name: language, dtype: int64

In [7]:
unique, counts = np.unique(df['language'], return_counts=True)
unique


array(['Arabic', 'Chinese', 'Dutch', 'English', 'Estonian', 'French',
       'Hindi', 'Indonesian', 'Japanese', 'Korean', 'Latin', 'Persian',
       'Portugese', 'Pushto', 'Romanian', 'Russian', 'Spanish', 'Swedish',
       'Tamil', 'Thai', 'Turkish', 'Urdu'], dtype=object)

### create feature : length of language and length of punctuation

In [8]:
df['length']=df['Text'].apply(lambda x: len(x))

In [9]:
import string
def punctuation_count(txt):
  count=sum(1 for c in txt if c in string.punctuation)
  return count
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [10]:
df['length_punc']=df['Text'].apply((lambda x: punctuation_count(x) ))

In [11]:
df.head()

Unnamed: 0,Text,language,length,length_punc
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian,339,0
1,sebes joseph pereira thomas på eng the jesuit...,Swedish,171,3
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai,251,0
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil,305,3
4,de spons behoort tot het geslacht haliclona en...,Dutch,176,0


### cleaning the text from puncutation and numbers

In [12]:

RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
def strip_punctuation(s):
    return RE_PUNCT.sub(" ", s)

RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
def strip_numeric(s):
    return RE_NUMERIC.sub("", s)

def clean_text(text):
    cleaning_functions = [lambda x: x.lower(), strip_punctuation, strip_numeric]
    for f in cleaning_functions:
        text = f(text)
    return text


In [13]:
sentences = df['Text']
language = df['language']

In [14]:
sentences = [clean_text(sentence) for sentence in sentences]


### splitting data into train 70% and test 30 %

In [15]:
X_train, X_test, y_train, y_test = train_test_split(sentences, 
                                                    language,
                                                    test_size = 0.3,
                                                    random_state = 42,shuffle=True)

### Vectorization using (Tf-idf)


In [16]:
tfidf_vect = TfidfVectorizer( analyzer='char', ngram_range=(1,3), lowercase=False)


 # **building language detection model**

# Randomforest classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

param_grid = { 
    'n_estimators': [20,30,40,50],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
rfc=RandomForestClassifier()
model_rfc= GridSearchCV(rfc, param_grid=param_grid)
text_rfc= Pipeline([('tfidf', tfidf_vect),
                    ('clf', model_rfc)
                    ],verbose=True)
text_rfc.fit(X_train,y_train)

#predictions on test data
predictions_rfc=text_rfc.predict(X_test)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=  15.1s




[Pipeline] ............... (step 2 of 2) Processing clf, total= 7.0min


In [18]:
print(model_rfc.best_params_)

{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 50}


In [19]:
accuracy_score(y_test,predictions_rfc)

0.9687878787878788

In [20]:
print(classification_report(y_test, predictions_rfc, target_names=unique))

              precision    recall  f1-score   support

      Arabic       0.99      1.00      1.00       300
     Chinese       0.97      0.92      0.95       291
       Dutch       0.96      0.97      0.97       313
     English       0.72      0.99      0.83       289
    Estonian       0.98      0.94      0.96       308
      French       0.96      0.98      0.97       287
       Hindi       1.00      0.98      0.99       314
  Indonesian       0.99      0.95      0.97       295
    Japanese       0.95      0.99      0.97       302
      Korean       1.00      0.98      0.99       296
       Latin       0.98      0.86      0.92       312
     Persian       0.99      0.99      0.99       299
   Portugese       0.96      0.93      0.95       293
      Pushto       1.00      0.96      0.98       303
    Romanian       0.99      0.97      0.98       291
     Russian       0.98      1.00      0.99       302
     Spanish       0.98      0.98      0.98       287
     Swedish       0.99    

In [21]:
f1_score_rfc=f1_score(y_test,predictions_rfc,average='weighted')
precision_score_rfc=precision_score(y_test,predictions_rfc,average='weighted')
recall_score_rfc=recall_score(y_test,predictions_rfc,average='weighted')

# Decision tree

In [22]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()

param_grid =[{'criterion':['gini','entropy'],'max_depth':[1,2,3,4,5,6,7]}]
model_dt = GridSearchCV(dt, param_grid=param_grid)
text_dt = Pipeline([('tfidf', tfidf_vect),
                    ('clf', model_dt),
                    ],verbose=True)
text_dt.fit(X_train,y_train)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=  14.1s




[Pipeline] ............... (step 2 of 2) Processing clf, total=11.1min


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern=...
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                                               min_samples_leaf=1,
                                         

In [23]:
print(model_dt.best_params_)

{'criterion': 'entropy', 'max_depth': 7}


In [24]:
predictions_dt=text_dt.predict(X_test)
accuracy_score(y_test,predictions_dt)

0.9259090909090909

In [25]:
print(classification_report(y_test, predictions_dt, target_names=unique))

              precision    recall  f1-score   support

      Arabic       0.99      1.00      1.00       300
     Chinese       0.99      0.96      0.98       291
       Dutch       0.74      0.87      0.80       313
     English       0.71      0.90      0.79       289
    Estonian       0.80      0.88      0.84       308
      French       0.93      0.89      0.91       287
       Hindi       0.95      0.98      0.96       314
  Indonesian       0.96      0.89      0.92       295
    Japanese       1.00      0.98      0.99       302
      Korean       1.00      0.96      0.98       296
       Latin       0.66      0.82      0.73       312
     Persian       0.99      0.98      0.98       299
   Portugese       0.97      0.83      0.89       293
      Pushto       0.99      0.94      0.96       303
    Romanian       1.00      0.92      0.96       291
     Russian       0.98      0.99      0.99       302
     Spanish       0.93      0.91      0.92       287
     Swedish       0.98    

In [26]:
f1_score_dt=f1_score(y_test,predictions_rfc,average='weighted')
precision_score_dt=precision_score(y_test,predictions_dt,average='weighted')
recall_score_dt=recall_score(y_test,predictions_dt,average='weighted')

# Naive Bayes

In [27]:
model = MultinomialNB()
text_clf = Pipeline([('tfidf', tfidf_vect),
                    ('clf', model)
                    ],verbose=True)
text_clf.fit(X_train, y_train)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=  14.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.7s


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=True)

In [28]:
predictions = text_clf.predict(X_test)

In [29]:
accuracy_score(y_test,predictions)

0.9768181818181818

In [30]:
print(classification_report(y_test, predictions, target_names=unique))


              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.99      0.98      0.99       291
       Dutch       0.98      0.97      0.98       313
     English       0.72      1.00      0.84       289
    Estonian       1.00      0.95      0.97       308
      French       0.96      0.99      0.97       287
       Hindi       1.00      0.98      0.99       314
  Indonesian       1.00      0.96      0.98       295
    Japanese       1.00      0.99      0.99       302
      Korean       1.00      0.99      0.99       296
       Latin       0.99      0.90      0.94       312
     Persian       0.99      1.00      0.99       299
   Portugese       0.99      0.94      0.96       293
      Pushto       1.00      0.96      0.98       303
    Romanian       1.00      0.98      0.99       291
     Russian       0.99      1.00      0.99       302
     Spanish       0.98      0.99      0.98       287
     Swedish       1.00    

In [31]:
f1_score_nb=f1_score(y_test,predictions,average='weighted')
precision_score_nb=precision_score(y_test,predictions,average='weighted')
recall_score_nb=recall_score(y_test,predictions,average='weighted')

# kNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier

#define the model and parameters
knn = KNeighborsClassifier()

parameters = {'n_neighbors':[3,4,5,6,7]}

#Fit the model
model_knn = GridSearchCV(knn, param_grid=parameters)
text_knn = Pipeline([('tfidf', tfidf_vect),
                    ('clf', model_knn),
                    ],verbose=True)
text_knn.fit(X_train,y_train)

#predictions on test data
predictions_knn=text_knn.predict(X_test)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=  14.0s




[Pipeline] ............... (step 2 of 2) Processing clf, total= 2.3min


In [33]:
print(model_knn.best_params_)

{'n_neighbors': 3}


In [34]:
accuracy_score(y_test,predictions_knn)

0.9768181818181818

In [35]:
print(classification_report(y_test, predictions_knn, target_names=unique))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.99      0.97      0.98       291
       Dutch       0.97      0.98      0.98       313
     English       0.77      0.98      0.86       289
    Estonian       0.99      0.94      0.96       308
      French       0.96      0.98      0.97       287
       Hindi       1.00      0.98      0.99       314
  Indonesian       0.98      0.99      0.98       295
    Japanese       0.99      0.98      0.98       302
      Korean       1.00      0.99      0.99       296
       Latin       0.97      0.91      0.94       312
     Persian       0.99      1.00      0.99       299
   Portugese       0.98      0.96      0.97       293
      Pushto       0.99      0.97      0.98       303
    Romanian       1.00      0.97      0.99       291
     Russian       0.98      1.00      0.99       302
     Spanish       0.97      0.99      0.98       287
     Swedish       1.00    

In [36]:
f1_score_knn=f1_score(y_test,predictions_knn,average='weighted')
precision_score_knn=precision_score(y_test,predictions_knn,average='weighted')
recall_score_knn=recall_score(y_test,predictions_knn,average='weighted')

# logestic_regression

In [37]:
from sklearn.linear_model import LogisticRegression
param_grid={'C': [0.01, 0.1, 1, 10] }
logis_model=LogisticRegression(multi_class='multinomial',solver='lbfgs')
model_logis=GridSearchCV(logis_model, param_grid=param_grid)
text_logis = Pipeline([('tfidf', tfidf_vect),
                    ('clf', model_logis),
                    ],verbose=True)
text_logis.fit(X_train,y_train)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=  14.0s




[Pipeline] ............... (step 2 of 2) Processing clf, total=36.8min




Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern=...
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None

In [38]:
print(model_logis.best_params_)

{'C': 10}


In [39]:
predictions_logis=text_logis.predict(X_test)

In [40]:
accuracy_score(y_test,predictions_logis)


0.9834848484848485

In [41]:
print(classification_report(y_test, predictions_logis, target_names=unique))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.99      0.99      0.99       291
       Dutch       0.98      0.98      0.98       313
     English       0.86      0.98      0.91       289
    Estonian       0.99      0.96      0.98       308
      French       0.96      0.99      0.97       287
       Hindi       1.00      0.98      0.99       314
  Indonesian       0.99      0.99      0.99       295
    Japanese       1.00      0.99      1.00       302
      Korean       1.00      0.99      0.99       296
       Latin       0.95      0.95      0.95       312
     Persian       1.00      1.00      1.00       299
   Portugese       0.98      0.98      0.98       293
      Pushto       1.00      0.97      0.98       303
    Romanian       1.00      0.98      0.99       291
     Russian       0.98      1.00      0.99       302
     Spanish       0.98      0.98      0.98       287
     Swedish       1.00    

In [42]:
f1_score_logis=f1_score(y_test,predictions_logis,average='weighted')
precision_score_logis=precision_score(y_test,predictions_logis,average='weighted')
recall_score_logis=recall_score(y_test,predictions_logis,average='weighted')

## support vector machine 

In [43]:
from sklearn import svm
model_svm = OneVsRestClassifier(svm.SVC(gamma=0.01, C=10., probability=True, class_weight='balanced', kernel='linear'))
text_svm = Pipeline([('tfidf', tfidf_vect),
                    ('clf', model_svm),
                    ],verbose=True)
text_svm.fit(X_train,y_train)


[Pipeline] ............. (step 1 of 2) Processing tfidf, total=  14.7s
[Pipeline] ............... (step 2 of 2) Processing clf, total=27.5min


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 OneVsRestClassifier(estimator=SVC(C=10.0, cache_size=200,
                                                   class_weight='balanc

In [44]:
predictions_svm=text_svm.predict(X_test)

In [45]:
accuracy_score(y_test,predictions_svm)


0.9860606060606061

In [46]:
print(classification_report(y_test, predictions_svm, target_names=unique))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       300
     Chinese       0.99      0.99      0.99       291
       Dutch       0.99      0.98      0.99       313
     English       0.89      0.99      0.93       289
    Estonian       0.99      0.97      0.98       308
      French       0.97      0.98      0.98       287
       Hindi       1.00      0.99      0.99       314
  Indonesian       0.99      0.99      0.99       295
    Japanese       1.00      0.99      1.00       302
      Korean       1.00      0.99      1.00       296
       Latin       0.96      0.96      0.96       312
     Persian       1.00      1.00      1.00       299
   Portugese       0.97      0.99      0.98       293
      Pushto       1.00      0.97      0.99       303
    Romanian       1.00      0.98      0.99       291
     Russian       0.98      1.00      0.99       302
     Spanish       0.98      0.98      0.98       287
     Swedish       1.00    

In [47]:
f1_score_svm=f1_score(y_test,predictions_svm,average='weighted')
precision_score_svm=precision_score(y_test,predictions_svm,average='weighted')
recall_score_svm=recall_score(y_test,predictions_svm,average='weighted')

## Show  Tthe  Algorithm’s Accuracy.

In [48]:
list_precision=[precision_score_rfc,precision_score_dt,precision_score_nb,precision_score_knn,precision_score_logis,precision_score_svm]
list_recall=[recall_score_rfc,recall_score_dt,recall_score_nb,recall_score_knn,recall_score_logis,recall_score_svm]
list_f1=[f1_score_rfc,f1_score_dt,f1_score_nb,f1_score_knn,f1_score_logis,f1_score_svm]


df_1=pd.DataFrame(list_precision, index=['Random Forest','Decicion Tree','Naive_Bayes','KNN','logistic','Svm'])
df_1.columns =['precision']
df_1.insert(loc=1,column='recall',value=list_recall)
df_1.insert(loc=2,column='f1_score',value=list_f1)
df_1.columns.name = 'Algorithm'
df_1

Algorithm,precision,recall,f1_score
Random Forest,0.973414,0.968788,0.969774
Decicion Tree,0.935032,0.925909,0.969774
Naive_Bayes,0.981436,0.976818,0.977937
KNN,0.979585,0.976818,0.977473
logistic,0.984473,0.983485,0.983733
Svm,0.986627,0.986061,0.986195


## Make a predictions with Naive Bayes Model

In [70]:
text_clf.predict(['Dell is an American multinational computer technology company'])[0]

'English'

In [71]:
text_clf.predict(['戴爾是美國跨國計算機技術公司'])[0]

'Chinese'

In [72]:
text_clf.predict(['Dell é uma empresa multinacional americana de tecnologia da computação'])[0]

'Portugese'

In [73]:
text_clf.predict(['डेल एक अमेरिकी बहुराष्ट्रीय कंप्यूटर प्रौद्योगिकी कंपनी है'])[0]

'Hindi'

In [74]:
text_clf.predict(['Dell - американская транснациональная компьютерная компания.'])[0]

'Russian'

In [75]:
text_clf.predict(['Dell은 미국의 다국적 컴퓨터 기술 회사입니다.'])[0]

'Korean'

In [76]:
text_clf.predict(['Dell är ett amerikanskt multinationellt datateknikföretag'])[0]

'Swedish'