In [1]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

In [2]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('LangDec.csv')

In [3]:
# Preview the first rows to see what the Text and Language columns look like.
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(10337, 2)

In [5]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [6]:
# Rows per language label. The labels are used exactly as spelled in the
# dataset (e.g. 'Portugeese', 'Sweedish'), and the classes are imbalanced
# (English has 1385 rows, Hindi only 63 in the recorded output).
df['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [7]:
import spacy

# Only lemmas and the lexical is_stop / is_punct flags are read below, so the
# dependency parser and the named-entity recognizer are disabled; they are
# never consulted and only add processing time per document.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])


def preprocess(text):
    """Drop stop words and punctuation from `text` and lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw text to run through the spaCy pipeline.

    Returns
    -------
    str
        The lemmas of the kept tokens, joined by single spaces.

    Notes
    -----
    NOTE(review): `nlp` is an English pipeline, so the stop-word list and
    lemmatization it applies are English ones. This function is also applied
    to the non-English rows of the dataset -- confirm that is intended.
    """
    return ' '.join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    )


# Quick sanity check on a short English sentence.
preprocess('loki is eating pizza')

'loki eat pizza'

In [8]:
# Run preprocess() over every row and overwrite the Text column in place.
# NOTE(review): this cell is not idempotent -- re-running it without reloading
# the CSV applies preprocess() to text that has already been processed.
df['Text'] = df['Text'].apply(preprocess)

In [9]:
# Inspect the Text column after preprocessing.
df.Text

0          Nature broad sense natural physical material...
1        nature refer phenomenon physical world life ge...
2                               study nature large science
3        human nature human activity understand separat...
4        1 word nature borrow Old french nature derive ...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ್...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Features are the text column (position 0); targets are the language
# labels (position 1).
X = df['Text']
y = df['Language']

In [12]:
# Display the feature series (preprocessed text).
X

0          Nature broad sense natural physical material...
1        nature refer phenomenon physical world life ge...
2                               study nature large science
3        human nature human activity understand separat...
4        1 word nature borrow Old french nature derive ...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ್...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [13]:
# Display the target series (language labels).
y

0        English
1        English
2        English
3        English
4        English
          ...   
10332    Kannada
10333    Kannada
10334    Kannada
10335    Kannada
10336    Kannada
Name: Language, Length: 10337, dtype: object

In [14]:
# Hold out 20% of the rows for evaluation.
# random_state fixes the shuffle so the split (and every metric derived from
# it) is reproducible across kernel restarts; stratify=y keeps the per-language
# proportions the same in both splits, which matters for the small classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# TF-IDF features over character n-grams of length 1 and 2.
vc = TfidfVectorizer(ngram_range=(1,2),analyzer='char')

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic regression classifier with scikit-learn's default settings.
lg = LogisticRegression()

In [19]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so that fit/predict accept raw
# strings. The steps reuse the `vc` (character n-gram TF-IDF) and `lg`
# instances configured in the cells above rather than constructing fresh
# default estimators, which would silently ignore that configuration.
clf = Pipeline([
    ('vc', vc),
    ('lg', lg),
])

In [None]:
# Fit the vectorizer and the classifier on the training split.
clf.fit(X_train,y_train)

In [None]:
# Predict language labels for the held-out test split.
y_pred=clf.predict(X_test)

In [None]:
from sklearn import metrics

In [None]:
# Accuracy on the test split, expressed as a percentage.
metrics.accuracy_score(y_test,y_pred)*100

In [31]:
# Spot-check the model on a raw English string.
# NOTE(review): the training text went through preprocess() first, but this
# input does not -- confirm the train/inference mismatch is intended.
clf.predict(['My Name is JAY'])

array(['English'], dtype=object)

In [30]:
# Spot-check the model on a raw Arabic string.
# NOTE(review): as with the English check, this input is not passed through
# preprocess() the way the training text was -- confirm that is intended.
clf.predict(['أنا جاي'])

array(['Arabic'], dtype=object)

In [33]:
import pickle

In [34]:
# Serialize the pipeline to model.pkl. The context manager closes the file
# handle even if pickle.dump raises, which a manual open()/close() pair
# would not.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [35]:
import os