In [3]:
import string
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as pyplot
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn.model_selection import train_test_split as t_split
from sklearn import metrics

In [26]:
df = pd.read_csv(r'./Language Detection.csv')
eng_df = df.loc[df['Language'] == 'English']
eng_df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [28]:
print(len(eng_df), len(df))

1385 10337


In [29]:
german_df = df.loc[df['Language'] == 'German']
german_df.head()

Unnamed: 0,Text,Language
9498,.Wir sind alle auf der Suche nach schnellen We...,German
9499,"Ich weiß, ich verstehe, was passiert, ist, das...",German
9500,Wie wäre es also mit uns?,German
9501,Lernen Sie einige intelligente neue englische ...,German
9502,Also werden wir heute 10 neue intelligente Wör...,German


In [31]:
french_df = df.loc[df['Language'] == 'French']
french_df.head()

Unnamed: 0,Text,Language
3250,Si vous disposez d'ouvrages ou d'articles de r...,French
3251,Comment ajouter mes sources ?,French
3252,Cette page ou section est en train d'être trad...,French
3253,Vous pouvez aider au développement de Wikipédi...,French
3254,Le mot nature est un terme polysémique (c’est-...,French


In [32]:
spanish_df = df.loc[df['Language'] == 'Spanish']
italian_df = df.loc[df['Language'] == 'Italian']

In [33]:
# Following are all the symbols that we do not need since they don't contribute to the meaning or
# the semantics of the language

for char in string.punctuation:
    print(char, end=" ")
translate_table = dict((ord(char), None) for char in string.punctuation)

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [49]:
# Text Pre-processing steps:

data_eng = []
lang_eng = []

for i, line in eng_df.iterrows():
    line = line['Text']
    print(i, line)
    if len(line) != 0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(translate_table)
        data_eng.append(line)
        lang_eng.append("English")

0  Nature, in the broadest sense, is the natural, physical, material world or universe.
1 "Nature" can refer to the phenomena of the physical world, and also to life in general.
2 The study of nature is a large, if not the only, part of science.
3 Although humans are part of nature, human activity is often understood as a separate category from other natural phenomena.
4 [1] The word nature is borrowed from the Old French nature and is derived from the Latin word natura, or "essential qualities, innate disposition", and in ancient times, literally meant "birth".
5 [2] In ancient philosophy, natura is mostly used as the Latin translation of the Greek word physis (φύσις), which originally related to the intrinsic characteristics that plants, animals, and other features of the world develop of their own accord.
6 [3][4] 
The concept of nature as a whole, the physical universe, is one of several expansions of the original notion;[1] it began with certain core applications of the word φύσις

In [44]:
for i, line in enumerate(data_eng):
    print(i, line)

0  nature in the broadest sense is the natural physical material world or universe
1 nature can refer to the phenomena of the physical world and also to life in general
2 the study of nature is a large if not the only part of science
3 although humans are part of nature human activity is often understood as a separate category from other natural phenomena
4  the word nature is borrowed from the old french nature and is derived from the latin word natura or essential qualities innate disposition and in ancient times literally meant birth
5  in ancient philosophy natura is mostly used as the latin translation of the greek word physis φύσις which originally related to the intrinsic characteristics that plants animals and other features of the world develop of their own accord
6  
the concept of nature as a whole the physical universe is one of several expansions of the original notion it began with certain core applications of the word φύσις by presocratic philosophers though this word ha

In [48]:
data_german = []
data_french = []
data_spanish = []
data_italian = []

lang_german = []
lang_french = []
lang_spanish = []
lang_italian = []

for i, line in german_df.iterrows():
    line = line['Text']
    if len(line) != 0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(translate_table)
        data_german.append(line)
        lang_german.append("German")     

for i, line in french_df.iterrows():
    line = line['Text']
    if len(line) != 0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(translate_table)
        data_french.append(line)
        lang_french.append("French")

for i, line in spanish_df.iterrows():
    line = line['Text']
    if len(line) != 0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(translate_table)
        data_spanish.append(line)
        lang_spanish.append("Spanish")

for i, line in italian_df.iterrows():
    line = line['Text']
    if len(line) != 0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(translate_table)
        data_italian.append(line)
        lang_italian.append("Italian")

In [65]:
df = pd.DataFrame(
    {'Text': data_eng + data_german + data_french + data_spanish + data_italian, 
    'Language': lang_eng + lang_german + lang_french + lang_spanish + lang_italian})
df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,the word nature is borrowed from the old fren...,English


In [66]:
len(df)

4386

In [67]:
# Splitting data into Train and Test data:
X, y = df.iloc[:, 0], df.iloc[:, 1]
X_train, X_test, y_train, y_test = t_split(X, y, test_size=.2, random_state=0)


In [68]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3508,)
(3508,)
(878,)
(878,)


In [69]:
# Vectorizer and Model Fitting Pipeline
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 3), analyzer='char')

#Model
pipe_lr = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('clf', linear_model.LogisticRegression())
])

In [70]:
# Model Training:
pipe_lr.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

In [71]:
# Model Evaluation
y_predict = pipe_lr.predict(X_test)
print(f'Accuracy: {(metrics.accuracy_score(y_test, y_predict)) * 100}')

Accuracy: 98.74715261958997
