In [11]:
"""Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to 3 consecutive characters so as to be
able to recognize natural languages by using the frequencies of short
character sequences as 'fingerprints'.

"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Location of the language paragraphs, relative to the notebook's working
# directory, and the dataset object loaded from it.
languages_data_folder = 'data/languages/paragraphs'
dataset = load_files(container_path=languages_data_folder)

In [4]:
# Split the dataset into training and test sets (half of the documents each).
# random_state is fixed so that the split, and therefore every score reported
# below, is reproducible after a kernel restart.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5, random_state=0)

In [14]:
# TASK: Build a vectorizer that works on character n-grams (runs of 1 to 3
# characters) rather than on word tokens
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    analyzer='char',
)

In [16]:
# TASK: Build a vectorizer / classifier pipeline using the previous analyzer;
# the pipeline instance should be stored in a variable named clf.
# The `vectorizer` instance built in the previous cell is reused here instead
# of constructing a second, identically configured TfidfVectorizer, so the
# n-gram settings are defined in exactly one place.
clf = Pipeline([
    ('vect', vectorizer),
    ('classifier', Perceptron()),
])

In [17]:
# TASK: Train the pipeline (vectorizer + classifier) on the training half.
# The call is the cell's last expression, so the fitted pipeline is displayed.
clf.fit(docs_train, y=y_train)



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...andom_state=0, shuffle=True, tol=None,
      validation_fraction=0.1, verbose=0, warm_start=False))])

In [20]:
# TASK: Run the fitted pipeline on the held-out documents and keep the
# predicted labels in a variable named y_predicted
y_predicted = clf.predict(X=docs_test)

In [21]:
# Summarise precision / recall / f1 per language on the test set
report = metrics.classification_report(
    y_test,
    y_predicted,
    target_names=dataset.target_names,
)
print(report)

              precision    recall  f1-score   support

          ar       1.00      1.00      1.00         9
          de       1.00      1.00      1.00        76
          en       0.99      1.00      0.99        74
          es       0.98      1.00      0.99        61
          fr       1.00      1.00      1.00        67
          it       1.00      0.95      0.98        42
          ja       1.00      1.00      1.00        38
          nl       1.00      1.00      1.00        15
          pl       1.00      1.00      1.00        27
          pt       1.00      1.00      1.00        49
          ru       1.00      1.00      1.00        34

   micro avg       1.00      1.00      1.00       492
   macro avg       1.00      1.00      1.00       492
weighted avg       1.00      1.00      1.00       492



In [23]:
# Compute and print the confusion matrix of true vs. predicted test labels
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)
print(cm)

[[ 9  0  0  0  0  0  0  0  0  0  0]
 [ 0 76  0  0  0  0  0  0  0  0  0]
 [ 0  0 74  0  0  0  0  0  0  0  0]
 [ 0  0  0 61  0  0  0  0  0  0  0]
 [ 0  0  0  0 67  0  0  0  0  0  0]
 [ 0  0  1  1  0 40  0  0  0  0  0]
 [ 0  0  0  0  0  0 38  0  0  0  0]
 [ 0  0  0  0  0  0  0 15  0  0  0]
 [ 0  0  0  0  0  0  0  0 27  0  0]
 [ 0  0  0  0  0  0  0  0  0 49  0]
 [ 0  0  0  0  0  0  0  0  0  0 34]]


In [25]:
# Sanity-check the trained pipeline on a few short, unseen sentences
sentences = [
    'This is a language detection test.',
    'Ceci est un test de d\xe9tection de la langue.',
    'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

# Each prediction is an index into dataset.target_names
for sentence, label_index in zip(sentences, predicted):
    language = dataset.target_names[label_index]
    print('The language of "%s" is "%s"' % (sentence, language))

The language of "This is a language detection test." is "en"
The language of "Ceci est un test de détection de la langue." is "fr"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "de"
