In [1]:
"""Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to 3 consecutive characters so as to be
able to recognize natural languages by using the frequencies of short
character sequences as 'fingerprints'.

"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD
# Modified by: Carlos Ezequiel <cafezequiel@gmail.com>

"Build a language detector model\n\nThe goal of this exercise is to train a linear classifier on text features\nthat represent sequences of up to 3 consecutive characters so as to be\nable to recognize natural languages by using the frequencies of short\ncharacter sequences as 'fingerprints'.\n\n"

In [24]:
# Import packages
from __future__ import print_function

import os
import sys

from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [18]:
# Location of the training data: one sub-folder per language, each holding
# short paragraphs of text. Set the LANGUAGES_DATA_FOLDER environment variable
# to point at your own copy; otherwise the original author's local checkout
# path is used as a fallback.
languages_data_folder = os.environ.get(
    'LANGUAGES_DATA_FOLDER',
    '/Users/carlos/src/scikit-learn/doc/tutorial/text_analytics/data/languages/short_paragraphs')
dataset = load_files(languages_data_folder)

In [26]:
# Sanity check: print how many documents and how many labels were loaded
# (the two counts should agree), then display the language codes.
n_documents = len(dataset.data)
n_labels = len(dataset.target)
print(n_documents, n_labels)
dataset.target_names

8754 8754


['ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru']

In [19]:
# Split the dataset into a training set and a test set of equal size.
# A fixed random_state makes the split, and therefore every score computed
# from it, reproducible across kernel restarts.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5, random_state=0)

In [45]:
# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
# characters instead of word tokens

# First raw training document.
sample = docs_train[0]

# analyzer='char' with ngram_range=(1, 3) extracts character 1-, 2- and
# 3-grams; use_idf=False disables the inverse-document-frequency re-weighting,
# leaving normalised n-gram frequencies as features.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char',
                             use_idf=False)

In [52]:
# TASK: Build a vectorizer / classifier pipeline using the previous analyzer;
# the pipeline instance should be stored in a variable named clf
perceptron = Perceptron()
pipeline_steps = [
    ('tfidf', vectorizer),  # character n-gram feature extraction
    ('est', perceptron),    # linear classifier trained on those features
]
clf = Pipeline(pipeline_steps)

In [53]:
# TASK: Fit the pipeline on the training set
# (fit returns the pipeline itself, so the cell displays its configuration)
clf.fit(X=docs_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True...n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False))])

In [55]:
# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = clf.predict(docs_test)

# dataset.target_names may list a class that has no sample in y_test or
# y_predicted. Restrict the report to the labels that actually occur and pick
# the matching names, so that names and labels always have the same length and
# each row is labelled with the right language.
present_labels = sorted(set(y_test) | set(y_predicted))
present_names = [dataset.target_names[label] for label in present_labels]

# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    labels=present_labels,
                                    target_names=present_names))

# Print the confusion matrix (rows: true labels, columns: predicted labels,
# both in the order of present_labels)
cm = metrics.confusion_matrix(y_test, y_predicted, labels=present_labels)
print(cm)

# Optional visualisation of the confusion matrix (requires matplotlib):
#import matplotlib.pyplot as plt
#plt.matshow(cm, cmap=plt.cm.jet)
#plt.show()

# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

print(dataset.target_names)

# predicted holds integer class indices; map each back to its language code.
for s, p in zip(sentences, predicted):
    print(u'The language of "%s" is "%s"' % (s, dataset.target_names[p]))

             precision    recall  f1-score   support

         ar       0.40      0.99      0.57       152
         de       0.97      0.98      0.98       527
         en       0.95      0.92      0.94       546
         es       0.97      0.83      0.89       545
         fr       0.99      0.92      0.96       524
         it       0.97      0.92      0.94       510
         ja       0.99      0.86      0.92       293
         nl       0.98      0.95      0.97       303
         pl       0.94      0.90      0.92       499
         pt       1.00      0.99      1.00       478

avg / total       0.95      0.92      0.93      4377

[[151   0   1   0   0   0   0   0   0   0]
 [  6 517   2   1   0   0   1   0   0   0]
 [ 34   1 505   1   0   2   2   1   0   0]
 [ 62   2   2 451   2   8   0   1  17   0]
 [ 32   1   3   1 482   3   0   0   2   0]
 [ 25   0   3   4   1 470   0   0   7   0]
 [ 23   7   7   0   0   0 253   1   2   0]
 [ 11   2   0   0   0   0   0 289   0   1]
 [ 33   2   4   7