In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Folder holding the language paragraphs, given relative to the notebook's
# working directory. The loaded bunch is kept in `dataset`; later cells read
# dataset.data, dataset.target and dataset.target_names from it.
languages_data_folder = './data/languages/paragraphs'
dataset = load_files(container_path=languages_data_folder)

In [3]:
# Split the dataset into a training set and a test set (half of the documents
# each). random_state is pinned so that Restart & Run All reproduces the same
# split, and therefore the same classification report and confusion matrix.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.5,
    random_state=0,
)

In [4]:
# Character-level vectorizer: features are n-grams of 1 to 3 characters rather
# than word tokens, and IDF re-weighting is switched off.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    use_idf=False,
)

In [5]:
# Chain the vectorizer defined above with a Perceptron classifier.
# The pipeline instance is stored in a variable named clf.
clf = Pipeline(
    steps=[
        ('vec', vectorizer),
        ('clf', Perceptron()),
    ]
)

In [6]:
# Train the whole pipeline (vectorizer + classifier) on the training split.
# The call is the cell's last expression, so its result is displayed below.
clf.fit(X=docs_train, y=y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
   ...n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False))])

In [7]:
# Run the fitted pipeline on the held-out documents; the predicted labels are
# kept in y_predicted for the evaluation cells that follow.
y_predicted = clf.predict(X=docs_test)

In [8]:
# Print the classification report for the test-set predictions, with rows
# labelled by dataset.target_names.
report = metrics.classification_report(
    y_test,
    y_predicted,
    target_names=dataset.target_names,
)
print(report)

             precision    recall  f1-score   support

         ar       1.00      1.00      1.00        17
         de       1.00      0.99      0.99        74
         en       0.99      1.00      0.99        78
         es       1.00      0.92      0.96        59
         fr       1.00      1.00      1.00        73
         it       0.97      1.00      0.98        32
         ja       1.00      1.00      1.00        41
         nl       1.00      1.00      1.00        19
         pl       0.87      1.00      0.93        20
         pt       0.98      1.00      0.99        46
         ru       1.00      1.00      1.00        32

avg / total       0.99      0.99      0.99       491



In [9]:
# Print the confusion matrix as plain text (nothing is plotted here).
# y_test supplies the true labels and y_predicted the predicted ones.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)
print(cm)

[[17  0  0  0  0  0  0  0  0  0  0]
 [ 0 73  0  0  0  0  0  0  1  0  0]
 [ 0  0 78  0  0  0  0  0  0  0  0]
 [ 0  0  1 54  0  1  0  0  2  1  0]
 [ 0  0  0  0 73  0  0  0  0  0  0]
 [ 0  0  0  0  0 32  0  0  0  0  0]
 [ 0  0  0  0  0  0 41  0  0  0  0]
 [ 0  0  0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0  0  0 46  0]
 [ 0  0  0  0  0  0  0  0  0  0 32]]


In [10]:
# A handful of short, previously unseen sentences to try the fitted pipeline
# on; the predicted label for each one is collected in `predicted`.
sentences = [
    u'I am Luís Eduardo and i am a student.',
    u'Soy Luis Eduardo una chica.',
    u'Je suis Luís Eduardo ça va.',
    u'Я Луис Эдуардо.',
    u'私はです.',
    u'Meu nome é Luís Eduardo.',
    u'Ich heiße Luís Eduardo und ich komme aus Brasilien.',
]
predicted = clf.predict(X=sentences)

In [11]:
# Show each sentence next to the name of the language predicted for it.
for sentence, label_index in zip(sentences, predicted):
    # Predicted values index into dataset.target_names.
    language = dataset.target_names[label_index]
    print(u'The language of "%s" is "%s"' % (sentence, language))

The language of "I am Luís Eduardo." is "pt"
The language of "Soy Luis Eduardo." is "pt"
The language of "Je suis Luís Eduardo." is "pt"
The language of "Я Луис Эдуардо." is "ru"
The language of "私はLuísEduardoです." is "pt"
The language of "Meu nome é Luís Eduardo." is "pt"
The language of "Ich heiße Luís Eduardo und ich komme aus Brasilien." is "de"
