In [5]:
import numpy as np
from sklearn.datasets import load_files
# Read the corpus from the unzipped wikidata folder (the path is hard-coded
# relative to the notebook's working directory).
try:
    dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
    # Raise instead of calling exit(): exit() is a helper meant for the
    # interactive shell, whereas an exception stops this cell (and any
    # "Run All") with a visible traceback and keeps the original error
    # attached as the cause.
    raise OSError(
        "Couldn't import the data, did you unzip the wikidata.zip folder?"
    ) from ex

# Raw documents and their integer class labels.
docs = dataset.data
y = dataset.target

# TASK: Split the dataset in training and test set
# (use 20% of the data for test):
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for evaluation; random_state is pinned so
# the same split is produced on every run.
docs_train, docs_test, y_train, y_test = train_test_split(
    docs,
    y,
    test_size=0.20,
    random_state=42,
)

# TASK: Build a vectorizer that splits
# strings into sequences of 1 to 3
# characters instead of word tokens
# using the class TfidfVectorizer
#
# NOTE: the range actually used below is 1 to 4 characters, not the 1 to 3
# the task statement suggests; the notebook's later note says that 4 seems
# to perform well.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))

# TASK: Use the function make_pipeline to build a
#       vectorizer / classifier pipeline
#       using the previous analyzer
#       and a classifier of choice.
#       The pipeline instance should be
#       stored in a variable named model
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Classifier of choice: logistic regression with its default settings.
clf = LogisticRegression()

# Chain the character n-gram vectorizer and the classifier so raw documents
# can be passed straight to fit / predict.
model = make_pipeline(vectorizer, clf)

# TASK: Fit the pipeline on the training set
model.fit(docs_train, y_train)

from sklearn.metrics import classification_report, confusion_matrix

# TASK: Predict the outcome on the testing set.
# Store the result in a variable named y_predicted
y_predicted = model.predict(docs_test)

# TASK: Print the classification report
# (per-class precision / recall / f1 on the held-out documents)
print(classification_report(y_test, y_predicted))

# TASK: Print the confusion matrix. Bonus points if you make it pretty.
# (called with the true labels first, then the predictions)
print(confusion_matrix(y_test, y_predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        62
          1       0.97      0.98      0.97       193
          2       0.91      0.99      0.95       221
          3       0.95      0.94      0.95       213
          4       0.97      0.99      0.98       223
          5       0.97      0.97      0.97       198
          6       1.00      0.82      0.90        78
          7       0.99      0.96      0.98       112
          8       0.97      0.94      0.95       207
          9       1.00      0.98      0.99       185

avg / total       0.97      0.97      0.96      1692

[[ 62   0   0   0   0   0   0   0   0   0]
 [  0 189   3   1   0   0   0   0   0   0]
 [  0   1 219   0   1   0   0   0   0   0]
 [  0   0   1 201   0   4   0   1   6   0]
 [  0   0   1   1 221   0   0   0   0   0]
 [  0   0   3   2   0 193   0   0   0   0]
 [  0   5   6   1   2   0  64   0   0   0]
 [  0   0   2   0   1   0   0 108   1   0]
 [  0   0   2   6

Try using a different n-gram size; an upper bound of 4 characters seems to perform well:

In [6]:
# Hold out 20% of the documents; the pinned random_state keeps the split
# identical from run to run.
docs_train, docs_test, y_train, y_test = train_test_split(
    docs,
    y,
    test_size=0.20,
    random_state=42,
)

# TF-IDF features over character n-grams of length 1 through 4.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))

# Default logistic regression on top of the vectorizer.
clf = LogisticRegression()
model = make_pipeline(vectorizer, clf)
model.fit(docs_train, y_train)

# Evaluate on the held-out documents.
y_predicted = model.predict(docs_test)
print(classification_report(y_test, y_predicted))
print(confusion_matrix(y_test, y_predicted))


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        62
          1       0.97      0.98      0.97       193
          2       0.91      0.99      0.95       221
          3       0.95      0.94      0.95       213
          4       0.97      0.99      0.98       223
          5       0.97      0.97      0.97       198
          6       1.00      0.82      0.90        78
          7       0.99      0.96      0.98       112
          8       0.97      0.94      0.95       207
          9       1.00      0.98      0.99       185

avg / total       0.97      0.97      0.96      1692

[[ 62   0   0   0   0   0   0   0   0   0]
 [  0 189   3   1   0   0   0   0   0   0]
 [  0   1 219   0   1   0   0   0   0   0]
 [  0   0   1 201   0   4   0   1   6   0]
 [  0   0   1   1 221   0   0   0   0   0]
 [  0   0   3   2   0 193   0   0   0   0]
 [  0   5   6   1   2   0  64   0   0   0]
 [  0   0   2   0   1   0   0 108   1   0]
 [  0   0   2   6

In [11]:
# This experiment holds out only 10% of the documents for testing
# (same pinned random_state as the other cells).
docs_train, docs_test, y_train, y_test = train_test_split(
    docs,
    y,
    test_size=0.10,
    random_state=42,
)

# TF-IDF features over character n-grams of length 1 through 4.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))

# Default logistic regression; fit() returns the pipeline itself, so the
# fitted pipeline can be bound to `model` in one step.
clf = LogisticRegression()
model = make_pipeline(vectorizer, clf).fit(docs_train, y_train)

# Evaluate on the held-out documents.
y_predicted = model.predict(docs_test)
print(classification_report(y_test, y_predicted))
print(confusion_matrix(y_test, y_predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        28
          1       0.97      0.97      0.97       107
          2       0.96      1.00      0.98       109
          3       0.95      0.92      0.93       121
          4       0.98      1.00      0.99       106
          5       0.97      0.99      0.98        96
          6       1.00      0.87      0.93        39
          7       0.98      0.98      0.98        51
          8       0.93      0.95      0.94        98
          9       1.00      0.99      0.99        91

avg / total       0.97      0.97      0.97       846

[[ 28   0   0   0   0   0   0   0   0   0]
 [  0 104   2   1   0   0   0   0   0   0]
 [  0   0 109   0   0   0   0   0   0   0]
 [  0   0   0 111   0   3   0   1   6   0]
 [  0   0   0   0 106   0   0   0   0   0]
 [  0   0   0   1   0  95   0   0   0   0]
 [  0   3   1   0   1   0  34   0   0   0]
 [  0   0   0   0   0   0   0  50   1   0]
 [  0   0   0   4

In [12]:
# Show the label names stored on the loaded dataset.
# NOTE(review): presumably the integer class ids in `y` index into this
# list - confirm against the load_files documentation.
dataset.target_names

['ar', 'de', 'en', 'es', 'fr', 'it', 'nl', 'pl', 'pt', 'ru']

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the documents; the pinned random_state keeps the split
# identical from run to run.
docs_train, docs_test, y_train, y_test = train_test_split(
    docs, y, test_size=0.2, random_state=42)

# TF-IDF features over character n-grams of length 1 through 4.
vectorizer = TfidfVectorizer(ngram_range=(1, 4),
                             analyzer='char')

# Random forests are randomized; seed the estimator (same seed as the split)
# so the report below can be reproduced on a fresh kernel.
clf = RandomForestClassifier(random_state=42)
model = make_pipeline(vectorizer, clf)
model.fit(docs_train, y_train)

# Evaluate on the held-out documents.
y_predicted = model.predict(docs_test)

print(classification_report(y_test, y_predicted))

print(confusion_matrix(y_test, y_predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        62
          1       0.85      0.91      0.88       193
          2       0.73      0.92      0.81       221
          3       0.71      0.69      0.70       213
          4       0.82      0.82      0.82       223
          5       0.80      0.86      0.83       198
          6       1.00      0.42      0.59        78
          7       1.00      0.92      0.96       112
          8       0.82      0.73      0.77       207
          9       1.00      0.98      0.99       185

avg / total       0.84      0.83      0.83      1692

[[ 62   0   0   0   0   0   0   0   0   0]
 [  0 176   6   3   5   1   0   0   2   0]
 [  0   3 203   5   8   1   0   0   1   0]
 [  0   1  15 148   7  19   0   0  23   0]
 [  0   5  15  13 183   6   0   0   1   0]
 [  0   1  10   6   6 170   0   0   5   0]
 [  0  21  15   0   7   2  33   0   0   0]
 [  0   0   4   3   0   1   0 103   1   0]
 [  0   0   8  29

In [26]:
# Hold out 10% of the documents for testing (pinned random_state).
docs_train, docs_test, y_train, y_test = train_test_split(
    docs,
    y,
    test_size=0.10,
    random_state=42,
)

# TF-IDF features over character n-grams of length 1 through 4.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))

# Logistic regression with C=10 instead of the estimator's default value.
clf = LogisticRegression(C=10)

# fit() returns the pipeline itself, so `model` is the fitted pipeline.
model = make_pipeline(vectorizer, clf).fit(docs_train, y_train)

# Evaluate on the held-out documents.
y_predicted = model.predict(docs_test)
print(classification_report(y_test, y_predicted))
print(confusion_matrix(y_test, y_predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        28
          1       1.00      0.99      1.00       107
          2       0.99      1.00      1.00       109
          3       0.95      0.94      0.95       121
          4       0.98      1.00      0.99       106
          5       0.97      0.99      0.98        96
          6       1.00      1.00      1.00        39
          7       1.00      0.98      0.99        51
          8       0.96      0.95      0.95        98
          9       1.00      0.99      0.99        91

avg / total       0.98      0.98      0.98       846

[[ 28   0   0   0   0   0   0   0   0   0]
 [  0 106   0   1   0   0   0   0   0   0]
 [  0   0 109   0   0   0   0   0   0   0]
 [  0   0   0 114   1   3   0   0   3   0]
 [  0   0   0   0 106   0   0   0   0   0]
 [  0   0   0   1   0  95   0   0   0   0]
 [  0   0   0   0   0   0  39   0   0   0]
 [  0   0   0   0   0   0   0  50   1   0]
 [  0   0   0   4

In [27]:
import dill
import gzip

# Persist the pipeline currently bound to `model` together with the
# dataset's label names, as a single gzip-compressed dill file, so both can
# be restored together.
with gzip.open('my_model.dill.gz', mode='wb') as f:
    dill.dump(
        [model, dataset.target_names],
        f,
    )