In [1]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
%matplotlib inline

In [2]:
# Sanity check: invoke the fastText command-line tool with no arguments.
# It responds with its usage text listing the supported sub-commands
# (the function below relies on `predict-prob`).
!fasttext

usage: fasttext <command> <args>

The commands supported by fasttext are:

  supervised              train a supervised classifier
  quantize                quantize a model to reduce the memory usage
  test                    evaluate a supervised classifier
  predict                 predict most likely labels
  predict-prob            predict most likely labels with probabilities
  skipgram                train a skipgram model
  cbow                    train a cbow model
  print-word-vectors      print word vectors given a trained model
  print-sentence-vectors  print sentence vectors given a trained model
  print-ngrams            print ngrams given a trained model and word
  nn                      query for nearest neighbors
  analogies               query for analogies
  dump                    dump arguments,dictionary,input/output vectors



In [3]:
def fasttext_predict_prob(model_file_name, query_list, encoding='utf-8',
                          cmd='fastText.exe', k=1):
    """Run ``<cmd> predict-prob`` over a list of queries and return the parsed output.

    The queries are written, one per line, to a private temporary file that is
    handed to the fastText command-line tool together with the model path. The
    temporary file is removed afterwards, whether or not the tool succeeded.

    Parameters
    ----------
    model_file_name : str
        Path of the trained model file passed to ``predict-prob``.
    query_list : iterable of str
        Queries to classify. Each one is written on its own line, so a query
        containing a newline would be seen by the tool as several queries.
    encoding : str, default 'utf-8'
        Encoding used to write the query file and to decode the tool's
        standard output.
    cmd : str, default 'fastText.exe'
        Name or path of the fastText executable.
    k : int, default 1
        Number of labels requested per query (last ``predict-prob`` argument).

    Returns
    -------
    list of list of str
        One entry per line printed by the tool, split on single spaces.

    Raises
    ------
    FileNotFoundError
        If ``cmd`` cannot be found.
    subprocess.CalledProcessError
        If the tool exits with a non-zero status.
    """
    import subprocess
    import tempfile

    # A unique temp file avoids clobbering (or leaving behind) a fixed
    # 'temp.txt' in the working directory.
    fd, query_path = tempfile.mkstemp(suffix='.txt')
    try:
        # The context manager closes the handle even if a write fails, and
        # guarantees the data is flushed before the tool reads the file.
        with os.fdopen(fd, 'wt', encoding=encoding) as fp:
            fp.writelines(x + '\n' for x in query_list)

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        argv = [cmd, 'predict-prob', model_file_name, query_path, str(k)]
        print('Executing : ', ' '.join(argv))

        # check=True surfaces a failing run instead of silently returning [].
        result = subprocess.run(argv, stdout=subprocess.PIPE,
                                encoding=encoding, check=True)
    finally:
        os.remove(query_path)

    return [item.split(' ') for item in result.stdout.splitlines()]

In [4]:
# Load the class names, one per line. Their order matters: later cells use
# each name's position in this list as its integer class index.
# The context manager closes the file even if reading fails; the explicit
# encoding keeps the result independent of the platform's locale default.
with open('dbpedia_csv/classes.txt', 'rt', encoding='utf-8') as fp:
    class_names = [x.strip() for x in fp]
class_names

['Company',
 'EducationalInstitution',
 'Artist',
 'Athlete',
 'OfficeHolder',
 'MeanOfTransportation',
 'Building',
 'NaturalPlace',
 'Village',
 'Animal',
 'Plant',
 'Album',
 'Film',
 'WrittenWork']

In [5]:
# Read the whole test file into memory, one string per line (newline kept).
# The context manager closes the file even if reading or decoding fails.
with open('dbpedia.test', 'rt', encoding='utf-8') as fp:
    lines = fp.readlines()

In [6]:
# Split every line at its FIRST comma: the part before it is the raw label,
# the remainder (whitespace-stripped) is the query text.
# `str.index` raises ValueError for a line that contains no comma.
comma_positions = [line.index(',') for line in lines]
labels = [line[:pos] for line, pos in zip(lines, comma_positions)]
qlist = [line[pos + 1:].strip() for line, pos in zip(lines, comma_positions)]

In [7]:
# Keep only the text after the last "__" in each raw label (presumably this
# drops a fastText-style "__label__" prefix), leaving the bare class name.
labels = [raw_label.split('__')[-1].strip() for raw_label in labels]

In [8]:
# Classify every test query with the trained model; `out` holds one list of
# space-separated tokens per line printed by the tool.
out = fasttext_predict_prob('models/dbpedia_1.bin', query_list=qlist)

Executing :  fastText.exe predict-prob models/dbpedia_1.bin temp.txt 1


In [9]:
# The first token of each output row is the predicted label; reduce it to the
# bare class name the same way the ground-truth labels were reduced.
predicted = [row[0].split('__')[-1].strip() for row in out]

In [10]:
# Map class names to their integer position in `class_names`.
# `list.index` raises ValueError for a name that is not a known class.
y_true = list(map(class_names.index, labels))
y_pred = list(map(class_names.index, predicted))

In [11]:
# Rows are true classes, columns are predicted classes, both ordered as in
# `class_names` (passed via `labels=`).
cm = confusion_matrix(y_true=labels, y_pred=predicted, labels=class_names)
print("Confusion matrix \n", cm)

Confusion matrix 
 [[4767   39    8    4   14   34   50    3    2    0    7   12    9   51]
 [  33 4924    0    0    5    1   30    1    1    0    0    0    2    3]
 [  18    3 4772   13   84    0    8    1    0    0    0   27   17   57]
 [   3    1   18 4960   14    0    0    1    0    2    0    0    1    0]
 [   5    3   47   10 4923    2    4    0    2    1    0    0    1    2]
 [  24    1    0    0    4 4957    8    1    0    0    2    0    1    2]
 [  46   25    1    0    6    7 4880   21    7    2    0    1    1    3]
 [   2    1    0    0    1    0   14 4973    9    0    0    0    0    0]
 [   0    0    1    0    1    0    8   11 4978    0    0    0    0    1]
 [   2    0    0    0    0    1    0    6    0 4940   50    0    0    1]
 [  11    1    0    0    0    0    1    0    0    8 4978    0    0    1]
 [   1    0   10    1    0    0    1    0    0    0    0 4956   19   12]
 [   5    1    3    1    1    2    1    1    0    0    1   14 4937   33]
 [  36    4   15    2    8    3 

In [12]:
# Per-class precision / recall / F1 computed from the integer-encoded labels,
# displayed with the class names and three decimal places.
report = classification_report(y_true, y_pred, target_names=class_names, digits=3)
print(" Classification Report \n", report)

 Classification Report 
                         precision    recall  f1-score   support

               Company      0.962     0.953     0.958      5000
EducationalInstitution      0.984     0.985     0.985      5000
                Artist      0.979     0.954     0.966      5000
               Athlete      0.994     0.992     0.993      5000
          OfficeHolder      0.973     0.985     0.979      5000
  MeanOfTransportation      0.990     0.991     0.991      5000
              Building      0.974     0.976     0.975      5000
          NaturalPlace      0.991     0.995     0.993      5000
               Village      0.996     0.996     0.996      5000
                Animal      0.997     0.988     0.993      5000
                 Plant      0.988     0.996     0.992      5000
                 Album      0.988     0.991     0.990      5000
                  Film      0.983     0.987     0.985      5000
           WrittenWork      0.967     0.976     0.972      5000

             