In [43]:
# https://automl.github.io/auto-sklearn/stable/
import numpy as np
import pandas as pd
import os


In [44]:
# Set the path of the spooky author dataset
SPOOKY_PATH = 'spooky'


def load_spooky_dataset(dataset_type):
    filepath = os.path.join(SPOOKY_PATH, dataset_type, dataset_type + ".csv")
    print(filepath)
    return pd.read_csv(filepath)

In [45]:
train = load_spooky_dataset("train")
train.head()

spooky/train/train.csv


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [46]:
train["author"] = train.author.map({'EAP':0, 'HPL':1, 'MWS':2})

In [47]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",0
1,id17569,It never once occurred to me that the fumbling...,1
2,id11008,"In his left hand was a gold snuff box, from wh...",0
3,id27763,How lovely is spring As we looked from Windsor...,2
4,id12958,"Finding nothing else, not even gold, the Super...",1


In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train['text'], train['author'])

In [49]:
import autosklearn.classification

In [50]:
automl = autosklearn.classification.AutoSklearnClassifier()

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape, X_train.shape

((14684, 22286), (14684,))

In [52]:
from sklearn.feature_extraction.text import  TfidfTransformer
tf_transformer = TfidfTransformer()
X_train_tfidf = tf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(14684, 22286)

In [53]:
automl.fit(X_train_tfidf, y_train)

In [54]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tf_transformer.transform(X_test_counts)

In [55]:
y_pred = automl.predict(X_test_tfidf)
y_pred_prob = automl.predict_proba(X_test_tfidf)

In [56]:
# Accuracy
from sklearn.metrics import accuracy_score

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Accuracy Score: 0.845352400409


In [57]:
# Log loss
from sklearn.metrics import log_loss

print("Log Loss:", log_loss(y_test, y_pred_prob))

Log Loss: 0.494008053742


In [58]:
print(automl.show_models())

[(0.140000, SimpleClassificationPipeline({'imputation:strategy': 'median', 'classifier:sgd:loss': 'squared_hinge', 'rescaling:__choice__': 'normalize', 'classifier:sgd:fit_intercept': 'True', 'classifier:sgd:learning_rate': 'invscaling', 'classifier:sgd:average': 'True', 'classifier:sgd:n_iter': 41, 'classifier:sgd:power_t': 0.07376212096528387, 'classifier:sgd:l1_ratio': 4.493338624232962e-06, 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'no_preprocessing', 'classifier:__choice__': 'sgd', 'classifier:sgd:alpha': 0.00043300391715554303, 'balancing:strategy': 'weighting', 'classifier:sgd:eta0': 0.0236040895596051, 'classifier:sgd:penalty': 'elasticnet'},
dataset_properties={
  'multilabel': False,
  'signed': False,
  'target_type': 'classification',
  'multiclass': True,
  'sparse': True,
  'task': 2})),
(0.120000, SimpleClassificationPipeline({'classifier:liblinear_svc:tol': 0.00010000000000000009, 'classifier:liblinear_svc:loss': 'squared_hinge', 'clas

In [59]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.84      0.86      0.85      1993
          1       0.86      0.82      0.84      1398
          2       0.84      0.84      0.84      1504

avg / total       0.85      0.85      0.85      4895



In [60]:
# Confusion Matrix
conf_mx = confusion_matrix(y_test, y_pred)
conf_mx

array([[1719,  109,  165],
       [ 170, 1153,   75],
       [ 163,   75, 1266]])