In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from scipy import interp
from sklearn.metrics import roc_curve, auc


seed = 42
np.random.seed(seed)

### Load the data

In [137]:
#open the locally saved csv
df = pd.read_csv('data/mgm.csv', usecols=['description', 'jobtype', 'usetype'])

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17514 entries, 0 to 17513
Data columns (total 3 columns):
description    17514 non-null object
jobtype        17514 non-null object
usetype        17514 non-null object
dtypes: object(3)
memory usage: 410.6+ KB


### Create the features and target variables

In [156]:
df = df[df.usetype != 'Mixed Occupancy']

In [157]:
X = df['description']
y = df['jobtype']

In [158]:
y = y.map({'New': 0, 'Existing': 1, 'Alteration': 2, 'Repair': 3, 'Other': 4, 'Addition': 5})

In [159]:
print ('Shape of Sparse Matrix: ', X.shape)
print ('Amount of Non-Zero occurences: ', X.nnz)
print ('sparsity: %.2f%%' % (100.0 * X.nnz /
                             (X.shape[0] * X.shape[1])))

Shape of Sparse Matrix:  (17494,)


AttributeError: 'Series' object has no attribute 'nnz'

In [160]:
from sklearn.preprocessing import label_binarize
y_bin = label_binarize(y, classes=[0,1,2,3,4,5])
n_y_bin_classes = y_bin.shape[1]

In [161]:
y_bin

array([[0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [162]:
from sklearn.pipeline import Pipeline

In [163]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm=None)),
])

In [164]:
X = pipeline.fit_transform(X)

In [165]:
X.shape

(17494, 9013)

In [166]:
X_tr, X_te, y_tr, y_te, = train_test_split(X, y, test_size=0.20, random_state=42)

In [167]:
X_tr.shape

(13995, 9013)

In [168]:
y_tr.shape

(13995,)

In [169]:
matrix_len = X_tr.shape[1]
matrix_len

9013

### Instantiate a random forest classifier, run on the target variables and score the model

In [170]:
#instantiate the classifier
rf = RandomForestClassifier(random_state=42)

In [171]:
rf.fit(X_tr, y_tr)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [172]:
y_pred = rf.predict(X_te)
print(rf.__class__.__name__, accuracy_score(y_te, y_pred))

RandomForestClassifier 0.9314089739925693


In [173]:
from sklearn.metrics import classification_report
print (classification_report(y_te, y_pred))

             precision    recall  f1-score   support

          0       0.97      0.98      0.97       908
          1       0.87      0.89      0.88       795
          2       0.94      0.91      0.93       650
          3       0.93      0.95      0.94       446
          4       0.94      0.92      0.93       483
          5       0.96      0.91      0.93       217

avg / total       0.93      0.93      0.93      3499



In [174]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("StDev:", scores.std())

In [175]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [176]:
y_tr_scores = cross_val_score(rf, X_tr, y_tr, cv=kfold, scoring="accuracy")

In [177]:
display_scores(y_tr_scores)

Scores: [0.91714286 0.92785714 0.92928571 0.92642857 0.92357143 0.92852037
 0.92566119 0.9235168  0.92637598 0.92208721]
Mean: 0.9250447258245688
StDev: 0.0034312492038413175


In [178]:
y_te = cross_val_score(rf, X_te, y_te, cv=kfold, scoring="accuracy")

In [179]:
display_scores(y_te)

Scores: [0.79714286 0.82       0.81428571 0.81142857 0.81142857 0.84
 0.84857143 0.80571429 0.81428571 0.79083095]
Mean: 0.8153688088415881
StDev: 0.01670765232567413


In [None]:
# Plot a learning curve chart
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores =\
                learning_curve(estimator=rf,
                               X=X_tr,
                               y=y_tr,
                               train_sizes=np.linspace(0.1, 1.0, 10),
                               cv=kfold,
                               n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.title('Accuracy')
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.75, 1.01])
plt.tight_layout()
plt.show()

In [112]:
del(model)

In [113]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.constraints import maxnorm
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [116]:
model = Sequential()

In [117]:
model.add(Dense(100, input_dim=matrix_len, kernel_initializer='uniform', activation='relu', kernel_constraint=maxnorm(3)))
model.add(Dropout(0.2))
model.add(Dense(100, input_dim=matrix_len, kernel_initializer='uniform', activation='relu', kernel_constraint=maxnorm(3)))
model.add(Dropout(0.2))
model.add(Dense(1, input_dim=matrix_len, kernel_initializer='uniform', activation='softmax'))

In [118]:
model.output_shape

(None, 1)

In [119]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 100)               901400    
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 1)                 101       
Total params: 911,601
Trainable params: 911,601
Non-trainable params: 0
_________________________________________________________________


In [125]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [129]:
X_tr.shape

(13995, 9013)

In [130]:
y_tr.shape

(13995,)

In [132]:
X_te.shape

(3499, 9013)

In [134]:
y_te.shape

(10,)

In [128]:
model.fit(X_tr, y_tr, epochs=100, verbose=1, validation_data=(X_te, y_te))

ValueError: Input arrays should have the same number of samples as target arrays. Found 3499 input samples and 10 target samples.

In [None]:
score = model.evaluate(X_te, y_te, batch_size=32)

In [None]:
y_pred = model.predict(X_te, batch_size=32)

### Create a deep learning model with keras and compare to the random forest

In [None]:
from sklearn.metrics import roc_curve
y_pred_keras = model.predict(X_te_tfidf).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_te, y_pred_keras)

In [None]:
y_pred_keras = model.predict(X_te_tfidf, batch_size=32).ravel()

In [None]:
y_pred_keras2 = np.squeeze(model.predict(X_te_tfidf))
threshold = 0.5

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_te, y_pred_keras2 > threshold))

In [None]:
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_te, y_pred_keras)

In [None]:
from sklearn.metrics import auc
auc_keras = auc(fpr_keras, tpr_keras)

In [None]:
y_pred_rf = rf.predict_proba(X_te_tfidf)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_te, y_pred_rf)
auc_rf = auc(fpr_rf, tpr_rf)

In [None]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
# Zoom in view of the upper left corner.
plt.figure(2)
plt.xlim(0, 0.2)
plt.ylim(0.98, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()

In [None]:
y_pred_classes = []
for i in y_pred:
    if i > .5:
        y_pred_classes.append(1)
    else:
        y_pred_classes.append(0)

In [None]:
# Plot the confusion matrix
from sklearn.metrics import confusion_matrix
confmat_y = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(confmat_y, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat_y.shape[0]):
    for j in range(confmat_y.shape[1]):
        ax.text(x=j, y=i, s=confmat_y[i, j], va='center', ha='center')

plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.xticks([0, 1], ['Commercial', 'Residential'])
plt.yticks([0, 1], ['Commercial', 'Residential'], rotation='vertical', va='center')
ax.grid(False)
plt.tight_layout()
plt.show()

### Convert predictions to text labels and join with the main dataframe to see where the errors occurred 

In [None]:
real_v_pred = pd.concat([y_te.reset_index(), pd.Series(y_pred_classes)], axis=1, join='inner', ignore_index=True).reset_index(drop=True).set_index([1])

In [None]:
real_v_pred.reset_index(inplace=True)
real_v_pred.columns = ['test', 'index', 'pred']

In [None]:
errors = real_v_pred[real_v_pred.test != real_v_pred.pred]
errors.set_index(keys='index', inplace=True)

In [None]:
df_errors = df.join(errors, how='inner')

### Descriptions of where 'Residential' was misclassified

In [None]:
for row in df_errors['description'][df_errors.usetype == 'Residential']:
    print("-", row)

### Descriptions of where 'Commercial' was misclassified

In [None]:
for row in df_errors['description'][df_errors.usetype == 'Commercial']:
    print("-", row)