In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from scipy import interp
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

### Load the data

In [17]:
#open the locally saved csv
df = pd.read_csv('C:/Users/595217/general-assembly/project-final/data/mgm.csv', usecols=['description', 'jobtype', 'usetype'])

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17669 entries, 0 to 17668
Data columns (total 3 columns):
description    17669 non-null object
jobtype        17669 non-null object
usetype        17669 non-null object
dtypes: object(3)
memory usage: 414.2+ KB


### Create the features and target variables

In [19]:
df = df[df.usetype != 'Mixed Occupancy']

In [20]:
X = df['description']
z = df['usetype']
y = df['jobtype']

In [21]:
# y = y.map({'Commercial': 0, 'Residential': 1})
y = y.map({'New': 0, 'Existing': 1, 'Alteration': 2, 'Repair': 3, 'Other': 4, 'Addition': 5})

In [51]:
y.value_counts(), z.value_counts()

(0    4820
 1    4017
 2    3083
 3    2322
 4    2264
 5    1143
 Name: jobtype, dtype: int64, Residential    10439
 Commercial      7210
 Name: usetype, dtype: int64)

In [23]:
# y_bin = label_binarize(y, classes=[0,1,2])
# n_y_bin_classes = y_bin.shape[1]
z_bin = label_binarize(z, classes=[0,1,2,3,4,5])
n_z_bin_classes = z_bin.shape[1]

In [24]:
X_tr, X_te, y_tr, y_te, = train_test_split(X, y, test_size=0.30, random_state=42)

In [25]:
# create the BOW representation
bow_transform = text.CountVectorizer(min_df=0, stop_words="english")
X_tr_bow = bow_transform.fit_transform(X_tr)
X_te_bow = bow_transform.transform(X_te)
len(bow_transform.vocabulary_)

8190

In [26]:
#create tf-idf representation using the bow matrix
tfidf_trfm = text.TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_trfm.transform(X_te_bow)
X_te_tfidf.shape

(5295, 8190)

### Instantiate a random forest classifier, run on the target variables and score the model

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [27]:
#instantiate the classifier
rf = OneVsRestClassifier(RandomForestClassifier())

In [28]:
rf.fit(X_tr_tfidf, y_tr)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
y_pred = rf.predict(X_te_tfidf)
print(rf.__class__.__name__, accuracy_score(y_te, y_pred))

RandomForestClassifier 0.9242681775259679


In [30]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("StDev:", scores.std())

In [31]:
y_tr_tfidf_scores = cross_val_score(rf, X_tr_tfidf, y_tr, cv=10, scoring="accuracy")

In [32]:
display_scores(y_tr_tfidf_scores)

Scores: [0.90153349 0.9135004  0.90703314 0.90850202 0.91255061 0.91093117
 0.90923825 0.91247974 0.91247974 0.89384117]
Mean: 0.9082089747223531
StDev: 0.005867384824918001


In [33]:
y_te_tfidf_scores = cross_val_score(rf, X_te_tfidf, y_te, cv=10, scoring="accuracy")

In [34]:
display_scores(y_te_tfidf_scores)

Scores: [0.85928705 0.82330827 0.86037736 0.85849057 0.82986767 0.83175803
 0.84877127 0.86174242 0.84469697 0.81024668]
Mean: 0.8428546298295612
StDev: 0.017186729511305333


In [35]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [36]:
model = Sequential()

In [37]:
model.add(Dense(12, input_dim=8190, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, input_dim=8190, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, input_dim=8190, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, input_dim=8190, kernel_initializer='uniform', activation='sigmoid'))

In [38]:
model.output_shape

(None, 1)

In [39]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 12)                98292     
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 104       
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 98,477
Trainable params: 98,477
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [41]:
model.fit(X_tr_tfidf, y_tr, batch_size=32, epochs=100, verbose=1, validation_data=(X_te_tfidf, y_te))

Train on 12354 samples, validate on 5295 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x15e0c2a54e0>

In [42]:
score = model.evaluate(X_te_tfidf, y_te, batch_size=32)



In [43]:
y_pred = model.predict(X_te_tfidf, batch_size=32)

### Convert predictions to text labels and join with the main dataframe to see where the errors occurred 

In [44]:
y_pred_classes = []
for i in y_pred:
    if i >= .5:
        y_pred_classes.append(1)
    else:
        y_pred_classes.append(0)

In [45]:
real_v_pred = pd.concat([y_te.reset_index(), pd.Series(y_pred_classes)], axis=1, join='inner', ignore_index=True).reset_index(drop=True).set_index([1])

In [46]:
real_v_pred.reset_index(inplace=True)
real_v_pred.columns = ['test', 'index', 'pred']

In [47]:
errors = real_v_pred[real_v_pred.test != real_v_pred.pred]
errors.set_index(keys='index', inplace=True)

In [48]:
df_errors = df.join(errors, how='inner')

In [49]:
df_errors.to_csv('df_errors.csv')

### Create a deep learning model with keras and compare to the random forest

In [50]:
from sklearn.metrics import roc_curve
y_pred_keras = model.predict(X_te_tfidf).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_te, y_pred_keras)

ValueError: multiclass format is not supported

In [None]:
y_pred_keras = model.predict(X_te_tfidf, batch_size=32).ravel()

In [None]:
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_te, y_pred_keras)

In [None]:
from sklearn.metrics import auc
auc_keras = auc(fpr_keras, tpr_keras)

In [None]:
y_pred_rf = rf.predict_proba(X_te_tfidf)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_te, y_pred_rf)
auc_rf = auc(fpr_rf, tpr_rf)

In [None]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
# Zoom in view of the upper left corner.
plt.figure(2)
plt.xlim(0, 0.2)
plt.ylim(0.98, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()