In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [4]:
from sklearn.metrics import (precision_score, recall_score, roc_auc_score)


In [5]:
raw_data = pd.read_csv('../rodpump_noImputed.csv', parse_dates=['lifetime_end'])

In [6]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
def encode_categorical(df, columnName):
    df[columnName] = labelencoder.fit_transform(df[columnName])
    return df

In [7]:
raw_data = encode_categorical(raw_data, 'bha_configuration')
raw_data = encode_categorical(raw_data, 'wellbore_category')
raw_data = encode_categorical(raw_data, 'packer_vs_tac')
raw_data = encode_categorical(raw_data, 'rod_sinker_type')
raw_data = encode_categorical(raw_data, 'manual_scale')
raw_data = encode_categorical(raw_data, 'rod_make')
raw_data = encode_categorical(raw_data, 'rod_apigrade')
raw_data = encode_categorical(raw_data, 'FAILURETYPE')

In [8]:
features = ['PrimarySetpoint','SecondarySetpoint','H2S_CONCENTRATION','StrokeLength','bha_configuration','max_unguided_dls','dls_high_in_hole','MAX_INCLINATION','AVG_PRESSURE_TUBING','AVG_PRESSURE_CASING','AVG_DIFFERENTIAL_PRESSURE','AVG_OIL_VOLUME','AVG_WATER_VOLUME','AVG_LIQUID_VOLUME','overall_max_sideload','shallow_max_sideload','max_unguided_sideload','wellbore_category','packer_vs_tac','rod_sinker_type','manual_scale','rod_has_guides','rod_apigrade']

In [9]:
X = np.array(raw_data[features])
# X = data in the form of an array
# use X to predict failure type (y)

In [10]:
y = np.array(raw_data['FAILURETYPE'])
# y = target variable

In [11]:
# divide our dataset into 4 different data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build Basic Random Forest Model

In [12]:
rf_model = RandomForestClassifier()

In [13]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Check Accuracy

In [14]:
rf_model.score(X_train,y_train)
# the 1.0 shows that we have overfitted this training data, 
# which means it won't do well on the test data set

1.0

In [15]:
rf_model.score(X_test,y_test)

0.6258064516129033

# To combat overfitting, build RFM with hyperparameters

hyperparameters include the number of decision trees in the forest and the number of features considered by each tree when splitting a node

n_estimators = number of trees in the forest

max_features = max number of features considered for splitting a node

max_depth = max number of levels in each decision tree

min_samples_split = min number of data points placed in a node before the node is split

min_samples_leaf = min number of data points allowed in a leaf node 

bootstrap = method for sampling data points (with or without replacement)

In [16]:
from sklearn.model_selection import RandomizedSearchCV

In [17]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 4, stop = 10, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]

max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [18]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [19]:
rf_model = RandomForestClassifier()

In [20]:
from sklearn.model_selection import GridSearchCV
rf_grid = GridSearchCV(estimator = rf_model, param_grid = random_grid, cv = 3, verbose = 2, n_jobs = 4)

In [21]:
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 4320 candidates, totalling 12960 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  44 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 1832 tasks      | elapsed:   10.9s
[Parallel(n_jobs=4)]: Done 5080 tasks      | elapsed:   31.4s
[Parallel(n_jobs=4)]: Done 9608 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 12960 out of 12960 | elapsed:  1.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,...
                                              random_state=None, verbose=0,
                                   

In [22]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 9}

In [23]:
rf_grid.score(X_train,y_train)
# better but needs to be improved
# we want this to be closer to 0.625 

0.8465266558966075

In [24]:
rf_grid.score(X_test,y_test)

0.6193548387096774

# Random Forest Results

In [29]:
train_rf_predictions = rf_grid.predict(X_train)
train_rf_probs = rf_grid.predict_proba(X_train)[:, 1]

rf_predictions = rf_grid.predict(X_test)
rf_probs = rf_grid.predict_proba(X_test)[:, 1]