In [36]:
import pandas as pd
import numpy as np

In [37]:
from sklearn.metrics import (precision_score, recall_score, roc_auc_score)
from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import RandomForestRegressor
 
# set seed to make results reproducible
RF_SEED = 101

In [38]:
raw_data = pd.read_csv('../rodpump_noImputed.csv', parse_dates=['lifetime_end'])

In [39]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
def encode_categorical(df, columnName):
    df[columnName] = labelencoder.fit_transform(df[columnName])
    return df

In [40]:
raw_data = encode_categorical(raw_data, 'bha_configuration')
raw_data = encode_categorical(raw_data, 'wellbore_category')
raw_data = encode_categorical(raw_data, 'packer_vs_tac')
raw_data = encode_categorical(raw_data, 'rod_sinker_type')
raw_data = encode_categorical(raw_data, 'manual_scale')
raw_data = encode_categorical(raw_data, 'rod_make')
raw_data = encode_categorical(raw_data, 'rod_apigrade')
raw_data = encode_categorical(raw_data, 'FAILURETYPE')

In [41]:
features = ['PrimarySetpoint','SecondarySetpoint','H2S_CONCENTRATION','StrokeLength','bha_configuration','max_unguided_dls','dls_high_in_hole','MAX_INCLINATION','AVG_PRESSURE_TUBING','AVG_PRESSURE_CASING','AVG_DIFFERENTIAL_PRESSURE','AVG_OIL_VOLUME','AVG_WATER_VOLUME','AVG_LIQUID_VOLUME','overall_max_sideload','shallow_max_sideload','max_unguided_sideload','wellbore_category','packer_vs_tac','rod_sinker_type','manual_scale','rod_has_guides','rod_apigrade']

In [42]:
X = np.array(raw_data[features])
# X = data in the form of an array
y = np.array(raw_data['FAILURETYPE'])
# y = labels, target variable

In [46]:
# divide our dataset into 4 different data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RF_SEED)

# Build Basic Random Forest Model

In [47]:
rf_model = RandomForestClassifier()

In [48]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Check Accuracy

In [51]:
rf_model.score(X_train,y_train)
# the 1.0 shows that we have overfitted this training data, 
# which means it won't do well on the test data set

1.0

In [52]:
rf_model.score(X_test,y_test)

0.6129032258064516

def split_data_train_model(y, X):
    # 20% examples in test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RF_SEED)
 
    # training data fit
    # 1000 trees
    regressor = RandomForestRegressor(n_estimators=1000, random_state=RF_SEED)
    regressor.fit(X, y)
 
    return X_test, y_test, regressor

# trains and tests the model
X_test, y_test, rf_model = split_data_train_model(y, X)

# model performance with testing data
# class prediction
rf_predictions = rf_model.predict(X_test)
# probability
#rf_probabilities = rf_model.predict_prob(X_test)

# calculate precision
precision = precision_score(y_test, rf_predictions, average="weighted")
# calculate recall
recall = recall_score(y_test, rf_predictions, average="weighted")
 
print("The Model Precision: {}".format(precision))
print("The Model Recall: {}".format(recall))