In [2]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.metrics import (precision_score, recall_score, roc_auc_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
 
# Single seed shared by the train/test split and the random forest so that
# repeated runs of the notebook give the same results.
RF_SEED = 30

In [3]:
# Read the CSV into a DataFrame; the 'lifetime_end' column is parsed as dates.
raw_data = pd.read_csv(
    '../rodpump_noImputed.csv',
    parse_dates=['lifetime_end'],
)

In [3]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance shared by every encode_categorical call. It is re-fitted
# on each call, so afterwards it only remembers the most recently encoded column.
labelencoder = LabelEncoder()


def encode_categorical(df, columnName):
    """Replace one column of ``df`` with integer codes from the shared encoder.

    Parameters
    ----------
    df : DataFrame whose column is overwritten (the frame is modified in place).
    columnName : label of the column to encode.

    Returns
    -------
    The same ``df`` object, with ``columnName`` now holding the encoded values.
    """
    codes = labelencoder.fit_transform(df[columnName])
    df[columnName] = codes
    return df

In [None]:
# Label-encode each categorical column in turn. The order is kept as-is, so the
# shared encoder is left fitted on 'FAILURETYPE' (the last column processed).
for categorical_column in (
    'bha_configuration',
    'wellbore_category',
    'packer_vs_tac',
    'rod_sinker_type',
    'manual_scale',
    'rod_make',
    'rod_apigrade',
    'FAILURETYPE',
):
    raw_data = encode_categorical(raw_data, categorical_column)

In [6]:
# Columns of raw_data used as model inputs (order defines the column order of X).
features = [
    'PrimarySetpoint',
    'SecondarySetpoint',
    'H2S_CONCENTRATION',
    'StrokeLength',
    'bha_configuration',
    'max_unguided_dls',
    'dls_high_in_hole',
    'MAX_INCLINATION',
    'AVG_PRESSURE_TUBING',
    'AVG_PRESSURE_CASING',
    'AVG_DIFFERENTIAL_PRESSURE',
    'AVG_OIL_VOLUME',
    'AVG_WATER_VOLUME',
    'AVG_LIQUID_VOLUME',
    'overall_max_sideload',
    'shallow_max_sideload',
    'max_unguided_sideload',
    'wellbore_category',
    'packer_vs_tac',
    'rod_sinker_type',
    'manual_scale',
    'rod_has_guides',
    'rod_apigrade',
]

In [None]:
# Model inputs: the selected feature columns as a NumPy array.
X = np.array(raw_data[features])
# Model target: the 'FAILURETYPE' column as a NumPy array.
y = np.array(raw_data['FAILURETYPE'])

In [None]:
def split_data_train_model(y, X, test_size=0.2, n_estimators=1000):
    """Hold out a test split and fit a random-forest classifier on the remainder.

    Parameters
    ----------
    y : array-like
        Target class labels, one per row of ``X``.
    X : array-like
        Feature matrix.
    test_size : float, default 0.2
        Fraction of the examples held out as test data.
    n_estimators : int, default 1000
        Number of trees in the forest.

    Returns
    -------
    X_test, y_test, classifier
        The held-out features and labels, and the classifier fitted on the
        training portion only.
    """
    # Seeded split so the same rows land in the test set on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=RF_SEED
    )

    # A classifier is needed here: the notebook imports RandomForestClassifier
    # (not a regressor) and later calls predict_proba on the returned model.
    classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=RF_SEED)
    # Fit on the training split only, keeping the test rows unseen by the model.
    classifier.fit(X_train, y_train)

    return X_test, y_test, classifier

In [None]:
# split_data_train_model hands back the held-out features/labels and the model.
X_test, y_test, rf_model = split_data_train_model(y=y, X=X)

# Score the held-out rows twice: predicted class labels ...
rf_predictions = rf_model.predict(X_test)
# ... and the per-class probabilities.
rf_probabilities = rf_model.predict_proba(X_test)

In [None]:
# Both metrics are averaged over classes with the same "weighted" setting.
metric_options = {"average": "weighted"}
precision = precision_score(y_test, rf_predictions, **metric_options)
recall = recall_score(y_test, rf_predictions, **metric_options)

# Report the two scores for the held-out test data.
print("The Model Precision: {}".format(precision))
print("The Model Recall: {}".format(recall))