In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# Load the transformed raw dataset; the two lifetime boundary columns are
# parsed as datetimes rather than left as strings.
raw_data = pd.read_csv(
    '../data/transformed_raw.csv',
    parse_dates=['lifetime_start', 'lifetime_end'],
)

In [4]:
def getPercentNull(df, columnName):
    """Return the fraction of entries in ``df[columnName]`` that are null.

    Despite the name, the result is a ratio between 0 and 1, not a
    percentage: (number of null entries) / (null + non-null entries).
    """
    null_mask = df[columnName].isnull()
    null_count = null_mask.sum()
    present_count = (~null_mask).sum()
    return null_count / (present_count + null_count)

In [5]:
from sklearn.preprocessing import LabelEncoder

def encode(df, column):
    """Label-encode one column of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on ``df[column]`` and the column is
    overwritten with the resulting integer codes. The same (mutated) frame
    is returned for convenience; the fitted encoder is not kept.
    """
    df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [6]:
def generate_candidate_dataset(data_in, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD):
    """Build a filtered copy of ``data_in`` based on per-column null fractions.

    Columns are visited in their original order. For each one the null
    fraction is measured on the data as filtered so far, then:

    * fraction >= COLUMN_DROP_THRESHOLD   -> the column is removed;
    * fraction >= COLUMN_IMPUTE_THRESHOLD -> rows where the column is null
      are removed;
    * otherwise the column is left untouched (imputation is not implemented).

    ``data_in`` itself is never modified.
    """
    candidate = data_in.copy()

    # Snapshot the column labels so removing columns cannot disturb the loop.
    for name in list(candidate.columns):
        null_fraction = getPercentNull(candidate, name)

        if null_fraction >= COLUMN_DROP_THRESHOLD:
            candidate = candidate.drop(columns=[name])
            continue

        if null_fraction >= COLUMN_IMPUTE_THRESHOLD:
            candidate = candidate.dropna(subset=[name])

        # TODO: ADD IMPUTE FUNCTION HERE for columns below the impute threshold.

    return candidate

In [7]:
# Preview the first five rows to sanity-check the load.
raw_data.head(n=5)

Unnamed: 0,roduid,UWI,lifetime_start,lifetime_end,FAILURETYPE,H2S_CONCENTRATION,PrimarySetpoint,SecondarySetpoint,StrokeLength,GrossStrokeLength,...,shallow_max_sideload,max_unguided_sideload,DESANDDEGAS_TYP,CHROME_LENGTH,ENDURALLOY_LENGTH,POLY_LENGTH,NIPPLE_SET_DEPTH,pump_bore,gasanchor_od,lifetime
0,GB42ZGOU04727141361583,005-64-9456,2019-07-16,2020-02-13,Tubing,0.0,80.0,65.0,165.878957,174.734193,...,174.27,174.27,Miller LLC,,,1167.96,8893.9,2.0,4.5,212
1,GB87DDTZ53468840486615,006-40-5581,2006-02-07,2006-09-22,Sucker Rod Pump,0.0,75.0,60.0,144.0,,...,,,Miller LLC,,,,9085.2,1.75,,227
2,GB30HELP48302296915492,006-40-5581,2006-09-23,2009-06-25,Sucker Rod Pump,0.0,75.0,60.0,144.0,,...,,,Miller LLC,,,,9085.2,1.5,,1006
3,GB73EZQN38331541380411,006-57-3389,2017-10-18,2018-05-15,Tubing,0.0,70.0,65.0,165.761084,165.840171,...,,,Miller LLC,,,,11505.2,1.25,,209
4,GB71WFOX64096101197026,006-57-3389,2018-05-18,2018-07-18,Sucker Rod Pump,0.0,70.0,65.0,168.259995,156.448266,...,,,Miller LLC,,,,11387.1,1.25,,61


In [8]:
# Columns holding categorical values. Every one of these that is present in a
# dataset gets label-encoded before modelling; 'FAILURETYPE' is the prediction
# target and is encoded along with the rest.
categorical_columns = [
    'bha_configuration',
    'wellbore_category',
    'packer_vs_tac',
    'rod_sinker_type',
    'manual_scale',
    'rod_make',
    'rod_apigrade',
    'DESANDDEGAS_TYP',
    'rod_has_guides',
    'FAILURETYPE',
]

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import metrics

In [10]:
def run_svc_iteration(df, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD):
    """Train an SVC on a candidate dataset and return its hold-out accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified (the candidate dataset is a copy).
    COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD : float
        Null-fraction thresholds forwarded to ``generate_candidate_dataset``.

    Returns
    -------
    float
        Accuracy on a 20% hold-out split (``random_state=42``).

    Raises
    ------
    ValueError
        If the target column 'FAILURETYPE' did not survive the filtering.
    """
    this_data = generate_candidate_dataset(df, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD)

    if 'FAILURETYPE' not in this_data.columns:
        raise ValueError("Target column 'FAILURETYPE' is missing from the candidate dataset.")

    for column in categorical_columns:
        if column in this_data.columns:
            encode(this_data, column)

    # Columns that must not reach the model: the target, the pump/well
    # identifiers, the raw date columns, and 'Unnamed: 0' -- the row index
    # written out with the CSV, which carries no signal but would otherwise
    # be used as a feature. Filtering by membership also tolerates any of
    # these having already been dropped by the null threshold.
    non_features = {
        'FAILURETYPE',
        'roduid',
        'UWI',
        'lifetime_start',
        'lifetime_end',
        'Unnamed: 0',
    }
    features = [column for column in this_data.columns if column not in non_features]

    X = np.array(this_data[features])
    y = np.array(this_data['FAILURETYPE'])

    # NOTE(review): features go into the SVC unscaled; StandardScaler is
    # imported in this notebook but never applied -- confirm whether scaling
    # was intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    reg_svc = SVC(gamma='auto')
    reg_svc.fit(X_train, y_train)
    y_pred = reg_svc.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

In [11]:
def run_knn_iteration(df, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD, K_VALUES):
    """Cross-validate a KNN classifier on a candidate dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified (the candidate dataset is a copy).
    COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD : float
        Null-fraction thresholds forwarded to ``generate_candidate_dataset``.
    K_VALUES : int
        A single neighbour count, passed as ``n_neighbors``.

    Returns
    -------
    float
        Mean of the 8-fold cross-validated ``roc_auc_ovo`` scores
        (one-vs-one ROC AUC), not accuracy.

    Raises
    ------
    ValueError
        If the target column 'FAILURETYPE' did not survive the filtering.
    """
    this_data = generate_candidate_dataset(df, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD)

    if 'FAILURETYPE' not in this_data.columns:
        raise ValueError("Target column 'FAILURETYPE' is missing from the candidate dataset.")

    for column in categorical_columns:
        if column in this_data.columns:
            encode(this_data, column)

    # Columns that must not reach the model: the target, the pump/well
    # identifiers, the raw date columns, and 'Unnamed: 0' -- the row index
    # written out with the CSV, which carries no signal but would otherwise
    # dominate the unscaled KNN distance. Filtering by membership also
    # tolerates any of these having already been dropped by the null threshold.
    non_features = {
        'FAILURETYPE',
        'roduid',
        'UWI',
        'lifetime_start',
        'lifetime_end',
        'Unnamed: 0',
    }
    features = [column for column in this_data.columns if column not in non_features]

    X = np.array(this_data[features])
    y = np.array(this_data['FAILURETYPE'])

    knn = KNeighborsClassifier(n_neighbors=K_VALUES)
    this_score = cross_val_score(knn, X, y, cv=8, scoring='roc_auc_ovo')

    return this_score.mean()

In [20]:
# Grid search over the column-drop threshold (0.01, 0.06, ..., 0.96) and the
# neighbour count k (1..99), keeping the combination with the highest mean
# cross-validated ROC AUC returned by run_knn_iteration.
max_result = 0
max_threshold = None
max_k_values = None

for this_column_drop_threshold in range(1, 100, 5):
    for this_k_values in range(1, 100, 1):
        try:
            this_result = run_knn_iteration(raw_data, this_column_drop_threshold/100, 0, this_k_values)
        except Exception:
            # Best-effort search: some combinations cannot be fitted and are
            # skipped. Catching Exception (not a bare except) keeps Ctrl-C /
            # kernel interrupt working during this long-running loop.
            continue
        if (this_result > max_result):
            max_result = this_result
            max_threshold = this_column_drop_threshold
            max_k_values = this_k_values

if max_threshold is None:
    # Every combination failed or scored no better than 0; avoid a NameError
    # on the summary line below.
    print("No iteration produced a valid score.")
else:
    print("Best mean ROC AUC is " + str(max_result) + " at " + str(max_threshold) + " and k=" + str(max_k_values) + ".")

Iteration with drop threshold 66 and k values 59 failed.
Iteration with drop threshold 66 and k values 60 failed.
Iteration with drop threshold 66 and k values 61 failed.
Iteration with drop threshold 66 and k values 62 failed.
Iteration with drop threshold 66 and k values 63 failed.
Iteration with drop threshold 66 and k values 64 failed.
Iteration with drop threshold 66 and k values 65 failed.
Iteration with drop threshold 66 and k values 66 failed.
Iteration with drop threshold 66 and k values 67 failed.
Iteration with drop threshold 66 and k values 68 failed.
Iteration with drop threshold 66 and k values 69 failed.
Iteration with drop threshold 66 and k values 70 failed.
Iteration with drop threshold 66 and k values 71 failed.
Iteration with drop threshold 66 and k values 72 failed.
Iteration with drop threshold 66 and k values 73 failed.
Iteration with drop threshold 66 and k values 74 failed.
Iteration with drop threshold 66 and k values 75 failed.
Iteration with drop threshold 6

Iteration with drop threshold 81 and k values 80 failed.
Iteration with drop threshold 81 and k values 81 failed.
Iteration with drop threshold 81 and k values 82 failed.
Iteration with drop threshold 81 and k values 83 failed.
Iteration with drop threshold 81 and k values 84 failed.
Iteration with drop threshold 81 and k values 85 failed.
Iteration with drop threshold 81 and k values 86 failed.
Iteration with drop threshold 81 and k values 87 failed.
Iteration with drop threshold 81 and k values 88 failed.
Iteration with drop threshold 81 and k values 89 failed.
Iteration with drop threshold 81 and k values 90 failed.
Iteration with drop threshold 81 and k values 91 failed.
Iteration with drop threshold 81 and k values 92 failed.
Iteration with drop threshold 81 and k values 93 failed.
Iteration with drop threshold 81 and k values 94 failed.
Iteration with drop threshold 81 and k values 95 failed.
Iteration with drop threshold 81 and k values 96 failed.
Iteration with drop threshold 8

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# Sweep k for a KNN classifier on the pre-built, non-imputed dataset and
# report the k with the highest mean cross-validated one-vs-one ROC AUC.
this_data = pd.read_csv('../rodpump_noImputed.csv')

for column in this_data:
    if (column in categorical_columns):
        encode(this_data, column)

# Remove the target and the columns that are not used as features.
# NOTE(review): unlike run_knn_iteration, 'lifetime_start' is not removed here
# and 'pump_bore' is -- confirm this matches the schema of this file.
features = list(this_data)
features.remove('FAILURETYPE')
features.remove('roduid')
features.remove('UWI')
features.remove('lifetime_end')
features.remove('pump_bore')

# Feature matrix and target vector used for cross-validation.
X = np.array(this_data[features])
y = np.array(this_data['FAILURETYPE'])

best_k = 0
best_metric = 0
for n_neighbors in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors)
    this_score = cross_val_score(knn, X, y, cv=8, scoring='roc_auc_ovo')
    mean_score = this_score.mean()
    if mean_score > best_metric:
        best_k = n_neighbors
        best_metric = mean_score

# The metric is ROC AUC (scoring='roc_auc_ovo'), so it is labelled as such.
print("The best mean ROC AUC of the KNN model using a K value of " + str(best_k) +" is: " + str(best_metric))

The best accuracy of the KNN model unsing a K value of 48 is: 0.6371706328718023


In [20]:
# Class balance of the target: share of each failure type among the non-null labels.
raw_data['FAILURETYPE'].value_counts().pipe(lambda counts: counts / counts.sum())

Tubing             0.522371
Sucker Rod Pump    0.355607
Rods               0.122022
Name: FAILURETYPE, dtype: float64