In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the transformed dataset; both lifetime bounds are parsed as datetimes.
raw_data = pd.read_csv(
    '../data/transformed_raw.csv',
    parse_dates=['lifetime_start', 'lifetime_end'],
)

In [3]:
def getPercentNull(df, columnName):
    """Return the share of null entries in ``df[columnName]``.

    Despite the name, the result is a fraction (null count divided by the
    total number of entries), not a value scaled to 0-100.
    """
    null_mask = df[columnName].isnull()
    null_count = null_mask.sum()
    present_count = (~null_mask).sum()
    return null_count / (present_count + null_count)

In [17]:
from sklearn.preprocessing import LabelEncoder

def encode(df, column):
    """Label-encode ``df[column]`` and return ``df``.

    The column is overwritten on the frame that was passed in, so the
    caller's DataFrame is modified; the same object is returned for
    convenience.
    """
    encoder = LabelEncoder()
    # After fitting, encoder.classes_[code] gives the original value that was
    # mapped to each integer code, should the mapping need to be inspected.
    encoded_values = encoder.fit_transform(df[column])
    df[column] = encoded_values
    return df

In [5]:
def generate_candidate_dataset(data_in, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD):
    """Build a cleaned copy of ``data_in`` based on per-column null fractions.

    Columns are visited in their original order and handled as follows:

    * null fraction >= COLUMN_DROP_THRESHOLD: the column is removed;
    * otherwise, null fraction >= COLUMN_IMPUTE_THRESHOLD: rows that are
      null in that column are removed;
    * otherwise: nothing is done (imputation is not implemented yet), so any
      nulls in that column remain in the result.

    The null fraction is measured on the frame as it stands when the column
    is reached, so rows removed for an earlier column influence the fraction
    seen by later columns. ``data_in`` itself is never modified.
    """
    candidate = data_in.copy()

    # Walk a snapshot of the labels so that removing columns along the way
    # cannot disturb the iteration.
    for name in list(candidate.columns):
        null_fraction = getPercentNull(candidate, name)

        if null_fraction >= COLUMN_DROP_THRESHOLD:
            candidate = candidate.drop(columns=[name])
            continue

        if null_fraction >= COLUMN_IMPUTE_THRESHOLD:
            candidate = candidate.dropna(subset=[name])

        # Below both thresholds: placeholder for a future imputation step.

    return candidate

In [6]:
# Columns that run_iteration label-encodes before fitting the model.
categorical_columns = [
    'bha_configuration',
    'wellbore_category',
    'packer_vs_tac',
    'rod_sinker_type',
    'manual_scale',
    'rod_make',
    'rod_apigrade',
    'DESANDDEGAS_TYP',
    'rod_has_guides',
    'FAILURETYPE',
]

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

In [14]:
def run_iteration(df, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD,
                  test_size=0.2, random_state=42):
    """Clean ``df``, fit an SVC on it and return the hold-out accuracy.

    Parameters
    ----------
    df : DataFrame
        Raw input data; it is not modified (generate_candidate_dataset works
        on a copy and the encoding is applied to that copy).
    COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD : float
        Forwarded unchanged to generate_candidate_dataset.
    test_size : float, optional
        Share of rows held out for evaluation (default 0.2).
    random_state : int, optional
        Seed for the train/test split (default 42).

    Returns
    -------
    float
        Accuracy of the fitted classifier on the held-out rows.

    Raises
    ------
    ValueError
        If the target column 'FAILURETYPE' did not survive the cleaning step.
    """
    target = 'FAILURETYPE'
    # Identifier/date columns plus the target are never used as features.
    non_feature_columns = {target, 'roduid', 'UWI', 'lifetime_start', 'lifetime_end'}

    this_data = generate_candidate_dataset(df, COLUMN_DROP_THRESHOLD, COLUMN_IMPUTE_THRESHOLD)

    if target not in this_data.columns:
        raise ValueError(
            "Target column '" + target + "' is missing after cleaning; "
            "cannot train a model."
        )

    # encode() overwrites the column on this_data in place.
    for column in this_data:
        if column in categorical_columns:
            encode(this_data, column)

    # Exclude by membership rather than list.remove(): an identifier column
    # that was already dropped by the null threshold must not abort the run.
    features = [column for column in this_data.columns
                if column not in non_feature_columns]

    X = np.array(this_data[features])
    y = np.array(this_data[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    reg_svc = SVC()
    reg_svc.fit(X_train, y_train)
    y_pred = reg_svc.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

In [29]:
# Sweep the column-drop threshold over 0.01 .. 0.99 (impute threshold fixed at
# 0) and keep the threshold that yields the highest accuracy.
max_result = 0
max_threshold = None      # stays None if no iteration beats an accuracy of 0
failed_iterations = {}    # threshold -> exception, kept for later inspection

for this_threshold in range(1, 100):
    try:
        this_result = run_iteration(raw_data, this_threshold / 100, 0)
    except Exception as error:
        # Catch Exception rather than everything so that interrupting the
        # kernel still stops the sweep; keep the cause instead of discarding it.
        failed_iterations[this_threshold] = error
        print("Iteration with threshold " + str(this_threshold) + " failed.")
        continue
    if this_result > max_result:
        max_result = this_result
        max_threshold = this_threshold

if max_threshold is None:
    print("No iteration produced an accuracy above 0.")
else:
    print("Best accuracy is " + str(max_result) + " at " + str(max_threshold) + ".")

Iteration with threshold 99 failed.
Best accuracy is 0.6493506493506493 at 55.
