In [13]:
import glob
import pandas as pd
import numpy as np

file_name_list = glob.glob("Data/*.csv")

feature_set = ['wellName','DEPT', 'BS', 'CALI', 'DENS', 'DTC', 'GR', 'NEUT', 'PEF', 'RESD', 'RESM', 'RESS', 'TVD']

file_list = []

for file in file_name_list:
    df = pd.read_csv(file, index_col=None, skiprows=[1])
    file_list.append(df[feature_set])

In [14]:
import matplotlib.pyplot as plt

def plt_this(y):
    plt.plot(y)
    plt.show()


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score

def cross_val(clf, feature_set=['DEPT', 'BS', 'CALI', 'DENS', 'GR', 'NEUT', 'PEF', 'RESD', 'RESM', 'RESS', 'TVD']):
    
    wells = []
    EVS_list = []

    for i in range(len(file_list)):
        test_df = file_list[i]
        wells.append(test_df.iloc[0,0])
        print('%s : %s' %(i, wells[i]))

        train_list = file_list.copy()
        train_list.pop(i)
        train_df = pd.concat(train_list)

        test_x = test_df[feature_set].values
        test_y = test_df[['DTC']].values
        test_y = test_y.ravel()
        
        train_X = train_df[feature_set].values
        train_y = train_df[['DTC']].values
        train_y = train_y.ravel()
        
        # feature scaling
        scaler = StandardScaler()
        scaler.fit_transform(train_X)
        scaler.transform(test_x)

        # training
        mdl = clf()
        mdl.fit(train_X, train_y)

        # testing
        pred_y = mdl.predict(test_x)
    
        # error
        abs_error = np.abs(np.subtract(test_y, pred_y))
        
#         plt_this(abs_error)
#         plt_this(pred_y)
#         plt_this(test_y)
            
        EVS = explained_variance_score(test_y, pred_y)

        EVS_list.append(EVS)

    print()

    avg_EVS = np.mean(EVS_list)

    for i in range(len(wells)):
        print('Test score on %s : %s' %(wells[i], EVS_list[i]))

    print()
    print('Average algorithm score: %s' %avg_EVS)


# Linear regression

In [16]:
from sklearn.linear_model import LinearRegression

#feature_set = ['DEPT', 'BS', 'CALI', 'DENS', 'GR', 'NEUT', 'PEF', 'RESD', 'RESM', 'RESS', 'TVD'] #-8.0057
feature_set = ['DEPT', 'BS', 'CALI', 'DENS', 'RESD', 'RESM'] #0.372

cross_val(LinearRegression, feature_set)

0 : Cheal-A12
1 : Cheal-G3
2 : Cheal-B8
3 : Cheal-G2
4 : Cheal-A10
5 : Cheal-C3
6 : Cheal-G1
7 : Cheal-A11
8 : Cheal-C4

Test score on Cheal-A12 : 0.8054140968683544
Test score on Cheal-G3 : 0.8073261507958926
Test score on Cheal-B8 : 0.829507105464267
Test score on Cheal-G2 : 0.773382836198768
Test score on Cheal-A10 : 0.7085514002445874
Test score on Cheal-C3 : 0.7756180871691959
Test score on Cheal-G1 : 0.7692246246853904
Test score on Cheal-A11 : -2.8806147904246475
Test score on Cheal-C4 : 0.7567044659325725

Average algorithm score: 0.3716793307704867


# Random Forrest

In [17]:
from sklearn.ensemble import RandomForestRegressor

feature_set = ['DEPT', 'BS', 'CALI', 'DENS', 'GR', 'NEUT', 'PEF', 'RESD', 'RESM', 'RESS', 'TVD'] #0.826

cross_val(RandomForestRegressor, feature_set)

0 : Cheal-A12
1 : Cheal-G3
2 : Cheal-B8
3 : Cheal-G2
4 : Cheal-A10
5 : Cheal-C3
6 : Cheal-G1
7 : Cheal-A11
8 : Cheal-C4

Test score on Cheal-A12 : 0.8414483257665761
Test score on Cheal-G3 : 0.7714969883237915
Test score on Cheal-B8 : 0.9101104984631044
Test score on Cheal-G2 : 0.8488049156792641
Test score on Cheal-A10 : 0.797881725509555
Test score on Cheal-C3 : 0.7701287897111151
Test score on Cheal-G1 : 0.7394462021394268
Test score on Cheal-A11 : 0.8695679397945291
Test score on Cheal-C4 : 0.7898107998219932

Average algorithm score: 0.815410687245484


# Gradient Boosting

In [18]:
from lightgbm import LGBMRegressor

feature_set = ['DEPT', 'BS', 'CALI', 'DENS', 'GR', 'NEUT', 'PEF', 'RESD', 'RESM', 'RESS', 'TVD'] #0.872

cross_val(LGBMRegressor, feature_set)

0 : Cheal-A12
1 : Cheal-G3
2 : Cheal-B8
3 : Cheal-G2
4 : Cheal-A10
5 : Cheal-C3
6 : Cheal-G1
7 : Cheal-A11
8 : Cheal-C4

Test score on Cheal-A12 : 0.869651436814493
Test score on Cheal-G3 : 0.9161232033583621
Test score on Cheal-B8 : 0.9400560835144331
Test score on Cheal-G2 : 0.8829217106854859
Test score on Cheal-A10 : 0.8634938503050312
Test score on Cheal-C3 : 0.832434532004398
Test score on Cheal-G1 : 0.7891882784221713
Test score on Cheal-A11 : 0.9187090283541512
Test score on Cheal-C4 : 0.8342474023106184

Average algorithm score: 0.8718695028632383


# SVR

In [None]:
from sklearn.svm import SVR

feature_set = ['DEPT', 'BS', 'CALI', 'DENS', 'GR', 'NEUT', 'PEF', 'RESD', 'RESM', 'RESS', 'TVD']

cross_val(SVR, feature_set)

0 : Cheal-A12


# LSTM

# Deep NN (2 layer)