# Feature on Production - Correlations

Here in this file, we will be taking all of our features and running them through some other regression techniques on production to get a better understanding for which features correlate the most with production. We will be using LASSO, F-Test and R^2 regression techniques to get regression scores on the different features to then pick out which features to use.

In [1]:
import numpy as np
import pandas as pd
import math
import seaborn as sns

from pandas import read_excel
from sklearn import linear_model
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Loading in our feature and production data.
all_data = pd.read_excel("main_file_final.xlsx")
all_data.set_index("Field name")

Unnamed: 0_level_0,NPDID field,Orig. inplace ass. liquid \n[mill Sm3],Orig. inplace ass. gas \n[bill Sm3],Orig. inplace free gas \n[bill Sm3],Orig. inplace oil \n[mill Sm3],Orig. recoverable oil \n[mill Sm3],Orig. recoverable gas \n[bill Sm3],Orig. recoverable NGL \n[mill tonn],Orig. recoverable cond. \n[mill Sm3],Orig. recoverable oil eq. \n[mill Sm3 o.e],...,Gas - Month 574,Gas - Month 575,Gas - Month 576,Gas - Month 577,Gas - Month 578,Gas - Month 579,Gas - Month 580,Gas - Month 581,Gas - Month 582,Gas - Month 583
Field name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ALBUSKJELL,43437,0,64,0.00,56,7.35,15.53,0.99,0.00,24.761,...,,,,,,,,,,
ALVE,4444332,3,1,13.50,3,2.39,9.08,1.41,0.00,14.149,...,,,,,,,,,,
ALVHEIM,2845712,0,10,11.69,109,51.79,10.06,0.00,0.00,61.850,...,,,,,,,,,,
ATLA,21106284,0,0,1.88,0,0.40,1.40,0.00,0.00,1.800,...,,,,,,,,,,
BALDER,43562,0,14,0.00,263,106.87,3.25,0.00,0.00,110.120,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YTTERGRYTA,4973114,1,0,7.39,0,0.29,2.22,0.41,0.00,3.289,...,,,,,,,,,,
Ã†RFUGL,33310197,0,0,63.00,13,5.90,37.04,4.15,0.00,50.825,...,,,,,,,,,,
Ã˜ST FRIGG,43576,0,0,21.37,0,0.00,9.22,0.00,0.07,9.290,...,,,,,,,,,,
Ã…SGARD,43765,200,57,331.09,192,105.85,222.14,41.52,17.11,423.988,...,,,,,,,,,,


In [3]:
# Extracting our total oil and gas production values
oil_prod = all_data["Orig. recoverable oil \n[mill Sm3]"]
gas_prod = all_data["Orig. recoverable gas \n[bill Sm3]"]

In [4]:
# Loading our feature data.
data = pd.read_excel("features_with_data.xlsx")
data = data.set_index("Field name")

In [5]:
data

Unnamed: 0_level_0,Orig. inplace ass. liquid \n[mill Sm3],Orig. inplace ass. gas \n[bill Sm3],Orig. inplace free gas \n[bill Sm3],Orig. inplace oil \n[mill Sm3],Orig. recoverable NGL \n[mill tonn],Orig. recoverable cond. \n[mill Sm3],Orig. recoverable oil eq. \n[mill Sm3 o.e],Remaining oil \n[mill Sm3],Remaining gas \n[bill Sm3],Remaining NGL \n[mill tonn],...,Completion logs,# of Wells,Exp start year,Exp start month,Dev start year,Dev start month,Dev end year,Dev end month,Exp Duration,Dev Duration
Field name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ALBUSKJELL,0,64,0.00,56,0.99,0.00,24.761,0.00,0.00,0.00,...,0,26,1972,7,1978,6,1986,6,71,96
ALVE,3,1,13.50,3,1.41,0.00,14.149,0.34,2.10,0.28,...,0,8,1990,5,2009,1,2016,7,224,90
ALVHEIM,0,10,11.69,109,0.00,0.00,61.850,9.31,4.20,0.00,...,0,15,1974,10,2006,5,2019,9,379,160
ATLA,0,0,1.88,0,0.00,0.00,1.800,0.00,0.00,0.00,...,0,5,2010,8,2012,6,2012,6,22,0
BALDER,0,14,0.00,263,0.00,0.00,110.120,36.18,1.24,0.00,...,0,7,1970,4,1996,5,2016,4,313,239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YTTERGRYTA,1,0,7.39,0,0.41,0.00,3.289,0.00,0.00,0.00,...,0,2,2007,5,2008,10,2008,10,17,0
Ãƒâ€ RFUGL,0,0,63.00,13,4.15,0.00,50.825,4.65,31.61,3.52,...,0,4,2000,5,2010,9,2020,3,124,114
ÃƒËœST FRIGG,0,0,21.37,0,0.00,0.07,9.290,0.00,0.00,0.00,...,0,5,1973,8,1987,7,1988,3,167,8
Ãƒâ€¦SGARD,200,57,331.09,192,41.52,17.11,423.988,5.90,32.41,6.46,...,7,10,1981,9,1996,2,2019,7,173,281


In [6]:
# Creating a blank dataframe to store lasso regression scores.
lasso_ranks = pd.DataFrame(columns = ["feature", "oil - score", "gas - score"])

for i in range(23):
    lasso_ranks.loc[i]=0

In [7]:
# create lasso object
lasso = linear_model.Lasso()


# Populate the dataframe with the lasso score regression for both oil and
# gas production for a given feature and save it into the table

counter = 0
for col in data.columns:
    temp_df = pd.DataFrame(data[col])
    lasso.fit(temp_df, oil_prod)
    oil_score = lasso.score(temp_df, oil_prod)
    gas_score = lasso.score(temp_df, gas_prod)
    lasso_ranks.at[counter, "feature"] = col
    
    lasso_ranks.at[counter, "oil - score"] = oil_score
    lasso_ranks.at[counter, "gas - score"] = gas_score
    counter += 1
    

In [8]:
lasso_ranks

Unnamed: 0,feature,oil - score,gas - score
0,Orig. inplace ass. liquid \n[mill Sm3],0.0103208,0.0608303
1,Orig. inplace ass. gas \n[bill Sm3],0.639118,-0.151932
2,Orig. inplace free gas \n[bill Sm3],0.0373797,0.257284
3,Orig. inplace oil \n[mill Sm3],0.952615,-0.038375
4,Orig. recoverable NGL \n[mill tonn],0.27112,0.249315
5,Orig. recoverable cond. \n[mill Sm3],0.00297233,-0.0285745
6,Orig. recoverable oil eq. \n[mill Sm3 o.e],0.539996,0.638428
7,Remaining oil \n[mill Sm3],0.240039,-0.138576
8,Remaining gas \n[bill Sm3],0.055018,0.303367
9,Remaining NGL \n[mill tonn],0.118763,0.297274


In [9]:
# Setting up the empty data frame to contain all the
# different regression scores that we will be using to determine
# our desired features.
feature_ftest = pd.DataFrame(columns = ["feature",
                                        "Lasso - oil",
                                        "f-tests - oil", 
                                        "R^2 - oil",
                                        "Lasso - gas",
                                        "f-tests - gas",
                                        "R^2 - gas"])

for i in range(23):
    feature_ftest.loc[i] = 0

In [10]:
feature_ftest

Unnamed: 0,feature,Lasso - oil,f-tests - oil,R^2 - oil,Lasso - gas,f-tests - gas,R^2 - gas
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0


In [11]:
# Here we will be using modelOLS to get our F-test and our R^2
# here this loop will loop through all the features and feed
# them into the modelOLS object to come up with the desired
# regression results
for i in range(23):
    col = data.columns[i]
    indep_var = all_data[col]
    
    # First train the model for Oil production
    dep_var = all_data["Orig. recoverable oil \n[mill Sm3]"]

    constantAdded = sm.add_constant(indep_var, prepend = True)

    modelOLS = sm.OLS(dep_var, constantAdded)
    
    res = modelOLS.fit()
    
    # Extract the data that we want for a desired feature
    # when fitted for oil production and save it
    feature_ftest.at[i, "feature"] =  col
    feature_ftest.at[i, "f-tests - oil"] = res.f_pvalue
    feature_ftest.at[i, "R^2 - oil"] = res.rsquared
    
    # Then fit the model for gas production
    dep_var = all_data["Orig. recoverable gas \n[bill Sm3]"]
    
    constantAdded = sm.add_constant(indep_var, prepend = True)

    modelOLS = sm.OLS(dep_var, constantAdded)

    res = modelOLS.fit()
    
    # Extract the data that we want for gas production
    # and save it.
    feature_ftest.at[i, "f-tests - gas"] = res.f_pvalue
    feature_ftest.at[i, "R^2 - gas"] = res.rsquared

  return ptp(axis=axis, out=out, **kwargs)


In [12]:
feature_ftest

Unnamed: 0,feature,Lasso - oil,f-tests - oil,R^2 - oil,Lasso - gas,f-tests - gas,R^2 - gas
0,Orig. inplace ass. liquid \n[mill Sm3],0,0.263523,0.0103209,0,8.0166e-09,0.241248
1,Orig. inplace ass. gas \n[bill Sm3],0,1.49799e-28,0.639118,0,0.0465019,0.032355
2,Orig. inplace free gas \n[bill Sm3],0,0.032146,0.0373797,0,6.76864e-98,0.974139
3,Orig. inplace oil \n[mill Sm3],0,5.577850000000001e-82,0.952615,0,9.09352e-05,0.119353
4,Orig. recoverable NGL \n[mill tonn],0,6.67754e-10,0.271123,0,4.52019e-10,0.275714
5,Orig. recoverable cond. \n[mill Sm3],0,0.548934,0.0029767,0,0.00709017,0.05839
6,Orig. recoverable oil eq. \n[mill Sm3 o.e],0,3.8687e-22,0.539996,0,1.76647e-37,0.742774
7,Remaining oil \n[mill Sm3],0,8.84812e-09,0.240039,0,0.946823,3.69158e-05
8,Remaining gas \n[bill Sm3],0,0.00901821,0.055018,0,8.259879999999999e-79,0.94654
9,Remaining NGL \n[mill tonn],0,9.46601e-05,0.118799,0,1.5416999999999998e-21,0.529438


In [13]:
# Lastly as we've already calculated the Lasso regression scores
# just feed them into the table.
for i in range(23):
    feature_ftest.at[i, "Lasso - oil"] = lasso_ranks.at[i, "oil - score"]
    feature_ftest.at[i, "Lasso - gas"] = lasso_ranks.at[i, "gas - score"]

In [14]:
feature_ftest

Unnamed: 0,feature,Lasso - oil,f-tests - oil,R^2 - oil,Lasso - gas,f-tests - gas,R^2 - gas
0,Orig. inplace ass. liquid \n[mill Sm3],0.0103208,0.263523,0.0103209,0.0608303,8.0166e-09,0.241248
1,Orig. inplace ass. gas \n[bill Sm3],0.639118,1.49799e-28,0.639118,-0.151932,0.0465019,0.032355
2,Orig. inplace free gas \n[bill Sm3],0.0373797,0.032146,0.0373797,0.257284,6.76864e-98,0.974139
3,Orig. inplace oil \n[mill Sm3],0.952615,5.577850000000001e-82,0.952615,-0.038375,9.09352e-05,0.119353
4,Orig. recoverable NGL \n[mill tonn],0.27112,6.67754e-10,0.271123,0.249315,4.52019e-10,0.275714
5,Orig. recoverable cond. \n[mill Sm3],0.00297233,0.548934,0.0029767,-0.0285745,0.00709017,0.05839
6,Orig. recoverable oil eq. \n[mill Sm3 o.e],0.539996,3.8687e-22,0.539996,0.638428,1.76647e-37,0.742774
7,Remaining oil \n[mill Sm3],0.240039,8.84812e-09,0.240039,-0.138576,0.946823,3.69158e-05
8,Remaining gas \n[bill Sm3],0.055018,0.00901821,0.055018,0.303367,8.259879999999999e-79,0.94654
9,Remaining NGL \n[mill tonn],0.118763,9.46601e-05,0.118799,0.297274,1.5416999999999998e-21,0.529438


In [15]:
feature_ftest.to_csv("feature_regression_scores.csv")

By the end of this we were able to pick out our selected features for oil production and for gas production. We picked out features based on the 3 different correlation scores, but we mainly relied on the LASSO regression score. 

The features that we selected for oil production were:

- Original inplace oil
- Original inplace gas
- Original recoverable oil
- Development phase duration
- Original recoverable natural gas liquid

The features that we selected for gas production were:

- Original recoverable oil
- Remaining oil
- Remaining gas
- Remaining natural gas liquid
- Original inplace free gas