In [187]:
import pandas as pd
import numpy as np

#### Data Reading:

In [226]:
data = pd.read_csv(r"C:\Users\patrick\Documents\Rice\Datathon\training.csv")

In [227]:
data.head()

Unnamed: 0,segment_id,wellbore_chev_no_id,area_id,formation_id,bit_model_id,drillbit_size,min_depth,max_depth,rate_of_penetration,surface_weight_on_bit,surface_rpm
0,wellbore_chev_no_0_727,wellbore_chev_no_0,area_0,formation_62,bit_model_1,12.25,21834.5,21839.5,99.61465,43.53328,150.369091
1,wellbore_chev_no_0_728,wellbore_chev_no_0,area_0,formation_61,bit_model_1,12.25,21840.0,21869.5,99.354021,42.382033,150.369001
2,wellbore_chev_no_0_729,wellbore_chev_no_0,area_0,formation_59,bit_model_1,12.25,21870.0,21899.5,83.622907,48.675129,150.429335
3,wellbore_chev_no_0_730,wellbore_chev_no_0,area_0,formation_65,bit_model_1,12.25,21900.0,21929.5,85.427731,30.04129,150.08
4,wellbore_chev_no_0_731,wellbore_chev_no_0,area_0,formation_65,bit_model_1,12.25,21930.0,21959.5,100.639811,31.406319,150.032167


#### Data Cleaning:

In [228]:
# create dictionaries from unigue entries
wellbores = data.wellbore_chev_no_id.unique()
area = data.area_id.unique()
formation = data.formation_id.unique()
bitmodel = data.bit_model_id.unique()
wellbore_dict = dict(zip(wellbores, range(len(wellbores))))
area_dict = dict(zip(area, range(len(area))))
formation_dict = dict(zip(formation, range(len(formation))))
bitmodel_dict = dict(zip(bitmodel, range(len(bitmodel))))

In [229]:
# map wellbores to unique integers
wellbores_col = data.wellbore_chev_no_id
wellbores_col = wellbores_col.map(wellbore_dict)
data.wellbore_chev_no_id = wellbores_col

In [230]:
# map areas to unique integers
area_col = data.area_id
area_col = area_col.map(area_dict)
data.area_id = area_col

In [231]:
# map formations to unique integers
formation_col = data.formation_id
formation_col = formation_col.map(formation_dict)
data.formation_id = formation_col

In [232]:
# map bitmodels to unique integers
bit_col = data.bit_model_id
bit_col = bit_col.map(bitmodel_dict)
data.bit_model_id = bit_col

In [233]:
# removing outliers
data = data.drop(columns=['segment_id'])
data = data[(np.abs(stats.zscore(data)) < 3).all(axis=1)]

#### Feature Engineering:

In [234]:
# from discovered formulas, create new features that could have more bearing on ROP
data['rpm*wob'] = data.surface_rpm * data.surface_weight_on_bit
data['wob/bit_size'] = data.surface_weight_on_bit / data.drillbit_size
data['min_depth/bit_size'] = data.min_depth / data.drillbit_size
# data['ratio4'] = np.pi*((data.drillbit_size)/2)**2
# data['ratio5'] = (data.surface_rpm**.75) * (data.surface_weight_on_bit / data.drillbit_size)**.3
# data['ratio6'] = data.surface_rpm / (data.ratio4 - data.surface_weight_on_bit)

In [235]:
data.head()

Unnamed: 0,wellbore_chev_no_id,area_id,formation_id,bit_model_id,drillbit_size,min_depth,max_depth,rate_of_penetration,surface_weight_on_bit,surface_rpm,rpm*wob,wob/bit_size,min_depth/bit_size
0,0,0,0,0,12.25,21834.5,21839.5,99.61465,43.53328,150.369091,6546.059694,3.553737,1782.408163
1,0,0,1,0,12.25,21840.0,21869.5,99.354021,42.382033,150.369001,6372.943939,3.459758,1782.857143
2,0,0,2,0,12.25,21870.0,21899.5,83.622907,48.675129,150.429335,7322.167196,3.97348,1785.306122
3,0,0,3,0,12.25,21900.0,21929.5,85.427731,30.04129,150.08,4508.59685,2.45235,1787.755102
4,0,0,3,0,12.25,21930.0,21959.5,100.639811,31.406319,150.032167,4711.95813,2.563781,1790.204082


In [236]:
## used for sharing files
# data.to_csv(r"C:\Users\patrick\Documents\Rice\Datathon\new_data_ratios.csv")
# data = pd.read_csv(r"C:\Users\patrick\Documents\Rice\Datathon\new_data_ratios.csv")

In [237]:
# correlation
data[data.columns[1:]].corr()['rate_of_penetration'][:]

area_id                 -0.017798
formation_id            -0.287867
bit_model_id            -0.137436
drillbit_size            0.330420
min_depth               -0.447236
max_depth               -0.447215
rate_of_penetration      1.000000
surface_weight_on_bit    0.510900
surface_rpm              0.687476
rpm*wob                  0.661430
wob/bit_size             0.305028
min_depth/bit_size      -0.494811
Name: rate_of_penetration, dtype: float64

### Modeling:

In [238]:
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from scipy import stats

In [259]:
y = data.rate_of_penetration
X = data.drop(columns=['rate_of_penetration','area_id', 'formation_id', 'bit_model_id', 'wellbore_chev_no_id'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [260]:
# outlier removal
# X_train = X_train[(np.abs(stats.zscore(X_train)) < 3).all(axis=1)]
# y_train = y_train[(np.abs(stats.zscore(y_train)) < 3)]

In [261]:
X.head()

Unnamed: 0,drillbit_size,min_depth,max_depth,surface_weight_on_bit,surface_rpm,rpm*wob,wob/bit_size,min_depth/bit_size
0,12.25,21834.5,21839.5,43.53328,150.369091,6546.059694,3.553737,1782.408163
1,12.25,21840.0,21869.5,42.382033,150.369001,6372.943939,3.459758,1782.857143
2,12.25,21870.0,21899.5,48.675129,150.429335,7322.167196,3.97348,1785.306122
3,12.25,21900.0,21929.5,30.04129,150.08,4508.59685,2.45235,1787.755102
4,12.25,21930.0,21959.5,31.406319,150.032167,4711.95813,2.563781,1790.204082


#### Elastic Net:

In [262]:
X, y = make_regression(n_features= 2, random_state=0)
regr = ElasticNet(random_state=0, tol = 0.1)
regr.fit(X_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=0, selection='cyclic', tol=0.1, warm_start=False)

In [263]:
y_pred = regr.predict(X_test)
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Squared Error: 846.3419122764454
Root Mean Squared Error: 29.091956143862955


#### XGBoost

In [243]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
xgbr = xgb.XGBRegressor() 
XGBRegressor(base_score=0.8, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.2, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=350,
       n_jobs=1, nthread=None, objective='reg:squarederror', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)
xgbr.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [244]:
y_pred = xgbr.predict(X_test)
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Squared Error: 410.3002793434001
Root Mean Squared Error: 20.25587024404037


#### Random Forest

In [264]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [265]:
regressor = RandomForestRegressor(n_estimators=500, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [266]:
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Squared Error: 304.04632668224747
Root Mean Squared Error: 17.436924232279253


### Scoring:

In [267]:
y = data.rate_of_penetration
X = data.drop(columns=['rate_of_penetration','area_id', 'formation_id', 'bit_model_id', 'wellbore_chev_no_id'])

In [268]:
regressor.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [269]:
X_test = pd.read_csv(r"C:\Users\patrick\Documents\Rice\Datathon\scoring.csv")
X_test['rpm*wob'] = X_test.surface_rpm * X_test.surface_weight_on_bit
X_test['wob/bit_size'] = X_test.surface_weight_on_bit / X_test.drillbit_size
X_test['min_depth/bit_size'] = X_test.min_depth / X_test.drillbit_size
X_test = X_test.drop(columns=['segment_id', 'area_id', 'formation_id', 'bit_model_id', 'wellbore_chev_no_id'])
y_pred = regressor.predict(X_test)

In [270]:
new = pd.read_csv(r"C:\Users\patrick\Documents\Rice\Datathon\scoring.csv")

In [271]:
new.head()

Unnamed: 0,segment_id,wellbore_chev_no_id,area_id,formation_id,bit_model_id,drillbit_size,min_depth,max_depth,surface_weight_on_bit,surface_rpm
0,wellbore_chev_no_14_665,wellbore_chev_no_14,area_1,formation_220,bit_model_10,18.125,19950.0,19950.0,21.66057,60.655739
1,wellbore_chev_no_14_681,wellbore_chev_no_14,area_1,formation_215,bit_model_10,18.125,20452.0,20459.5,27.768233,160.737708
2,wellbore_chev_no_14_682,wellbore_chev_no_14,area_1,formation_215,bit_model_10,18.125,20460.0,20489.5,28.667844,159.185793
3,wellbore_chev_no_14_683,wellbore_chev_no_14,area_1,formation_213,bit_model_10,18.125,20490.0,20519.5,24.37972,160.027321
4,wellbore_chev_no_14_684,wellbore_chev_no_14,area_1,formation_213,bit_model_10,18.125,20520.0,20549.5,23.551256,151.726782


In [272]:
new2 = new.segment_id

In [277]:
rate_of_penetration = pd.Series(y_pred)

In [279]:
submission = pd.concat([new2, rate_of_penetration], axis=1)

In [283]:
submission = submission.rename(columns = {0:"rate_of_penetration"})

In [284]:
submission.head()

Unnamed: 0,segment_id,rate_of_penetration
0,wellbore_chev_no_14_665,22.679641
1,wellbore_chev_no_14_681,147.616992
2,wellbore_chev_no_14_682,140.189544
3,wellbore_chev_no_14_683,136.862025
4,wellbore_chev_no_14_684,123.792139


In [288]:
submission.to_csv(r"C:\Users\patrick\Documents\Rice\Datathon\submission.csv", index = False)