# Sampling Rig Contractors - KNN

Here we tackle the problem of some contractors having far fewer entries than others. Every contractor is resampled, with replacement, to the same number of entries (the average number of entries per contractor).

This amplifies the results of the smaller contractors, whose rows are repeated until they reach the same size as everyone else.

In [1]:
from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Establishing Path

In [2]:
# Directory holding the input/output Excel files, relative to the notebook.
# The trailing slash matters: file names are appended with plain string
# concatenation (path + "<file>.xlsx").
path = "../SHARED/"

## Recompletion

In [3]:
# Load the recompletion table from the shared data directory
rcompl_source = path + "RCOMPL_KNN_IMPUTED.xlsx"
data_rcompl = pd.read_excel(rcompl_source)

In [4]:
# Collect the distinct contractor labels present in the recompletion table
rcompl_contractors = data_rcompl["CONTRACTOR"]
unique_rigs = rcompl_contractors.unique()

In [5]:
# Count the rows belonging to each contractor, reporting each count and
# keeping them (in the order of unique_rigs) for the balancing step.
total_entries = []

for rig in unique_rigs:
    rig_rows = int((data_rcompl["CONTRACTOR"] == rig).sum())
    print(rig, " has ", rig_rows, " elements.")
    total_entries.append(rig_rows)

CONTRACTOR - 2019000027  has  805  elements.
CONTRACTOR - 2019000030  has  403  elements.
CONTRACTOR - 2019000023  has  165  elements.
CONTRACTOR - 2019000008  has  259  elements.
CONTRACTOR - 2019000051  has  22  elements.


In [6]:
# Row counts per contractor, in the same order as unique_rigs
total_entries

[805, 403, 165, 259, 22]

In [7]:
# Target size per contractor: the mean number of rows per contractor
# (floored), so that after resampling every contractor is equally represented.
sum_rcompl = sum(total_entries)

# Divide by the actual number of contractors instead of a hard-coded 5, so the
# target stays correct if the set of contractors in the data changes.
# max(..., 1) keeps the empty-data case at 0 rather than dividing by zero.
tot_samples = sum_rcompl // max(len(total_entries), 1)

In [8]:
# Number of rows each contractor will be resampled to
tot_samples

330

In [9]:
# Resample every contractor to exactly `tot_samples` rows. Sampling is done
# with replacement so contractors with fewer rows than the target can still
# reach it (their rows are repeated).
# A seeded generator, re-created each time this cell runs, makes the draw
# reproducible across kernel restarts and cell re-runs.
sampling_rng = np.random.RandomState(42)

rcompl_parts = [
    data_rcompl[data_rcompl["CONTRACTOR"] == rig].sample(
        n=tot_samples, replace=True, random_state=sampling_rng
    )
    for rig in unique_rigs
]

# Assemble the result with a single pd.concat: DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0, and appending inside a loop copies the
# whole frame on every iteration.
if rcompl_parts:
    new_rcompl = pd.concat(rcompl_parts, ignore_index=True)
else:
    # No contractors at all: keep the original schema with zero rows.
    new_rcompl = pd.DataFrame(columns=data_rcompl.columns)

In [10]:
# Inspect the resampled recompletion table
new_rcompl

Unnamed: 0,UWI,UWI_SIDETRACK,REPORT_START_DATE_YEAR,FRAC_GRADIENT,BREAKDOWN_PRESSURE,HYDROSTATIC_PRESSURE,TREAT_AVG_PRESSURE,TREAT_MAX_PRESSURE,TREAT_MIN_PRESSURE,PROPPANT_DESIGNED,...,DIVERSION_COMPANY,DIVERSION_METHOD,DELIVERY_MODE,BOTTOM_HOLE_PRESSURE_METHOD,CLOSURE_PRESSURE_METHOD,STIMULATION_RESULT,STIMULATION_RESULT_DETAIL,STIMTREAT_ID,FLUID_NAME,SITE_SUPERVISOR
0,2019000708,0,2019,1.0,6241.0,5635.0,8011.0,8338.0,2964.0,218700,...,DIVERSION_COMPANY - 2019000003,Unibeads,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,6A1045B40C5B48B295CA2A65E7B58B3B,Fresh Water,LACHO
1,2019000640,0,2019,1.0,7237.0,5593.0,9782.0,10080.0,7618.0,384000,...,DIVERSION_COMPANY - 2019000007,Frac Plug,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,5D05717248A644059AE6585FBD575AFF,LX-6 (FDP -1298),JAMES CECIL
2,2019000712,0,2018,1.0,8503.0,5612.0,8542.0,8802.0,7218.8,353000,...,DIVERSION_COMPANY - 2019000003,CFP,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,AE34097440FF4B3C9AC41276D56BB96E,20# GUAR BORATE,LACHO
3,2019000298,0,2019,1.0,8487.0,5514.4,9272.0,9562.0,7346.0,328750,...,DIVERSION_COMPANY - 2019000003,Bridge Plug,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,80AB4FF61E0A46C68E2A8C60643281F2,Slickwater,HECTOR RODRIGUEZ
4,2019001382,0,2017,1.0,5269.0,5379.0,5864.0,6828.0,4668.0,24000,...,DIVERSION_COMPANY - 2019000003,Pill,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,D31B0D6DD4E8411EAC8AC8220205503E,Slick Water,HONEYCUTT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1645,2019000278,0,2019,1.0,5335.0,5119.2,8561.0,9179.0,7618.0,367500,...,DIVERSION_COMPANY - 2019000003,Frac Plug,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,ACA7734DDC7D4EB9AA4213B6C7AD9FA8,Slickwater,HECTOR RODRIGUEZ
1646,2019000278,0,2019,1.0,5958.0,5513.2,8483.0,9008.0,7618.0,367500,...,DIVERSION_COMPANY - 2019000003,Frac Plug,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,FFE6FEAE21B44CE6931C11D3BA25DBC5,Slickwater,HECTOR RODRIGUEZ
1647,2019000278,0,2019,1.0,5255.0,5110.0,8442.0,9121.0,7618.0,367500,...,DIVERSION_COMPANY - 2019000003,Frac Plug,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,37D3B14C0BC140EB9122C8A045152A9C,Slickwater,Rene Gomez
1648,2019000278,0,2019,1.0,5335.0,5119.2,8561.0,9179.0,7618.0,367500,...,DIVERSION_COMPANY - 2019000003,Frac Plug,Casing,Calculated,Nolte-Smith Calc,Success,According to Plan,ACA7734DDC7D4EB9AA4213B6C7AD9FA8,Slickwater,HECTOR RODRIGUEZ


In [10]:
# Export the balanced recompletion table.
# index=False matches the stimulation-stage export and stops the RangeIndex
# from being written as an extra unnamed column, which would come back as
# another "Unnamed: 0"-style column the next time the file is read.
new_rcompl.to_excel(path + "RCOMPL_RIG_SCALED_REPLACE.xlsx", index=False)

## Stimulation Stage

In [11]:
# Load the stimulation-stage table from the shared data directory
stmstg_source = path + "STMSTG_KNN_IMPUTED.xlsx"
data_stmstg = pd.read_excel(stmstg_source)

In [12]:
# Collect the distinct contractor labels present in the stimulation-stage table
stmstg_contractors = data_stmstg["CONTRACTOR"]
unique_rigs = stmstg_contractors.unique()

In [13]:
# Count the rows belonging to each contractor, reporting each count and
# keeping them (in the order of unique_rigs) for the balancing step.
total_entries = []

for rig in unique_rigs:
    rig_rows = int((data_stmstg["CONTRACTOR"] == rig).sum())
    print(rig, " has ", rig_rows, " elements.")
    total_entries.append(rig_rows)

CONTRACTOR - 2019000008  has  3455  elements.
CONTRACTOR - 2019000027  has  8790  elements.
CONTRACTOR - 2019000030  has  3825  elements.
CONTRACTOR - 2019000023  has  1191  elements.
CONTRACTOR - 2019000051  has  460  elements.


In [5]:
# Row counts per contractor, in the same order as unique_rigs
total_entries

[3455, 8790, 3825, 1191, 460]

In [14]:
# Target size per contractor: the mean number of rows per contractor
# (floored), so that after resampling every contractor is equally represented.
sum_stmstg = sum(total_entries)

print("total sum", sum_stmstg)

# Divide by the actual number of contractors instead of a hard-coded 5, so the
# target stays correct if the set of contractors in the data changes.
# max(..., 1) keeps the empty-data case at 0 rather than dividing by zero.
tot_samples = sum_stmstg // max(len(total_entries), 1)

total sum 17721


In [15]:
# Number of rows each contractor will be resampled to
tot_samples

3544

In [16]:
# Resample every contractor to exactly `tot_samples` rows. Sampling is done
# with replacement so contractors with fewer rows than the target can still
# reach it (their rows are repeated).
# A seeded generator, re-created each time this cell runs, makes the draw
# reproducible across kernel restarts and cell re-runs.
sampling_rng = np.random.RandomState(42)

stmstg_parts = [
    data_stmstg[data_stmstg["CONTRACTOR"] == rig].sample(
        n=tot_samples, replace=True, random_state=sampling_rng
    )
    for rig in unique_rigs
]

# Assemble the result with a single pd.concat: DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0, and appending inside a loop copies the
# whole frame on every iteration.
if stmstg_parts:
    new_stmstg = pd.concat(stmstg_parts, ignore_index=True)
else:
    # No contractors at all: keep the original schema with zero rows.
    new_stmstg = pd.DataFrame(columns=data_stmstg.columns)

In [11]:
# Write the balanced stimulation-stage table to the shared directory,
# leaving the row index out of the file
stmstg_target = path + "STMSTG_RIG_SCALED_REPLACE.xlsx"
new_stmstg.to_excel(stmstg_target, index=False)