In [9]:
import pandas as pd

# Build the cleaned frame in one pass: load, derive lifetimedays, drop incomplete
# rows, drop unused columns, then rename.
# NOTE(review): the head() preview of this frame lists an 'Unnamed: 0' column, which
# looks like an index saved into the raw CSV -- confirm whether it should be kept.
raw_data = (
    # Read the raw export and parse both lifetime boundary columns as datetimes.
    pd.read_csv(
        '../rodpump_raw.csv',
        parse_dates=['lifetime_start', 'lifetime_end'],
    )
    # New column lifetimedays: total lifetime in days, taken as the days component
    # of (lifetime_end - lifetime_start).
    .assign(
        lifetimedays=lambda frame: (
            frame['lifetime_end'] - frame['lifetime_start']
        ).dt.days
    )
    # Keep only rows where neither FAILURETYPE nor bha_configuration is null.
    .dropna(subset=['FAILURETYPE', 'bha_configuration'])
    .drop(columns=[
        # Removed per Sarah Coffman's advice.
        'FAILSTART',
        # Removed since we know the density of water.
        'AVG_WATERSG',
        # Too many nulls (612 nulls out of 2596).
        'Fillage',
        # Redundant fields.
        'NODEID',
        'IDWELL',
        'REPORTTO',
        # Irrelevant identifiers.
        'tbguid',
        'IDRECJOBPULL',
        # Too many nulls (1096 nulls out of 2596).
        'GrossStrokeLength',
    ])
    # rod_make becomes manufacturer; UWI becomes well.
    .rename(columns={'rod_make': 'manufacturer', 'UWI': 'well'})
)

In [10]:
# Preview the first five rows of raw_data.
raw_data.head(n=5)

Unnamed: 0,roduid,well,lifetime_start,lifetime_end,FAILURETYPE,H2S_CONCENTRATION,PrimarySetpoint,SecondarySetpoint,StrokeLength,YesterdaysAverageSPM,...,shallow_max_sideload,max_unguided_sideload,DESANDDEGAS_TYP,CHROME_LENGTH,ENDURALLOY_LENGTH,POLY_LENGTH,NIPPLE_SET_DEPTH,pump_bore,gasanchor_od,lifetimedays
1,GB42ZGOU04727141361583,005-64-9456,2019-07-16,2020-02-13,Tubing,0.0,80.0,65.0,165.878957,6.0,...,174.27,174.27,Miller LLC,0.0,0.0,1167.96,8893.9,2.0,4.5,212
3,GB87DDTZ53468840486615,006-40-5581,2006-02-07,2006-09-22,Sucker Rod Pump,0.0,75.0,60.0,144.0,,...,,,UNKNOWN,0.0,0.0,0.0,9085.2,1.75,,227
4,GB30HELP48302296915492,006-40-5581,2006-09-23,2009-06-25,Sucker Rod Pump,0.0,75.0,60.0,144.0,,...,,,UNKNOWN,0.0,0.0,0.0,9085.2,1.5,,1006
5,GB73EZQN38331541380411,006-57-3389,2017-10-18,2018-05-15,Tubing,0.0,70.0,65.0,165.761084,3.7,...,,,Miller LLC,0.0,0.0,0.0,11505.2,1.25,OtherOrUnknown,209
7,GB71WFOX64096101197026,006-57-3389,2018-05-18,2018-07-18,Sucker Rod Pump,0.0,70.0,65.0,168.259995,3.7,...,,,Miller LLC,0.0,0.0,0.0,11387.1,1.25,OtherOrUnknown,61


In [11]:
# Export the cleaned data frame to the project home directory (one level up),
# writing the header row and leaving out the index.
# Run deliberately: to_csv's default write mode replaces an existing file.
raw_data.to_csv(
    path_or_buf='../rodpump_cleaned.csv',
    header=True,
    index=False,
)

# What does each column mean?
liquid volume = the amount of liquid the rod and pump pull out of the hole; more liquid adds more stress to the rod ---> MAIN factor

stroke length -- how the pump is operating -- before a rod fails it behaves radically *hint*

YesterdaysAverageSPM = yesterday strokes per minute, how pump is operating 

max inclination -- inclination of the well (sometimes the well is not straight) --> a non-straight well might have an impact on rod pump failure due to more friction (but small impact)

H2S is corrosive to pump and harmful to humans

Primary/Secondary Set point = controls how pump is being operated 

bha_configuration = bottom hole assembly configuration - how they have assembled it (Question: does the internal configuration of how you put the sucker rod affect the failure type or lifetime?)

chemical groups - chemical treatments applied during the life of the rod

max_unguided_dls - dog leg severity - how sharply the well trajectory changes direction, measured as the change in degrees per hundred feet.

AVG_OIL_VOLUME/water/liquid = average production of oil, water, and liquid

route - location of well 

sideload data - ask Sarah

DESANDDEGAS_TYP - the desander type, recorded as the company name. A desander takes out the sand after fracking; sand may cause erosion.

rod api grade - how much stress (tensile and compression) it can take 

rod_has_guides and rod_sideload are related -- they can decide to use guides or not 

pump_bore - standard specification --> set up as categorical variables (other column)