In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('AnalysisData.csv')

In [3]:
df.head()

Unnamed: 0,SurfaceLongitude,SurfaceLatitude,BottomHoleLongitude,BottomHoleLatitude,Operator,CompletionDate,Reservoir,LateralLength_FT,ProppantIntensity_LBSPerFT,FluidIntensity_BBLPerFT,...,ReservoirPressure,WaterSaturation,StructureDerivative,TotalOrganicCarbon,ClayVolume,CarbonateVolume,Maturity,TotalWellCost_USDMM,CumOil12Month,rowID
0,-102.415082,32.196651,-102.414886,32.196766,OVV,9/30/2017,WOLFCAMP B,9925.0,2061.0,49.0,...,4150.9728,0.426,0.011,3.1119,0.169,0.257,0.943,5.5592,325341.6875,1
1,-102.358481,32.132822,-102.358359,32.132904,OVV,10/7/2017,SPRABERRY LOWER SHALE,9478.0,2042.0,48.0,...,2927.7631,0.319,0.005,2.8506,0.227,0.144,0.941,5.2283,211278.4063,2
2,-102.402982,32.119352,-102.402824,32.119379,OVV,10/7/2017,SPRABERRY LOWER SHALE,9295.0,2145.0,,...,2230.7591,0.445,0.006,1.9771,0.173,0.177,0.944,5.2176,201117.6094,3
3,-102.373553,32.125898,-102.373553,32.125898,OVV,8/11/2017,WOLFCAMP A,9894.0,1995.0,,...,3267.3561,0.311,0.003,2.5054,0.166,0.338,0.941,5.3567,297255.625,4
4,-102.437947,32.451652,-102.437947,32.451652,OVV,8/16/2017,WOLFCAMP B,10403.0,2007.0,49.0,...,4188.9816,0.386,0.009,3.066,0.162,0.28,0.941,5.6994,240883.1875,5


In [5]:
quantile_90 = df['CumOil12Month'].quantile(0.90)
quantile_95 = df['CumOil12Month'].quantile(0.95)

# Function to categorize the CumOil_12Month
def categorize_oil(oil_amount):
    if oil_amount <= quantile_90:
        return 'Low'
    elif oil_amount <= quantile_95:
        return 'Medium'
    else:
        return 'High'

# Apply the categorization function
df['CumOil_Category'] = df['CumOil12Month'].apply(categorize_oil)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Handle missing data
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    imputer = SimpleImputer(strategy='median')
    df[column] = imputer.fit_transform(df[[column]])

for column in df.select_dtypes(include=['object']).columns:
    imputer = SimpleImputer(strategy='most_frequent')
    df[column] = imputer.fit_transform(df[[column]])

# Encode categorical variables
ordinal_encoder = OrdinalEncoder()
df[df.select_dtypes(include=['object']).columns] = ordinal_encoder.fit_transform(df.select_dtypes(include=['object']))

# Prepare data for training
X = df.drop(['CumOil12Month', 'CumOil_Category', 'rowID'], axis=1)
y = df['CumOil_Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))


Accuracy: 90.33%
              precision    recall  f1-score   support

         0.0       0.62      0.13      0.21        62
         1.0       0.91      0.99      0.95      1101
         2.0       0.22      0.04      0.06        57

    accuracy                           0.90      1220
   macro avg       0.58      0.39      0.41      1220
weighted avg       0.86      0.90      0.87      1220



In [13]:
import joblib

joblib.dump(rf_classifier, 'random_forest_model.pkl')

['random_forest_model.pkl']