In [79]:
import sys
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
import subprocess
import pickle

In [50]:
df = pd.read_csv("/Users/advaitbalaji/se_exercise/AnalysisData.csv", header=0)
vals = df["CumOil12Month"].to_numpy()
first_thresh = np.percentile(vals, 33.33)
second_thresh = np.percentile(vals, 66.66)
print(f"First threshold: {first_thresh} and Second threshold: {second_thresh}")

First threshold: 87317.97175088 and Second threshold: 142840.60089824


In [51]:
category = [0, 1, 2]
CumOilCategory = []
for v in vals:
    if v < first_thresh:
        CumOilCategory.append(category[0])
    elif first_thresh <= v < second_thresh:
        CumOilCategory.append(category[1])
    else:
        CumOilCategory.append(category[2])
df["CumOilCategory"] = CumOilCategory
Counter(df["CumOilCategory"])

Counter({2: 1700, 1: 1699, 0: 1699})

In [52]:
df.drop(columns=["CumOil12Month", "rowID    ", "TotalWellCost_USDMM",
 "CompletionDate", "SurfaceHoleLongitude", "SurfaceHoleLatitude", "BottomHoleLongitude", "BottomHoleLatitude"], inplace=True)


In [53]:
nominal_columns = ['Operator', 'Reservoir']
other_columns =  ['LateralLength_FT', 'ProppantIntensity_LBSPerFT', 'FluidIntensity_BBLPerFT', 
'HzDistanceToNearestOffsetAtDrill', 'HzDistanceToNearestOffsetCurrent', 'VtDistanceToNearestOffsetCurrent', 
'VtDistanceToNearestOffsetAtDrill', 'WellDepth', 'ReservoirThickness', 'OilInPlace', 'Porosity', 
'ReservoirPressure', 'WaterSaturation', 'StructureDerivative', 'TotalOrganicCarbon', 'ClayVolume', 'CarbonateVolume', 'Maturity']
#print(other_columns)
for col in other_columns:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

In [54]:
enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
k = enc.fit_transform(df[["Operator", "Reservoir"]])

In [55]:
onehot_df = pd.DataFrame(k, columns=enc.get_feature_names_out(), index=df.index)

In [56]:
df_encoded = pd.concat([df.drop(columns=nominal_columns), onehot_df], axis=1)

In [57]:
scaler = MinMaxScaler()
df_encoded[other_columns] = scaler.fit_transform(df[other_columns])

In [58]:
y = df_encoded["CumOilCategory"]
X = df_encoded.drop(columns=["CumOilCategory"])
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)

In [59]:
X_train["CumOilCategory"] = y_train
X_test["CumOilCategory"] = y_test

X_train.to_csv("training/train.csv", header=True, index=False, sep=",")
X_test.to_csv("training/test.csv", header=True, index=False, sep=",")

In [76]:
#code_run = code.split("```python")[1].split("```")[0]
code_run = """import os
import sys
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel

# Load train and test data from command line arguments
train_data = pd.read_csv(sys.argv[1])
test_data = pd.read_csv(sys.argv[2])

# Extract features and labels
X_train = train_data.drop('CumOilCategory', axis=1)
y_train = train_data['CumOilCategory']
X_test = test_data.drop('CumOilCategory', axis=1)
y_test = test_data['CumOilCategory']

# Train Random Forest Classifier
print("Training Random Forest Classifier...")
rf = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# Predict test data using the trained model
print("Predicting test data...")
y_pred = cross_val_predict(best_rf, X_test, y_test, cv=5)

# Calculate classification report
print("Calculating classification report...")
classification_rep = classification_report(y_test, y_pred, output_dict=True)
classification_df = pd.DataFrame(classification_rep).transpose()

# Calculate feature importances
print("Calculating feature importances...")
best_rf.fit(X_train, y_train)
feature_importances = best_rf.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Select features with importance greater than 0
selected_features = feature_importance_df[feature_importance_df['Importance'] > 0]['Feature']

# Print precision, recall, and F1 score
precision = classification_df.loc['weighted avg', 'precision']
recall = classification_df.loc['weighted avg', 'recall']
f1_score = classification_df.loc['weighted avg', 'f1-score']
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}")

# Save results as a dictionary
results_dict = {
    'classification_report': classification_df,
    'feature_importances': feature_importance_df,
    'selected_features': selected_features
}

# Save results dictionary as a pickle file
with open('results_dict.pkl', 'wb') as f:
    pickle.dump(results_dict, f)"""

In [77]:
file_path = f"generated_code.py"
with open(file_path, 'w') as file:
    file.write(code_run)

In [78]:
def run_script_sys_executable(script_path, train_path, test_path):
    # try:
    #     result = subprocess.run(
    #         [sys.executable, script_path] + list(args),
    #         capture_output=True,
    #         text=True,
    #         check=True
    #     )
    #     return result.stdout
    # except subprocess.CalledProcessError as e:
    #     print(f"An error occurred: {e}")
    #     print(f"Error output: {e.stderr}")
    #     return None

    try:
            # Run the script with the paths to the temporary files
        process = subprocess.Popen(
            [sys.executable, script_path, train_path, test_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
            universal_newlines=True
        )
        
        # Stream output in real-time
        for line in process.stdout:
            print(line, end='')  # Print each line as it's generated
        
        # Wait for the process to complete and get the return code
        return_code = process.wait()
        
        # If there was an error, print the error output
        if return_code != 0:
            print("Error occurred. Error output:")
            for line in process.stderr:
                print(line, end='')
            return None
            
        return "Script completed successfully"
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

run_script_sys_executable(file_path, "training/train.csv", "training/test.csv")

Training Random Forest Classifier...
Predicting test data...
Calculating classification report...
Calculating feature importances...
Precision: 0.6244461879176436, Recall: 0.6196078431372549, F1 Score: 0.6216378633621631


'Script completed successfully'

In [104]:
with open("classification_metrics.pkl", "rb") as f:
    res_dict = pickle.load(f)

In [105]:
res_dict.keys()

dict_keys(['svm_classification_report', 'rf_classification_report', 'svm_cross_val_scores', 'rf_cross_val_scores'])

In [120]:
df_rf = res_dict["rf_classification_report"]

In [102]:
df_read = pd.read_csv("/Users/advaitbalaji/se_exercise/AnalysisData.csv", header=0)

In [95]:
df_read.head(5)

Unnamed: 0,SurfaceHoleLongitude,SurfaceHoleLatitude,BottomHoleLongitude,BottomHoleLatitude,Operator,CompletionDate,Reservoir,LateralLength_FT,ProppantIntensity_LBSPerFT,FluidIntensity_BBLPerFT,...,ReservoirPressure,WaterSaturation,StructureDerivative,TotalOrganicCarbon,ClayVolume,CarbonateVolume,Maturity,TotalWellCost_USDMM,CumOil12Month,rowID
0,-101.594663,32.30587,-101.602091,32.335669,FANG,11/6/2015,SPRABERRY LOWER SHALE,9897.0,,,...,1805.9655,0.598,0.013,1.9589,0.214,0.284,0.853,4.8647,114929.0,1001
1,-101.685,32.285148,-101.694277,32.314904,PXD,7/24/2015,WOLFCAMP A,4563.0,1700.0,34.0,...,3724.2295,0.32,0.013,2.2846,0.165,0.369,0.931,3.4619,62404.5195,1002
2,-101.619541,32.353532,-101.611362,32.333854,PXD,11/19/2015,WOLFCAMP B,4833.0,1508.0,37.0,...,4153.2573,0.445,0.013,2.8439,0.17,0.283,0.93,3.5627,124884.8672,1003
3,-101.580465,32.326561,-101.588574,32.354169,OVV,5/9/2017,WOLFCAMP A,4799.0,2577.0,51.0,...,3143.4885,0.327,0.002,2.7256,0.196,0.244,0.932,3.513,98523.5625,1004
4,-101.424883,32.196781,-101.435495,32.237272,OVV,5/12/2017,WOLFCAMP A,5058.0,2467.0,52.0,...,3817.9592,0.36,0.002,2.1426,0.137,0.416,0.932,3.6086,72951.4063,1005


In [103]:
df_read[0]


KeyError: 0

In [121]:
df_rf

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.723898,0.538636,0.752475,0.66902,0.67167,0.67167
recall,0.734118,0.557647,0.715294,0.66902,0.66902,0.66902
f1-score,0.728972,0.547977,0.733414,0.66902,0.670121,0.670121
support,425.0,425.0,425.0,0.66902,1275.0,1275.0


In [122]:
# Rename columns 0, 1, 2 to low, medium, high
df_rf = df_rf.rename(columns={"0": 'low', "1": 'medium', "2": 'high'})

In [123]:
df_rf

Unnamed: 0,low,medium,high,accuracy,macro avg,weighted avg
precision,0.723898,0.538636,0.752475,0.66902,0.67167,0.67167
recall,0.734118,0.557647,0.715294,0.66902,0.66902,0.66902
f1-score,0.728972,0.547977,0.733414,0.66902,0.670121,0.670121
support,425.0,425.0,425.0,0.66902,1275.0,1275.0


In [124]:
rdict = {"rf_res": df_rf}

In [125]:
with open("rf.pkl", "wb") as f:
    pickle.dump(rdict, f)