In [5]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn import preprocessing
from sklearn.cluster import KMeans, OPTICS, MeanShift, Birch, DBSCAN
from sklearn.neighbors import KNeighborsClassifier
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.mixture import GaussianMixture
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import homogeneity_completeness_v_measure, accuracy_score
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score, completeness_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import time
import psutil
np.set_printoptions(precision=3, linewidth=100)
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
#git.Repo.clone_from("https://github.com/petrobras/3W.git", "datasets")

In [3]:
from pathlib import Path

def read_csv_files(folder_path):
    dfs = []  # List to store the DataFrames from each file
    path = Path(folder_path)  # Convert the folder path to a Path object
    for file in path.glob("WELL*.csv"):
        df = pd.read_csv(file)
        dfs.append(df)
    return dfs

def read_datasets(folder_paths):
    all_dfs = []  # List to store DataFrames from all folders
    for folder_path in folder_paths:
        dfs = read_csv_files(folder_path)
        all_dfs.extend(dfs)
    return all_dfs

In [7]:
os.listdir('dataset')

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 'dataset.ini',
 'folds',
 'LICENSE-CC-BY',
 'README.md']

In [8]:
# Specify the folder paths for the datasets you want to read
folder_paths = [
    "dataset/1",
    "dataset/2",
    "dataset/5",
    "dataset/6",
    "dataset/7"
]

# Call the read_datasets function to read the datasets
datasets = read_datasets(folder_paths)
print("Number of datasets:", len(datasets))

Number of datasets: 49


In [9]:
import random

for folder_path in folder_paths:
    dfs = read_csv_files(folder_path)
    if folder_path in folder_paths:
        datasets.extend(dfs)
        

random.shuffle(datasets)
data = pd.concat(datasets, ignore_index=True)

# Print the shape of the shuffled dataset
print("Shape of the dataset:", data.shape)

Shape of the dataset: (3163628, 10)


In [None]:
#data=data.drop('timestamp', axis=1)

In [10]:
data.head()

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2014-02-12 17:03:33.000000,0.0,12804630.0,118.7102,2514857.0,83.49664,4802594.0,,0.0,0.0
1,2014-02-12 17:03:34.000000,0.0,12804800.0,118.7104,2514857.0,83.49661,4802581.0,,0.0,0.0
2,2014-02-12 17:03:35.000000,0.0,12804970.0,118.7106,2514857.0,83.49659,4802568.0,,0.0,0.0
3,2014-02-12 17:03:36.000000,0.0,12805130.0,118.7108,2514856.0,83.49656,4802555.0,,0.0,0.0
4,2014-02-12 17:03:37.000000,0.0,12805300.0,118.711,2514856.0,83.49654,4802542.0,,0.0,0.0


In [13]:
data['class'].value_counts(normalize=True)*100

0.0      39.979729
105.0    22.523666
107.0    20.090686
101.0     6.784655
102.0     4.619421
7.0       1.834860
2.0       1.141911
5.0       0.924239
6.0       0.918565
1.0       0.738838
106.0     0.443430
Name: class, dtype: float64

## Data preperation

### Clean Data: Remove Column with empty values

In [14]:
cleaned_data = data.dropna(axis='columns', how='all')  # Remove columns with missing values
cleaned_data = cleaned_data.dropna(axis=0)  # Remove rows with missing values
cleaned_data.isnull().sum()

timestamp     0
P-PDG         0
P-TPT         0
T-TPT         0
P-MON-CKP     0
T-JUS-CKP     0
P-JUS-CKGL    0
QGL           0
class         0
dtype: int64

In [17]:
cleaned_data['QGL']

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
3163623    0.0
3163624    0.0
3163625    0.0
3163626    0.0
3163627    0.0
Name: QGL, Length: 2585476, dtype: float64

In [None]:
# Preprocess the data
# Assuming the timestamp column is named 'timestamp'
# cleaned_data['timestamp'] = pd.to_datetime(cleaned_data['timestamp'])
# cleaned_data['year'] = cleaned_data['timestamp'].dt.year
# cleaned_data['month'] = cleaned_data['timestamp'].dt.month
# cleaned_data['day'] = cleaned_data['timestamp'].dt.day
# cleaned_data['hour'] = cleaned_data['timestamp'].dt.hour

In [19]:
cleaned_data

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,QGL,class
0,2014-02-12 17:03:33.000000,0.0,12804630.0,118.7102,2514857.0,83.49664,4802594.0,0.0,0.0
1,2014-02-12 17:03:34.000000,0.0,12804800.0,118.7104,2514857.0,83.49661,4802581.0,0.0,0.0
2,2014-02-12 17:03:35.000000,0.0,12804970.0,118.7106,2514857.0,83.49659,4802568.0,0.0,0.0
3,2014-02-12 17:03:36.000000,0.0,12805130.0,118.7108,2514856.0,83.49656,4802555.0,0.0,0.0
4,2014-02-12 17:03:37.000000,0.0,12805300.0,118.7110,2514856.0,83.49654,4802542.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
3163623,2019-04-03 14:59:56.000000,0.0,8489354.0,109.7213,1496222.0,73.38219,9706965.0,0.0,107.0
3163624,2019-04-03 14:59:57.000000,0.0,8489349.0,109.7212,1495828.0,73.38310,9706899.0,0.0,107.0
3163625,2019-04-03 14:59:58.000000,0.0,8489344.0,109.7210,1495433.0,73.38401,9706834.0,0.0,107.0
3163626,2019-04-03 14:59:59.000000,0.0,8489338.0,109.7209,1495039.0,73.38493,9706768.0,0.0,107.0


In [20]:
cleaned_data.drop('timestamp',axis=1,inplace=True)

## Split the data

In [26]:
X = cleaned_data.drop('class', axis =1)
y = cleaned_data['class'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, stratify=y, random_state=2)

In [28]:
# Smote

In [29]:
from imblearn.over_sampling import SMOTE

In [31]:
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [32]:
y_train.value_counts()

0      726638
105    444591
107    396060
101    133921
7       36218
6       17398
5       16896
1       14584
102     13030
106      6245
2        4252
Name: class, dtype: int64

In [37]:
pd.Series(y_train_res).value_counts()

105    726638
107    726638
0      726638
101    726638
1      726638
7      726638
6      726638
102    726638
5      726638
106    726638
2      726638
dtype: int64

## Scaling each feature to a given range: using MinMaxScaler

In [43]:
#minmax

from sklearn.preprocessing import MinMaxScaler

# Create the scaler
min_max_scaler = MinMaxScaler()

# Fit the scaler on the training data and transform it
# columns = [col for col in X_train.columns if col not in ['timestamp', 'class']]

X_train_scaled = min_max_scaler.fit_transform(X_train_res)
X_train_scaled = pd.DataFrame(X_train_scaled,columns=list(X_train_res))

# Transform the test data using the trained scaler
X_test_scaled = min_max_scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled,columns=list(X_train_res))

# Print the head of the scaled training and test data
print("Scaled Training Data:")
print(X_train_scaled.head())
print("\nScaled Test Data:")
print(X_test_scaled.head())

Scaled Training Data:
   P-PDG     P-TPT     T-TPT  P-MON-CKP  T-JUS-CKP  P-JUS-CKGL  QGL
0    1.0  0.002779  0.699886   0.401618   0.086048    0.009932  0.0
1    1.0  0.003009  0.988379   0.383627   0.363920    0.002163  0.0
2    1.0  0.004727  0.992239   0.148482   0.398549    0.000941  0.0
3    1.0  0.002852  0.994855   0.145522   0.425464    0.002818  0.0
4    0.0  0.005793  0.978809   0.993155   0.289398    0.001957  0.0

Scaled Test Data:
   P-PDG     P-TPT     T-TPT  P-MON-CKP  T-JUS-CKP  P-JUS-CKGL  QGL
0    1.0  0.004754  0.978183   0.714802   0.371799    0.000000  0.0
1    1.0  0.004714  0.993799   0.146641   0.399634    0.000932  0.0
2    1.0  0.000843  0.814758   0.195232   0.176125    0.009903  0.0
3    0.0  0.005535  0.989616   0.836518   0.340737    0.000394  0.0
4    1.0  0.001219  0.808937   0.237327   0.174767    0.009901  0.0


In [42]:
# X_train = X_train.drop('timestamp', axis =1)
# X_test = X_test.drop('timestamp', axis=1)

In [48]:
X_train = X_train_scaled.copy()
y_train = y_train_res.copy()
X_test = X_test_scaled.copy()

In [49]:
X_train.shape

(7993018, 7)

In [50]:
y_train.shape

(7993018,)

## Features selection

In [51]:
selector = SelectKBest(score_func=chi2, k=5)
X_train_selected = selector.fit_transform(X_train, y_train)

# Get the selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get the selected feature names
selected_feature_names = X_train.columns[selected_feature_indices]

# Transform the scaled testing set using the selected features
X_test_selected = X_test.iloc[:, selected_feature_indices]

X_train = X_train[selected_feature_names]
X_test = X_test_selected

In [52]:
print("Selected feature names:")
for feature in selected_feature_names:
    print(feature)

Selected feature names:
P-PDG
T-TPT
P-MON-CKP
T-JUS-CKP
P-JUS-CKGL


In [53]:
X_train.head()

Unnamed: 0,P-PDG,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL
0,1.0,0.699886,0.401618,0.086048,0.009932
1,1.0,0.988379,0.383627,0.36392,0.002163
2,1.0,0.992239,0.148482,0.398549,0.000941
3,1.0,0.994855,0.145522,0.425464,0.002818
4,0.0,0.978809,0.993155,0.289398,0.001957


In [54]:
X_test.head()

Unnamed: 0,P-PDG,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL
0,1.0,0.978183,0.714802,0.371799,0.0
1,1.0,0.993799,0.146641,0.399634,0.000932
2,1.0,0.814758,0.195232,0.176125,0.009903
3,0.0,0.989616,0.836518,0.340737,0.000394
4,1.0,0.808937,0.237327,0.174767,0.009901


In [55]:
def get_memory_usage():
    process = psutil.Process()
    return process.memory_info().rss

# Supervised machine learning

### Random Forest Classifier

In [57]:
rfc = RandomForestClassifier(n_jobs=-1,random_state=42)

mem_usage_before_rfc = get_memory_usage()

# Measure time taken
start_time = time.time()
rfc.fit(X_train, y_train)
end_time = time.time()
execution_time_rfc = end_time - start_time

# Measure memory consumption after fitting
mem_usage_after_rfc = get_memory_usage()
mem_usage_rfc = mem_usage_after_rfc - mem_usage_before_rfc

In [None]:
rfc_scores = cross_val_score(rfc, X_train, y_train, cv=5,n_jobs=-1) #evaluation scores for each fold of the cross-validation process.
print(rfc_scores)
print("avrage =", np.mean(rfc_scores))

In [None]:
pred_rfc = rfc.predict(X_test)

print(classification_report(y_test, pred_rfc))

In [None]:
print(accuracy_score(y_test, pred_rfc))

In [None]:
#rfc_disp = ConfusionMatrixDisplay(confusion_matrix(y_test, pred_rfc, labels=rfc.classes_), display_labels=rfc.classes_)
#rfc_disp.plot()
#plt.show()

In [None]:
print("Random Forest Classifier Memory Consumption:", mem_usage_rfc)
print("Random Forest Classifier Execution Time:", execution_time_rfc)

In [None]:
print("No. of Classes in Test data:")
print(y_test.value_counts())
print("----------------------------")
print("No. of Classes in prediction:")
pd.crosstab(y_test, pred_rfc, margins=True)

### KNN :

In [None]:
# Elbow
from tqdm.notebook import tqdm
wcss = []
for k in tqdm(range(2,11)):
    classifier_knn = KNeighborsClassifier(n_neighbors = k)
    classifier_knn.fit(X_train,y_train)
    wcss.append(classifier_knn.inertia_)

In [None]:
plt.plot(range(2,11),wcss)
plt.xlabel('K')
plt.ylabel('Error')

In [None]:
indx = np.array(wcss).argmin()

In [None]:
best_k = list(range(2,11))[indx] + 2

In [None]:
classifier_knn = KNeighborsClassifier(n_neighbors = best_k)

mem_usage_before_knn = get_memory_usage()

# Measure time taken
start_time = time.time()
classifier_knn.fit(X_train, y_train)
end_time = time.time()
execution_time_knn = end_time - start_time

# Measure memory consumption after fitting
mem_usage_after_knn = get_memory_usage()
mem_usage_knn = mem_usage_after_knn - mem_usage_before_knn

pred_knn = classifier_knn.predict(X_test)

In [None]:
knn_scores = cross_val_score(classifier_knn, X_train, y_train, cv=5)
print(knn_scores)
print("avrage =", np.mean(knn_scores))

In [None]:
print(classification_report(y_test, pred_knn))

In [None]:
#knn_disp = ConfusionMatrixDisplay(confusion_matrix(y_test, pred_knn, labels=classifier_knn.classes_), display_labels=classifier_knn.classes_)
#knn_disp.plot()
#plt.show()

In [None]:
print("KNN Memory Consumption:", mem_usage_knn)
print("KNN Execution Time:", execution_time_knn)

In [None]:
print("No. of Classes in Test data:")
print(y_test.value_counts())
print("----------------------------")
print("No. of Classes in prediction:")
pd.crosstab(y_test, pred_knn, margins=True)

### Decision Trees

In [None]:
dt = DecisionTreeClassifier(n_jobs=-1,random_state=42)

mem_usage_before_dt = get_memory_usage()

# Measure time taken
start_time = time.time()
dt.fit(X_train, y_train)
end_time = time.time()
execution_time_dt = end_time - start_time

# Measure memory consumption after fitting
mem_usage_after_dt = get_memory_usage()
mem_usage_dt = mem_usage_after_dt - mem_usage_before_dt

pred_dt = dt.predict(X_test)

In [None]:
dt_scores = cross_val_score(dt, X_train, y_train, cv=5)
print(dt_scores)
print("avrage =", np.mean(dt_scores))

In [None]:
print(classification_report(y_test, pred_dt, zero_division=0))

In [None]:
print(accuracy_score(y_test, pred_dt))

In [None]:
print("Decision Trees Memory Consumption:", mem_usage_dt)
print("Decision Trees Execution Time:", execution_time_dt)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

mem_usage_before_nb = get_memory_usage()

# Measure time taken
start_time = time.time()
nb.fit(X_train, y_train)
end_time = time.time()
execution_time_nb = end_time - start_time

# Measure memory consumption after fitting
mem_usage_after_nb = get_memory_usage()
mem_usage_nb = mem_usage_after_nb - mem_usage_before_nb


pred_nb = nb.predict(X_test)

In [None]:
nb_scores = cross_val_score(nb, X_train, y_train, cv=5)
print(nb_scores)
print("avrage =", np.mean(nb_scores))

In [None]:
print(classification_report(y_test, pred_nb, zero_division=0))

In [None]:
print(accuracy_score(y_test, pred_nb))

In [None]:
print("Naive Bayes Memory Consumption:", mem_usage_nb)
print("Naive Bayes Execution Time:", execution_time_nb)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
logreg = LogisticRegression()

mem_usage_before_logreg = get_memory_usage()

# Measure time taken
start_time = time.time()
logreg.fit(X_train, y_train)
end_time = time.time()
execution_time_logreg = end_time - start_time

# Measure memory consumption after fitting
mem_usage_after_logreg = get_memory_usage()
mem_usage_logreg = mem_usage_after_logreg - mem_usage_before_logreg

pred_logreg = logreg.predict(X_test)

In [None]:
logreg_scores = cross_val_score(logreg, X_train, y_train, cv=5)
print(logreg_scores)
print("avrage =", np.mean(logreg_scores))

In [None]:
print(classification_report(y_test, pred_logreg, zero_division=0))

In [None]:
print(accuracy_score(y_test, pred_logreg))

In [None]:
print("Logistic Regression  Memory Consumption:", mem_usage_logreg)
print("Logistic Regression  Execution Time:", execution_time_logreg)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Create subplots with 1 row and 5 columns
fig, axs = plt.subplots(1, 3, figsize=(10, 4))

# Plot for rf_predictions
sns.distplot((y_test - pred_rfc), ax=axs[0])
axs[0].set_title('RF Residuals')

# Plot for knn_predictions
sns.distplot((y_test - pred_knn), ax=axs[1])
axs[1].set_title('KNN Residuals')

# Plot for dt_predictions
sns.distplot((y_test - pred_dt), ax=axs[2])
axs[2].set_title('DT Residuals')

plt.tight_layout()
plt.show()

fig, axs = plt.subplots(1, 3, figsize=(10, 4))
# Plot for nb_predictions
sns.distplot((y_test - pred_nb), ax=axs[0])
axs[0].set_title('NB Residuals')

# Plot for logreg_predictions
sns.distplot((y_test - pred_logreg), ax=axs[1])
axs[1].set_title('Logistic Regression Residuals')

# Adjust layout and display the plots
plt.tight_layout()
plt.show()


### MLPC

In [None]:
mlpc = MLPClassifier()
mem_usage_before_mlpc = get_memory_usage()

# Measure time taken
start_time = time.time()
mlpc.fit(X_train, y_train)
end_time = time.time()
execution_time_mlpc = end_time - start_time

# Measure memory consumption after fitting
mem_usage_after_mlpc = get_memory_usage()
mem_usage_mlpc = mem_usage_after_mlpc - mem_usage_before_mlpc
pred_mlpc = mlpc.predict(X_test)

In [None]:
print(classification_report(y_test, pred_mlpc, zero_division=0))

In [None]:
print("MLPC Memory Consumption:", mem_usage_mlpc)
print("MLPC Execution Time:", execution_time_mlpc)

In [None]:
print("No. of Classes in Test data:")
print(cleaned_data['class'].value_counts())
print("----------------------------")
print("No. of Classes in prediction:")
pd.crosstab(y_test, pred_mlpc, margins=True)

### SVM

In [None]:
svm = svm.SVC()

mem_usage_before_svm = get_memory_usage()

# Measure time taken
start_time = time.time()
svm.fit(X_train, y_train)
end_time = time.time()
execution_time_svm = end_time - start_time

# Measure memory consumption after fitting
mem_usage_after_svm = get_memory_usage()
mem_usage_svm = mem_usage_after_svm - mem_usage_before_svm

pred_svm = svm.predict(X_test)


In [None]:
print(classification_report(y_test, pred_svc, zero_division=0))

In [None]:
print(accuracy_score(y_test, pred_svc))

In [None]:
print("SVM Memory Consumption:", mem_usage_svm)
print("SVM Execution Time:", execution_time_svm)