In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [101]:
# Read the raw training table, discard the 'Unnamed: 0' column (presumably a
# saved row index -- confirm against the CSV), remove duplicate rows, and turn
# +/-infinity into NaN so infinite readings are treated as missing from here on.
training_data = (
    pd.read_csv('Data/training.csv')
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates()
    .replace([np.inf, -np.inf], np.nan)
)

# Record the index labels of rows whose 'OilPeakRate' is missing *before* any
# imputation takes place, so that those rows can be removed afterwards.
rows_with_missing_values = training_data.index[training_data['OilPeakRate'].isna()].to_list()

In [102]:
# Fill missing feature values, then one-hot encode the categorical columns.
#
# The target 'OilPeakRate' is deliberately left out of the imputation. Rows
# without a target are removed later through `rows_with_missing_values`;
# filling them with the median would fabricate labels that a model could be
# trained on silently if that removal step were ever skipped.
target_column = 'OilPeakRate'

# Numeric features: replace NaN with the column median.
numeric_columns = (
    training_data.select_dtypes(include=['float64', 'int64'])
    .columns.drop(target_column, errors='ignore')
)
if len(numeric_columns) > 0:  # SimpleImputer rejects an empty column selection
    numeric_imputer = SimpleImputer(strategy='median')
    training_data[numeric_columns] = numeric_imputer.fit_transform(training_data[numeric_columns])

# Categorical features: replace NaN with the most frequent value of the column.
categorical_columns = training_data.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:  # nothing to impute for an all-numeric table
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    training_data[categorical_columns] = categorical_imputer.fit_transform(training_data[categorical_columns])

# Encode categorical variables
training_data = pd.get_dummies(training_data)
print(training_data.shape)

(29682, 46)


In [86]:
# Encode categorical variables (KNNImputer needs an all-numeric table)
training_data = pd.get_dummies(training_data)
print(training_data.shape)

# Keep column names
training_data_columns = training_data.columns

# Impute missing *feature* values with KNNImputer.
# The target 'OilPeakRate' is excluded on purpose: if it were part of the
# matrix, neighbour distances -- and therefore the imputed feature values --
# would depend on the label, leaking target information into the features.
# Rows with a missing target are handled separately via
# `rows_with_missing_values`.
feature_columns = training_data_columns.drop('OilPeakRate', errors='ignore')
imputer = KNNImputer(n_neighbors=5)
training_data[feature_columns] = imputer.fit_transform(training_data[feature_columns])

(29682, 46)


In [103]:
# Drop rows with missing values for 'OilPeakRate' (labels recorded in
# `rows_with_missing_values` before imputation).
# The result is rebound instead of mutated in place, and errors='ignore' skips
# labels that are already gone, so re-running this cell is a no-op rather than
# a KeyError.
training_data = training_data.drop(index=rows_with_missing_values, errors='ignore')

In [99]:
# One-hot encode the categorical columns and report the resulting frame size.
# NOTE(review): the same pd.get_dummies step already appears in earlier cells
# of this notebook -- confirm whether this cell is still needed.
training_data = pd.get_dummies(data=training_data)
print(training_data.shape)

(19306, 44)


In [88]:
# Separate the predictors from the 'OilPeakRate' target.
X = training_data.drop(columns=['OilPeakRate'])
y = training_data['OilPeakRate']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit a decision tree limited to depth 5 on the training portion.
model = DecisionTreeRegressor(max_depth=5, random_state=123).fit(X_train, y_train)

# Score on the held-out rows: RMSE is the square root of the mean squared error.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Test set RMSE: {:.2f}'.format(rmse))

Test set RMSE: 125.02


In [104]:
# Fit a 50-tree random forest on the existing train split
# (X_train / y_train come from the train/test split cell).
model = RandomForestRegressor(n_estimators=50, random_state=123).fit(X_train, y_train)

# Predict the held-out rows and report the root mean squared error.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Test set RMSE: {:.2f}'.format(rmse))

Test set RMSE: 102.73
