In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV
from sklearn import metrics
import itertools

from warnings import filterwarnings
# Silence every warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by any library
# used below are hidden as well — consider narrowing it to specific categories.
filterwarnings('ignore')

In [64]:
# Train and evaluate a k-nearest-neighbours regressor that predicts
# 'thermal_conductivity' from a fixed subset of principal components of the
# min-max scaled features, then report test R-squared and 10-fold CV R-squared.

# Read dataset
data = pd.read_csv("../../Copper Dataset (version 4.0).csv")

# Keep the last 32 columns and discard the 'tensile_strength' column,
# which is not used in this model
tc_data = data.iloc[:, -32:].drop('tensile_strength', axis=1)
# Remove duplicate rows (the first occurrence of each row is kept)
tc_data = tc_data[~tc_data.duplicated(keep='first')]

# Split into features and target variable.
# The target is dropped by name rather than by position so that it can never
# leak into the feature matrix if the column order of the CSV changes.
target_column = 'thermal_conductivity'
X = tc_data.drop(target_column, axis=1)
y = tc_data[target_column]

# Split into train and test data (5% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Normalise training data (the scaler is fitted on the training split only)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Perform PCA on normalised training data
pca = PCA()
X_train_scaled_pca = pca.fit_transform(X_train_scaled)

# Use Principal Components (0, 1, 6, 14, 20, 15, 19, 10, 7, 28, 5, 29, 17, 25, 16) as training data
best_principal_components = (0, 1, 6, 14, 20, 15, 19, 10, 7, 28, 5, 29, 17, 25, 16)
# Fail early with a readable message if the data has fewer components than expected
assert max(best_principal_components) < X_train_scaled_pca.shape[1], (
    "selected principal component index exceeds the number of fitted components"
)
x_train = X_train_scaled_pca[:, best_principal_components]

# Build model with the following hyperparameters
    # n_neighbors = 5, leaf_size = 1, p = 2, algorithm = 'ball_tree', weights = 'uniform'
model = KNeighborsRegressor(n_neighbors=5, leaf_size=1, p=2, algorithm='ball_tree', weights='uniform')
model.fit(x_train, y_train)

# Normalise test data with the already-fitted scaler and project it onto the
# subspace spanned by the selected Principal Components
X_test_scaled = scaler.transform(X_test)
X_test_scaled_pca = pca.transform(X_test_scaled)
x_test = X_test_scaled_pca[:, best_principal_components]

# Make predictions
test_pred = model.predict(x_test)

# Calculate R-squared on test data predictions
r2_score = metrics.r2_score(y_test, test_pred)
print("R-squared on Test Data = %.4f"%r2_score)

# Calculate mean r-squared with 10 fold cross validation.
# NOTE(review): x_train was produced by a scaler and PCA fitted on the whole
# training split, so every validation fold has already influenced the
# preprocessing. The CV estimate may therefore be biased; wrapping the scaler,
# PCA and regressor in a single pipeline and cross-validating on X_train would
# refit the preprocessing inside each fold.
cv_results = cross_validate(model, x_train, y_train, scoring='r2', cv=10)['test_score']
mean_cv_score = cv_results.mean()
print("Mean R-squared with 10 Fold Cross Validation = %.4f"%mean_cv_score)

R-squared on Test Data = 0.9588
Mean R-squared with 10 Fold Cross Validation = 0.8125
