In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import datetime

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict

In [None]:
# Seed Python's and NumPy's global random number generators.
seed = 309
random.seed(seed)
np.random.seed(seed)

# Load the raw data and drop its first column.
# NOTE(review): the first column is presumably an exported row index - confirm against the CSV.
df = pd.read_csv("diamonds.csv")
df = df.drop(columns=df.columns[0])

# Surface the missing-value check instead of discarding its result:
# nothing is printed when the data has no missing values.
missing_per_column = df.isnull().sum()
if missing_per_column.any():
    print("Columns with missing values:\n", missing_per_column[missing_per_column > 0])

df.shape

In [None]:
# Display pandas' summary statistics for the loaded frame.
df.describe()

In [None]:
# cut, color and clarity are still strings at this point; recent pandas versions
# raise when corr() meets non-numeric columns, so correlate the numeric columns only.
df.select_dtypes(include="number").corr()

From this we can see that there is a high correlation between x, y, z and carat, while depth and table have a low correlation with the other variables as well as with the price

In [None]:
# List the distinct values of every column, one column per printed entry.
for name, values in df.items():
    print(name, ":", values.unique(), "\n")

The cell below converts the string values of cut, color and clarity into ordinal rankings. Because diamond grades have a natural order, a single ranked column is used for each feature instead of one boolean column per category; the highest rank is 1 and the lowest is 5, 7 or 8, depending on the number of unique values in the column

In [None]:
# Rank assigned to every grade label of the three categorical columns
# (rank 1 is the highest rank, as described in the notebook text).
ORDINAL_RANKS = {
    'cut': {'Ideal': 1, 'Premium': 2, 'Very Good': 3, 'Good': 4, 'Fair': 5},
    'color': {'D': 1, 'E': 2, 'F': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7},
    'clarity': {'IF': 1, 'VVS1': 2, 'VVS2': 3, 'VS1': 4,
                'VS2': 5, 'SI1': 6, 'SI2': 7, 'I1': 8},
}

for column, ranks in ORDINAL_RANKS.items():
    # Values without a rank are passed through unchanged, so re-running this
    # cell on already-encoded columns is harmless.
    recoded = df[column].map(lambda label, ranks=ranks: ranks.get(label, label))
    # Convert to a numeric dtype (the np.where chain left object columns, which
    # df.hist skips); this raises ValueError if an unranked string label remains.
    df[column] = pd.to_numeric(recoded)

df.head()

In [None]:
# Re-inspect the distinct values per column after the encoding step.
for column in list(df):
    distinct_values = df[column].unique()
    print(column, ":", distinct_values, "\n")

In [None]:
# One 10-bin histogram per column that pandas can plot, on a 14x10 inch figure.
hist_axes = df.hist(figsize=(14, 10), bins=10)
plt.show()

The skew in the data, especially in the y and z values, could suggest applying a log transform to those columns to bring them closer to a standard or Gaussian distribution. However, due to the low range of values in these columns (<60 for y and <30 for z), I decided to leave them in their current form before moving on to splitting up the data

#### Below the data is split into the training and test sets and processed


In [None]:
# 70/30 split of the full frame, seeded for repeatability.
train_split, test_split = train_test_split(df, test_size=0.3, random_state=seed)

# Keep a complete copy of each partition, then separate the target from the features.
train_data_full = train_split.copy()
test_data_full = test_split.copy()

train_labels = train_data_full["price"]
test_labels = test_data_full["price"]

train_data = train_split.drop(columns=["price"])
test_data = test_split.drop(columns=["price"])

train_data.shape

In [None]:
# Preview the first rows of the training features (price column removed).
train_data.head()

In [None]:
# Standardise both partitions with statistics taken from the training features only,
# so the test features are scaled with the same mean and standard deviation.
train_mean, train_std = train_data.mean(), train_data.std()
train_data = train_data.sub(train_mean).div(train_std)
test_data = test_data.sub(train_mean).div(train_std)

## Below are all the regression techniques, with their code and outputs, in the order given in the assignment handout

### Linear Regression

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

lr_baseline = LinearRegression()
lr_baseline.fit(train_data, train_labels)
# Score the model that was actually fitted above. cross_val_predict would refit
# clones of the estimator on folds of the test set and ignore this fitted model,
# so the metrics would not describe the coefficients printed in the next cell.
lr_y_pred = lr_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Show the fitted parameters of the linear model.
for label, fitted_value in (
    ("Coefficients: ", lr_baseline.coef_),
    ("Intercept: ", lr_baseline.intercept_),
):
    print(label, fitted_value)

In [None]:
# Compute every score for the linear regression predictions first, then print
# them in a fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, lr_y_pred)
r2_error = r2_score(test_labels, lr_y_pred)
mae = mean_absolute_error(test_labels, lr_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### KNN Regression

The code below is all in relation to the KNN Regression task: it takes the training and test data that were split and processed above, passes them into the KNN regressor and then prints the four performance metrics

In [None]:
# Create the KNN regressor (21 neighbours, distance-weighted) and time
# fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

knn_baseline = KNeighborsRegressor(n_neighbors = 21, weights='distance')
knn_baseline.fit(train_data, train_labels)
# Predict with the model fitted above. cross_val_predict would refit clones on
# folds of the test set, so the training data would never influence the scores.
knn_y_pred = knn_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the KNN predictions first, then print them in a
# fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, knn_y_pred)
knn_r2_error = r2_score(test_labels, knn_y_pred)
mae = mean_absolute_error(test_labels, knn_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", knn_r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### Ridge's Regression

The following code relates to the Ridge regression problem. It runs the machine learning algorithm on the data standardised above and prints out the performance metrics at the end

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

ridge_baseline = Ridge()
ridge_baseline.fit(train_data, train_labels)
# Predict with the model fitted above. cross_val_predict would refit clones on
# folds of the test set and leave this fitted model unused.
ridge_y_pred = ridge_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the ridge predictions first, then print them in a
# fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, ridge_y_pred)
r2_error = r2_score(test_labels, ridge_y_pred)
mae = mean_absolute_error(test_labels, ridge_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### Decision Tree Regression

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

dt_baseline = DecisionTreeRegressor(min_impurity_decrease = 1.0)
dt_baseline.fit(train_data, train_labels)
# Predict with the tree fitted above. cross_val_predict would refit clones on
# folds of the test set and leave this fitted model unused.
dt_y_pred = dt_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the decision tree predictions first, then print them
# in a fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, dt_y_pred)
r2_error = r2_score(test_labels, dt_y_pred)
mae = mean_absolute_error(test_labels, dt_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### Random Forest Example

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

rf_baseline = RandomForestRegressor(n_estimators = 20,
                                    min_impurity_decrease = 1.0)
rf_baseline.fit(train_data, train_labels)
# Predict with the forest fitted above. cross_val_predict would refit clones on
# folds of the test set and leave this fitted model unused.
rf_y_pred = rf_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Score the random forest predictions against the held-out prices.
mse = mean_squared_error(test_labels, rf_y_pred)
print("MSE: {error}".format(error=mse))

print("RMSE: {error}".format(error=np.sqrt(mse)))

r2_error = r2_score(test_labels, rf_y_pred)
print("R2: {error}".format(error=r2_error))

# MAE must use the random forest predictions (rf_y_pred); the decision tree's
# dt_y_pred was used here by mistake, reporting the wrong model's error.
mae = mean_absolute_error(test_labels, rf_y_pred)
print("MAE: {error}".format(error=mae))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### Gradient Boosting Regression

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

gb_baseline = GradientBoostingRegressor(learning_rate = 0.2, max_depth = 5, min_impurity_decrease = 0.1)
gb_baseline = gb_baseline.fit(train_data, train_labels)
# Predict with the model fitted above. cross_val_predict would refit clones on
# folds of the test set and leave this fitted model unused.
gb_y_pred = gb_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the gradient boosting predictions first, then print
# them in a fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, gb_y_pred)
r2_error = r2_score(test_labels, gb_y_pred)
mae = mean_absolute_error(test_labels, gb_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### SGD Regression

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

# tol and max_iter are set explicitly to avoid messages about these parameters
# being left unset and falling back to defaults.
# NOTE(review): these are described as the library's standard values; confirm
# against the installed scikit-learn version.
sgd_baseline = SGDRegressor(tol = None, max_iter = 5)
sgd_baseline = sgd_baseline.fit(train_data, train_labels)
# Predict with the model fitted above. cross_val_predict would refit clones on
# folds of the test set and leave this fitted model unused.
sgd_y_pred = sgd_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the SGD predictions first, then print them in a
# fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, sgd_y_pred)
r2_error = r2_score(test_labels, sgd_y_pred)
mae = mean_absolute_error(test_labels, sgd_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### Support Vector Regression

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

svr_baseline = SVR(C = 1500)
svr_baseline = svr_baseline.fit(train_data, train_labels)
# Predict with the model fitted above. cross_val_predict would refit clones on
# folds of the test set and leave this fitted model unused.
svr_y_pred = svr_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the SVR predictions first, then print them in a
# fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, svr_y_pred)
r2_error = r2_score(test_labels, svr_y_pred)
mae = mean_absolute_error(test_labels, svr_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### Linear SVR

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

l_svr_baseline = LinearSVR(C = 1500)
l_svr_baseline = l_svr_baseline.fit(train_data, train_labels)
# Predict with the model fitted above. cross_val_predict would refit clones on
# folds of the test set and leave this fitted model unused.
l_svr_y_pred = l_svr_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the linear SVR predictions first, then print them in
# a fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, l_svr_y_pred)
r2_error = r2_score(test_labels, l_svr_y_pred)
mae = mean_absolute_error(test_labels, l_svr_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))

### Multilayer Perceptron

In [None]:
# Time fitting on the training set plus prediction on the held-out test set.
start_time = datetime.datetime.now()

# NOTE(review): momentum is passed together with solver='lbfgs'; confirm in the
# scikit-learn documentation whether it has any effect with this solver.
mlp_baseline = MLPRegressor(max_iter = 1000, solver = 'lbfgs', momentum = 0.2)
mlp_baseline = mlp_baseline.fit(train_data, train_labels)
# Predict with the network fitted above. cross_val_predict would refit clones
# on folds of the test set and leave this fitted model unused.
mlp_y_pred = mlp_baseline.predict(test_data)

end_time = datetime.datetime.now()
exec_time = (end_time - start_time).total_seconds()

In [None]:
# Compute every score for the multilayer perceptron predictions first, then
# print them in a fixed order: MSE, RMSE, R2, MAE.
mse = mean_squared_error(test_labels, mlp_y_pred)
r2_error = r2_score(test_labels, mlp_y_pred)
mae = mean_absolute_error(test_labels, mlp_y_pred)

for template, value in (
    ("MSE: {error}", mse),
    ("RMSE: {error}", np.sqrt(mse)),
    ("R2: {error}", r2_error),
    ("MAE: {error}", mae),
):
    print(template.format(error=value))

# Wall-clock time measured by the fitting cell.
print("Execution time = {t:.3f} seconds".format(t = exec_time))