In [3]:
import import_ipynb
import basefile as bf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

importing Jupyter notebook from basefile.ipynb


In [4]:
data = pd.read_csv('data/WineQT.csv')
data = data.drop(columns=['Id'])
X = data.drop(columns=['quality'])
y = data['quality']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Multinomial Logistic Regression

In [5]:
learning_rates = [0.1, 0.01, 0.001, 0.0001]
max_iters = [1000, 2000, 5000, 10000]
best_acc = 0
best_lr = None
best_iter = None
for lr in learning_rates:
    for iters in max_iters:
        clf = bf.MultinomialLogisticRegression(learning_rate=lr, max_iter=iters)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        if acc > best_acc:
            best_acc = acc
            best_lr = lr
            best_iter = iters
print("Best accuracy: ", best_acc)
print("Best learning rate: ", best_lr)
print("Best iteration: ", best_iter)

Best accuracy:  0.6666666666666666
Best learning rate:  0.1
Best iteration:  2000


# Decision Tree

In [6]:
critera = ["gini", "entropy", "log_loss"]
max_depths = [1, 5, 10, 100, 500,  1000, 10000]
best_accuracy = 0
best_max_depth = None
best_criterion = None
for c in critera:
    for max_depth in max_depths:
        dtc = DecisionTreeClassifier(max_depth=max_depth, criterion=c ,random_state=42)
        dtc.fit(X_train, y_train)
        y_pred = dtc.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_max_depth = max_depth
            best_criterion = c
print("Best accuracy: ", best_accuracy)
print("Best max depth: ", best_max_depth)
print("Best criterion: ", best_criterion)  

Best accuracy:  0.6374269005847953
Best max depth:  100
Best criterion:  gini


# MLP

In [None]:
class_labels = [[3], [4], [5], [6], [7], [8]]
data = pd.read_csv('wineQT.csv')                                
X = data.drop(['quality'], axis=1)                             
y = data['quality']
encoder = OneHotEncoder(sparse_output=False)                    
y_encoded = encoder.fit_transform(y.values.reshape(-1, 1))
scaler = StandardScaler()                                       
X = scaler.fit_transform(X)
X_train, xtemp, y_train, ytemp = train_test_split(X, y_encoded, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(xtemp, ytemp, test_size=0.4, random_state=42)
learning_rates = [0.01, 0.001, 0.0001]
activations = ['tanh', 'relu', 'sigmoid']
hidden_layers = [[10], [10, 15], [10, 15, 10]]
mse = []

for activation in activations:
    for lr in learning_rates:
        for hidden_layer in hidden_layers:
            config={"learning_rate": lr, "activation": activation, "hidden_layer": hidden_layer, "Optimizer": "SGD"}
            mlp = bf.MLPClassifier(X_train.shape[1], config.hidden_layer, config.learning_rate, config.activation, epoch=2000)
            mlp.train(X_train, y_train)
            mlp.train(X_train, y_train.values.reshape(-1, 1))
            mse.append([config, mlp.mse(X_val, y_val.values.reshape(-1, 1))])

for lr in learning_rates:
    for activation in activations:
        for hidden_layer in hidden_layers:
            config={"learning_rate": lr, "activation": activation, "hidden_layer": hidden_layer, "Optimizer": "Batch"}
            mlp = bf.MLPClassifier(X_train.shape[1], config.hidden_layer, config.learning_rate, config.activation, epoch=2000)
            mlp.trainbatch(X_train, y_train)
            mlp.train(X_train, y_train.values.reshape(-1, 1))
            mse.append([config, mlp.mse(X_val, y_val.values.reshape(-1, 1))])

for lr in learning_rates:
    for activation in activations:
        for hidden_layer in hidden_layers:
            config={"learning_rate": lr, "activation": activation, "hidden_layer": hidden_layer, "Optimizer": "MiniBatch"}
            mlp = bf.MLPClassifier(X_train.shape[1], config.hidden_layer, config.learning_rate, config.activation, epoch=2000)
            mlp.trainminibatch(X_train, y_train)
            mlp.train(X_train, y_train.values.reshape(-1, 1))
            mse.append([config, mlp.mse(X_val, y_val.values.reshape(-1, 1))])

## Best for MLP

In [34]:
clf = bf.MLPClassifier(input_size=X.shape[1], hidden_layers=[10, 5], learning_rate=0.01, activation='sigmoid', epoch = 100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
y_pred = y_pred.reshape(-1,)
print("Best mse: ", np.mean((np.array(y_val) - y_pred)**2))
print("Best activation function: tanh")
print("Best epochs: 10000")
print("Best learning rate: 0.001")
print("Best neurons: 10, 5")
print("Best optimizer: sgd")

Best mse:  82.8746052631579
Best activation function: tanh
Best epochs: 10000
Best learning rate: 0.001
Best neurons: 10, 5
Best optimizer: sgd


# Now for regression

In [8]:
data = pd.read_csv('data/HousingData.csv')
data.fillna(data.mean(), inplace=True)
X = data.drop(columns=['MEDV'])
y = data['MEDV']
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Linear Regression

In [9]:
learning_rates = [0.1, 0.01, 0.001, 0.0001]
iterations = [1000, 10000, 50000, 100000]
best_loss = np.inf
best_learning_rate = None
best_iterations = None
for lr in learning_rates:
    for it in iterations:
        model = bf.LinearRegression(learning_rate=lr, n_iterations=it)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        loss = model.mean_squared_error(y_val, y_pred)
        if loss < best_loss:
            best_loss = loss
            best_learning_rate = lr
            best_iterations = it
print("Best loss: ", best_loss)
print("Best learning rate: ", best_learning_rate)
print("Best iterations: ", best_iterations)

Best loss:  15.408819875945705
Best learning rate:  0.1
Best iterations:  100000


# Decision Tree Regression

In [10]:
max_depths = [1, 20, 50, 100]
criteria = ["squared_error"]#, "friedman_mse", "absolute_error", "poisson"]
best_mse = np.inf
best_depth = None
best_criterion = None
for d in max_depths:
    for c in criteria:
        model = bf.DecisionTreeRegression(max_depth=d, criterion=c)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        if mse < best_mse:
            best_mse = mse
            best_depth = d
            best_criterion = c
print("Best MSE: ", best_mse)   
print("Best depth: ", best_depth)   
print("Best criterion: ", best_criterion)   

Best MSE:  10.629210526315788
Best depth:  50
Best criterion:  squared_error


# MLP Regression

In [None]:
data = pd.read_csv('HousingData.csv')
data.fillna(data.mean(), inplace=True)
X = data.drop(columns=['MEDV'])
y = data['MEDV']
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
learning_rates = [0.01, 0.001, 0.0001]
activations = ['tanh', 'relu', 'sigmoid']
hidden_layers = [[10], [10, 15], [10, 15, 10]]
MSE = []
for activation in activations:
    for lr in learning_rates:
        for hidden_layer in hidden_layers:
            config={"learning_rate": lr, "activation": activation, "hidden_layer": hidden_layer, "Optimizer": "SGD"}
            mlp = bf.MLPRegressor(X_train.shape[1], config.hidden_layer, config.learning_rate, config.activation, epoch = 2000)
            mlp.train(X_train, y_train.values.reshape(-1, 1))
            mse.append([config, mlp.mse(X_val, y_val.values.reshape(-1, 1))])

for lr in learning_rates:
    for activation in activations:
        for hidden_layer in hidden_layers:
            config={"learning_rate": lr, "activation": activation, "hidden_layer": hidden_layer, "Optimizer": "Batch"}
            mlp = bf.MLPRegressor(X_train.shape[1], config.hidden_layer, config.learning_rate, config.activation, epoch = 2000)
            mlp.trainbatch(X_train, y_train.values.reshape(-1, 1))
            mse.append([config, mlp.mse(X_val, y_val.values.reshape(-1, 1))])

for lr in learning_rates:
    for activation in activations:
        for hidden_layer in hidden_layers:
            config={"learning_rate": lr, "activation": activation, "hidden_layer": hidden_layer, "Optimizer": "MiniBatch"}
            mlp = bf.MLPRegressor(X_train.shape[1], config.hidden_layer, config.learning_rate, config.activation, epoch = 2000)
            mlp.trainminibatch(X_train, y_train.values.reshape(-1, 1))
            mse.append([config, mlp.mse(X_val, y_val.values.reshape(-1, 1))])

## Best for MLP regression

In [11]:
model = bf.MLPRegressor(input_size=X.shape[1], hidden_layers=[10, 15], learning_rate=0.01, activation='tanh', epoch = 2000)             
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Squared Error: 35.39985100555057
Root Mean Squared Error: 5.949777391260161
R-squared: 0.5546032363318844
