In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

In [17]:
df = pd.read_csv('data_2genre.csv')
df.iloc[:, -1] = df.iloc[:,-1].replace({2:0})

In [18]:
X = df.iloc[:, 1:-1]
X = normalize(X)
y = df.iloc[:, -1] 

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0) 
X_train, X_test, y_train, y_test = X_train.T, X_test.T, y_train.values.reshape(1, 150), y_test.values.reshape(1, 50)

In [20]:
learning_rate = 0.5
num_iter = 10000

In [21]:
def sigmoid(z):
    return (1 / (1 + np.exp(-z)))

In [22]:
def train(X_train, y_train, num_iter=10000, learning_rate=0.5, hidden_layer_num=15):
    
    n = X_train.shape[0]
    m = X_train.shape[1]
    w1 = np.random.randn(hidden_layer_num, n) * 0.01 #
    b1 = np.random.randn(hidden_layer_num, 1)
    w2 = np.random.randn(1, hidden_layer_num) * 0.01 #
    b2 = np.random.randn(1, 1)
    for i in range(num_iter):
        Z1 = np.dot(w1, X_train) + b1
        A1 = sigmoid(Z1) 
        Z2 = np.dot(w2, A1) + b2
        A2 = sigmoid(Z2) 
        cost = np.sum((y_train * np.log(A2)) + ((1-y_train) * np.log(1-A2))) * (-1/m)

        dZ2 = A2 - y_train
        dW2 = np.dot(dZ2, A1.T) / m  #######
        db2 = np.sum(dZ2, axis=1, keepdims=True) * (1 / m)

        dA1 = np.dot(w2.T, dZ2)
        dZ1 = dA1 * sigmoid(Z1) * (1 - sigmoid(Z1))
        dW1 = np.dot(dZ1, X_train.T) / m #######
        db1 = np.sum(dZ1, axis=1, keepdims=True) * (1 / m)

        w2 = w2 - learning_rate * dW2
        b2 = b2 - learning_rate * db2
        w1 = w1 - learning_rate * dW1
        b1 = b1 - learning_rate * db1

        if(i %1000 == 0):
            print(f'At epoch {i}, cost is {cost}')
    return [w1, w2, b1, b2]

In [23]:
params = train(X_train, y_train,num_iter=20000, learning_rate=1, hidden_layer_num=17)

At epoch 0, cost is 0.6983904061039693
At epoch 1000, cost is 0.15380232514844563
At epoch 2000, cost is 0.07617707116453971
At epoch 3000, cost is 0.058306355223167676
At epoch 4000, cost is 0.05098309807245595
At epoch 5000, cost is 0.047083470152077644
At epoch 6000, cost is 0.04468171702157733
At epoch 7000, cost is 0.04304304330866654
At epoch 8000, cost is 0.041835867344083344
At epoch 9000, cost is 0.04089230873007523
At epoch 10000, cost is 0.04011943129680138
At epoch 11000, cost is 0.039462229836810206
At epoch 12000, cost is 0.03888647812064723
At epoch 13000, cost is 0.03837003595225454
At epoch 14000, cost is 0.03789811966760169
At epoch 15000, cost is 0.03746058335594751
At epoch 16000, cost is 0.03705028752401399
At epoch 17000, cost is 0.036662086793999175
At epoch 18000, cost is 0.036292184800545496
At epoch 19000, cost is 0.03593771423942997


In [27]:
def score(X, y, w1, w2, b1, b2):
    Z1 = np.dot(w1, X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = sigmoid(Z2)
    
    # Convert predicted probabilities to binary predictions (0 or 1)
    Y_prediction = (A2 > 0.5).astype(float)
    
    # Compute RMSE
    rmse = np.sqrt(np.mean((Y_prediction - y)**2))
    
    print("RMSE: {:.4f}".format(rmse))
    
    # Optionally, you can print other classification metrics
    accuracy = 100 - np.mean(np.abs(Y_prediction - y)) * 100
    print("Accuracy: {:.2f}%".format(accuracy))

# Evaluate on the training set
score(X_train, y_train, *params)
score(X_test, y_test, *params)

RMSE: 0.1155
Accuracy: 98.67%
RMSE: 0.1414
Accuracy: 98.00%


In [2]:
#Logistics
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read data and preprocess
df = pd.read_csv('data_2genre.csv')
df.iloc[:, -1] = df.iloc[:, -1].replace({2: 0})
X = df.iloc[:, 1:-1]
X = normalize(X)
y = df.iloc[:, -1]

# Specify the test size as 25%
test_size = 0.25

# Split data into training and testing sets with the specified test size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)

# Initialize Logistic Regression model
logreg_model = LogisticRegression()

# Train the model
logreg_model.fit(X_train, y_train)

# Make predictions on the training and testing sets
y_train_pred = logreg_model.predict(X_train)
y_test_pred = logreg_model.predict(X_test)

# Compute accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print accuracies
print("Training Accuracy: {:.2f}%".format(train_accuracy * 100))
print("Testing Accuracy: {:.2f}%".format(test_accuracy * 100))


Training Accuracy: 92.67%
Testing Accuracy: 96.00%


In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read data and preprocess
df = pd.read_csv('data_2genre.csv')
df.iloc[:, -1] = df.iloc[:, -1].replace({2: 0})
X = df.iloc[:, 1:-1]
X = normalize(X)
y = df.iloc[:, -1]

# Initialize Linear Regression model with cross-validation
linear_model = LinearRegression()

# Perform cross-validation
cv_predictions = cross_val_score(linear_model, X, y, cv=5, scoring='neg_mean_squared_error')  # Use neg_mean_squared_error for RMSE

# Compute RMSE
rmse_scores = np.sqrt(-cv_predictions)

# Perform cross-validation
cv_scores = cross_val_score(logreg_model, X, y, cv=5)  # adjust the number of folds (cv)
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy: {:.2f}%".format(np.mean(cv_scores) * 100))


# Print RMSE scores and mean RMSE
print("Cross-validation RMSE Scores:", rmse_scores)
print("Mean RMSE: {:.2f}".format(np.mean(rmse_scores)))


Cross-validation Scores: [0.9   0.95  0.925 0.975 0.925]
Mean CV Accuracy: 93.50%
Cross-validation RMSE Scores: [0.10837562 0.17920631 0.14065937 0.20159851 0.21436923]
Mean RMSE: 0.17


# Multiclass 

In [27]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

In [28]:
data = pd.read_csv('C:/Users/katyc/Downloads/data.csv')

In [29]:
X = data.iloc[:, 1:-1]
X = normalize(X)

y = data.iloc[:, -1] 
y_labelled = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_labelled, random_state = 0) 
X_train, X_test, y_train, y_test = X_train.T, X_test.T, y_train.reshape(1, 750), y_test.reshape(1, 250)


yone_train = OneHotEncoder(sparse=False).fit_transform(y_train.reshape(-1, 1), y=None)
yone_train = yone_train.T

yone_test = OneHotEncoder(sparse=False).fit_transform(y_test.reshape(-1, 1), y=None)
yone_test = yone_test.T



In [30]:
def train_multi(X_train, y_train, num_iter=10000, learning_rate=0.5, hidden_layer_num=15):
    
    n = X_train.shape[0]
    m = y_train.shape[1]
    w1 = np.random.randn(hidden_layer_num, n) * 0.01 #
    b1 = np.random.randn(hidden_layer_num, 1)
    w2 = np.random.randn(10, hidden_layer_num) * 0.01 #
    b2 = np.random.randn(10, 1)
    for i in range(num_iter):
        Z1 = np.dot(w1, X_train) + b1
        A1 = sigmoid(Z1) 
        Z2 = np.dot(w2, A1) + b2
        A2 = np.exp(Z2) / np.sum(np.exp(Z2), axis=0) # SOFTMAX
        cost = np.sum(y_train * np.log(A2)) * (-1/m)

        dZ2 = A2 - y_train
        dW2 = np.dot(dZ2, A1.T) / m  #######
        db2 = np.sum(dZ2, axis=1, keepdims=True) * (1 / m)

        dA1 = np.dot(w2.T, dZ2)
        dZ1 = dA1 * sigmoid(Z1) * (1 - sigmoid(Z1))
        dW1 = np.dot(dZ1, X_train.T) / m #######
        db1 = np.sum(dZ1, axis=1, keepdims=True) * (1 / m)

        w2 = w2 - learning_rate * dW2
        b2 = b2 - learning_rate * db2
        w1 = w1 - learning_rate * dW1
        b1 = b1 - learning_rate * db1

        if(i %1000 == 0):
            print(f'At epoch {i}, cost is {cost}')
    return [w1, w2, b1, b2]

In [36]:
def score_multi(X, y, w1, w2, b1, b2):
    # y is one-hot-encoded
    Z1 = np.dot(w1, X) + b1
    A1 = sigmoid(Z1) 
    Z2 = np.dot(w2, A1) + b2
    A2 = np.exp(Z2) / np.sum(np.exp(Z2), axis=0) # SOFTMAX
    #I changed 750 to 250
    Y_predictions = np.argmax(A2, axis=0).reshape(250,1)
    labels = np.argmax(y, axis=0)
    print(classification_report(Y_predictions, labels))    

In [32]:
params = train_multi(X_train, yone_train,num_iter=200000, learning_rate=0.5, hidden_layer_num=18)

At epoch 0, cost is 2.9123970666191936
At epoch 1000, cost is 2.299150528194198
At epoch 2000, cost is 2.2624535858579606
At epoch 3000, cost is 2.067579057677499
At epoch 4000, cost is 1.98066823854925
At epoch 5000, cost is 1.9520329847344613
At epoch 6000, cost is 1.9371226828942951
At epoch 7000, cost is 1.9248022793468809
At epoch 8000, cost is 1.909937134230339
At epoch 9000, cost is 1.8863354662226228
At epoch 10000, cost is 1.849141818319388
At epoch 11000, cost is 1.8087599109770076
At epoch 12000, cost is 1.780790627471398
At epoch 13000, cost is 1.7647781758077847
At epoch 14000, cost is 1.7551395713603832
At epoch 15000, cost is 1.7484316606168613
At epoch 16000, cost is 1.7430732283139017
At epoch 17000, cost is 1.7383712172985475
At epoch 18000, cost is 1.7340089573329223
At epoch 19000, cost is 1.7298879668061395
At epoch 20000, cost is 1.7378044649182587
At epoch 21000, cost is 1.734022911520261
At epoch 22000, cost is 1.7306847731586281
At epoch 23000, cost is 1.727264

At epoch 190000, cost is 1.304122511509458
At epoch 191000, cost is 1.2982116861845436
At epoch 192000, cost is 1.3071358559739785
At epoch 193000, cost is 1.3410476883144031
At epoch 194000, cost is 1.2844764137923648
At epoch 195000, cost is 1.2911986456814366
At epoch 196000, cost is 1.288914030377447
At epoch 197000, cost is 1.2916614704730693
At epoch 198000, cost is 1.3007074619613657
At epoch 199000, cost is 1.330189866664336


In [37]:
score_multi(X_test, yone_test, *params)

              precision    recall  f1-score   support

           0       0.52      0.55      0.54        20
           1       0.88      0.70      0.78        20
           2       0.23      0.58      0.33        12
           3       0.42      0.44      0.43        25
           4       0.14      0.40      0.21        10
           5       0.15      0.67      0.25         6
           6       0.89      0.63      0.74        38
           7       0.81      0.43      0.56        49
           8       0.60      0.19      0.29        63
           9       0.03      0.14      0.05         7

    accuracy                           0.44       250
   macro avg       0.47      0.47      0.42       250
weighted avg       0.62      0.44      0.48       250



In [34]:
score_multi(X_train, yone_train, *params)

              precision    recall  f1-score   support

           0       0.37      0.57      0.45        51
           1       0.95      0.84      0.89        95
           2       0.14      0.56      0.23        18
           3       0.43      0.47      0.45        68
           4       0.35      0.37      0.36        67
           5       0.19      0.70      0.30        20
           6       0.86      0.55      0.67       114
           7       0.85      0.54      0.66       116
           8       0.80      0.36      0.50       177
           9       0.11      0.33      0.17        24

    accuracy                           0.52       750
   macro avg       0.51      0.53      0.47       750
weighted avg       0.68      0.52      0.56       750

