# Assignment 1
## Problem 1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

### 1a

In [None]:
df = pd.read_csv("SpotifyFeatures.csv")

# DataFrame.shape is (number of rows, number of columns):
# every row counts as a sample, every column as a feature
num_samples, num_features = df.shape
print(f"Number of samples =  {num_samples}")
print(f"Number of features = {num_features}")

### 1b

In [None]:
# Keep only the Pop and Classical tracks. Copying the filtered rows (rather
# than copying the whole frame and filtering afterwards) gives an independent
# frame, so adding a column below does not raise SettingWithCopyWarning.
df_sub = df[df["genre"].isin(["Pop", "Classical"])].copy()

# binary target: 1 for Pop, 0 for Classical
df_sub["isPop"] = (df_sub["genre"] == "Pop").astype(int)

num_songs_Pop = (df_sub["isPop"] == 1).sum()
num_songs_Classical = (df_sub["isPop"] == 0).sum()

print("Number of songs: Pop =  " + str(num_songs_Pop))
print("Number of songs: Classical = " + str(num_songs_Classical))

# subset: identifying columns, the two features 'liveness' and 'loudness', and the label
df_sub = df_sub.loc[:, ['genre', 'artist_name', 'track_name', 'track_id', 'liveness', 'loudness', 'isPop']]

### 1c

In [None]:
# feature matrix (liveness, loudness) and label vector (isPop) as numpy arrays
df_sub_1 = df_sub.loc[:, ['liveness', 'loudness']].to_numpy()
labels = df_sub['isPop'].to_numpy()

# fraction of each class that goes into the training set
PERCENTAGE_TRAINING = 0.8

maxPop = int(PERCENTAGE_TRAINING * num_songs_Pop)
maxClassical = int(PERCENTAGE_TRAINING * num_songs_Classical)

# 1-based running position of every row within its own class
is_pop = labels == 1
position_in_class = np.where(is_pop, np.cumsum(is_pop), np.cumsum(~is_pop))

# The first maxPop / maxClassical rows of each class are training rows, the
# remaining rows are test rows. Because the position is 1-based, "<=" selects
# exactly max* rows per class (and not one extra row).
is_train = position_in_class <= np.where(is_pop, maxPop, maxClassical)

# labels with a second column flagging training (1) and test (0) rows
df_sub_2 = np.column_stack((labels, is_train.astype(labels.dtype)))

# split features and labels with the same mask so that rows stay aligned
train_sub_1, test_sub_1 = df_sub_1[is_train], df_sub_1[~is_train]
train_sub_2, test_sub_2 = labels[is_train], labels[~is_train]

### 1d

In [None]:
def plotFig():
    """Scatter liveness against loudness per class, save the figure, return pyplot.

    Reads the module-level ``df_sub`` frame. Rows with ``isPop == 0`` are drawn
    in blue and get the legend entry 'Classical'; rows with ``isPop == 1`` are
    drawn in red and get the legend entry 'Pop'. The figure is written to
    'plots/1d.png' and the ``plt`` module is returned.
    """
    # drawing order also fixes the legend order: class 0 first, then class 1
    groups = ((0, 'blue'), (1, 'red'))

    plt.figure(figsize=(10, 8))
    for label, color in groups:
        in_group = df_sub['isPop'] == label
        points = df_sub[in_group]
        plt.scatter(points['liveness'], points['loudness'], label=label, color=color, s=10)

    # swap the numeric legend entries for readable genre names
    legend = plt.legend()
    for position, caption in enumerate(('Classical', 'Pop')):
        legend.get_texts()[position].set_text(caption)

    plt.title('Liveness vs. Loudness')
    plt.xlabel('Liveness')
    plt.ylabel('Loudness')

    # write the figure to disk
    plt.savefig('plots/1d.png')
    return plt


plotFig()

## Problem 2

### 2a

In [None]:
def shuffle_DFs(A, B):
    """Return ``A`` and ``B`` reordered by one shared random permutation.

    Both arrays are indexed with the same permutation of ``range(len(A))``, so
    an entry of the first result and the entry at the same position of the
    second result still belong together (features and classification stay on
    the same row).

    Approach taken from https://stackoverflow.com/a/4602224
    """
    order = np.random.permutation(len(A))
    shuffled_A = A[order]
    shuffled_B = B[order]
    return shuffled_A, shuffled_B

In [None]:
# Settings for the logistic-regression training below
NUM_FEATURES = 2  # number of input features per sample (liveness and loudness)
NUM_EPOCHS = 1000  # number of passes over the training samples
LEARNING_RATE = 0.01  # step size of each gradient update

# logistic function
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e^(-x))`` of a scalar ``x``.

    The two branches are algebraically identical. Each of them only ever
    exponentiates a non-positive number, so ``math.exp`` cannot raise
    ``OverflowError`` for inputs of large magnitude (the naive formula fails
    for strongly negative ``x``).
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

# predict class based on intercept and slope coefficients
def predict(sample, intercept, slope):
    """Classify one sample as 1 or 0 with a linear logistic model.

    Args:
        sample: feature values of one sample.
        intercept: bias term of the model.
        slope: one coefficient per feature, indexed like ``sample``.

    Returns:
        1 if the modelled probability is at least 0.5, otherwise 0.
    """
    score = intercept
    for i, value in enumerate(sample):
        score += slope[i] * value
    # The logistic function reaches the 0.5 threshold exactly at a score of 0,
    # so the sign of the linear score decides the class. Skipping the
    # exponential also avoids an OverflowError for very negative scores.
    return 1 if score >= 0 else 0

# calculate error value
def error(df, df_features, intercept, weights):
    """Return the sum of squared differences between targets and predictions.

    ``df`` holds the target values and ``df_features`` the feature rows at the
    same positions; every row is classified with ``predict`` using
    ``intercept`` and ``weights``.
    """
    squared_residuals = (
        math.pow(df[row] - predict(df_features[row], intercept, weights), 2)
        for row in range(len(df))
    )
    return sum(squared_residuals)

# use predict() on every row of a 2-D array
def predictDF(trainDF, trainDF_result, intercept, slope):
    """Return the list of ``predict`` results for all rows of ``trainDF``.

    ``trainDF`` is indexed as ``trainDF[row, :]``. ``trainDF_result`` is
    accepted but not read by this function.
    """
    return [
        predict(trainDF[row, :], intercept, slope)
        for row in range(len(trainDF))
    ]
    

def fit(trainDF, trainDF_result):
    """Fit logistic-regression coefficients by stochastic gradient descent.

    Makes ``NUM_EPOCHS`` passes over the samples in the order given and
    updates the coefficients after every single sample with step size
    ``LEARNING_RATE``.

    Args:
        trainDF: 2-D numpy array, one row per sample, one column per feature.
        trainDF_result: the 0/1 targets belonging to the rows of ``trainDF``.

    Returns:
        Tuple ``(weights, intercept, errorList)``; ``errorList`` holds the
        value of ``error`` on the passed-in data after each epoch.
    """
    errorList = []
    # one weight per feature column, taken from the data itself
    num_features = trainDF.shape[1]
    # initialize weights to small random numbers
    weights = np.random.rand(num_features) * 0.01
    intercept = 0

    for _ in range(NUM_EPOCHS):
        for features, target in zip(trainDF, trainDF_result):
            # probability predicted for this sample by the current model
            y_predicted = sigmoid(np.dot(features, weights) + intercept)
            # per-sample gradient: (prediction - target) times the input;
            # the input belonging to the intercept is the constant 1
            interceptDelta = y_predicted - target
            gradient = interceptDelta * features
            weights -= LEARNING_RATE * gradient
            intercept -= LEARNING_RATE * interceptDelta

        # store the error on the data that was passed in (not on module-level
        # arrays) so the curve always describes the set being fitted
        errorList.append(error(trainDF_result, trainDF, intercept, weights))

    return weights, intercept, errorList

# generate shuffled training data (features and labels permuted together)
train_sub_1_shuffle, train_sub_2_shuffle = shuffle_DFs(A=train_sub_1, B=train_sub_2)

# get coefficients and the per-epoch errors
weights, intercept, errorList = fit(
    trainDF=train_sub_1_shuffle,
    trainDF_result=train_sub_2_shuffle,
)

# get predicted yhats for the training rows
predicts_train = predictDF(
    trainDF=train_sub_1_shuffle,
    trainDF_result=train_sub_2_shuffle,
    intercept=intercept,
    slope=weights,
)

In [None]:
# Plot the training error per epoch
plt.figure(figsize=(10, 6))
# one x position per recorded error value, counting epochs from 1
epochs = np.arange(1, len(errorList) + 1)
plt.plot(epochs, errorList, label='error', color='blue')

# Add title and labels
plt.title('Error as a function of epoch with learning rate = '+ str(LEARNING_RATE))
plt.xlabel('Epoch')
plt.ylabel('Error')

# Show grid lines
plt.grid(True)

# Save the plot to a file
plt.savefig('plots/2a_01.png')


In [None]:
# calculate accuracy based on ys and yhats
def accuracy(df, predictDF):
    """Return the share of positions where label and prediction agree.

    A position counts as correct when both ``df`` and ``predictDF`` hold 1 or
    both hold 0 there. True positives and true negatives are not told apart,
    because the accuracy value does not depend on that distinction.
    """
    total = len(df)
    hits = sum(
        1
        for row in range(total)
        if (df[row] == 1 and predictDF[row] == 1)
        or (df[row] == 0 and predictDF[row] == 0)
    )
    return hits / total

print(f"Training accuracy: {accuracy(train_sub_2_shuffle, predicts_train)}")

### 2b

In [None]:
# shuffle test features and labels with the same permutation to keep rows consistent
test_sub_1_shuffle, test_sub_2_shuffle = shuffle_DFs(test_sub_1, test_sub_2)

# predict values for test data
predicts_test = predictDF(test_sub_1_shuffle, test_sub_2_shuffle, intercept, weights)

# label the value so it is not mistaken for the training accuracy printed earlier
print("Test accuracy: " + str(accuracy(test_sub_2_shuffle, predicts_test)))

### 2c

In [None]:
plotFig()

# Points on the decision boundary satisfy
#   intercept + weights[0] * liveness + weights[1] * loudness = 0
# Solving for loudness divides by weights[1], so this assumes that weight is non-zero.
x_values = np.linspace(test_sub_1_shuffle[:, 0].min(), test_sub_1_shuffle[:, 0].max(), 100)
y_values = (-intercept - weights[0] * x_values) / weights[1]

# Plot the line separating the two classes
plt.plot(x_values, y_values, color='black', linestyle='--', label='Decision Boundary')

# plotFig() built its legend before this line existed, so rebuild the legend
# to include the boundary while keeping the genre names for the class labels.
handles, labels = plt.gca().get_legend_handles_labels()
genre_names = {'0': 'Classical', '1': 'Pop'}
plt.legend(handles, [genre_names.get(label, label) for label in labels])

# Save the plot to a file
plt.savefig('plots/2c.png')

## Problem 3

### 3a

In [None]:
# generate confusion matrix; here the kind of (mis-)classification matters
def confusion_matrix(df, predictDF):
    """Count prediction outcomes and return them as ``[[TP, FN], [FP, TN]]``.

    ``df`` holds the true labels and ``predictDF`` the predictions at the same
    positions. A pair (1, 1) is a true positive, (0, 0) a true negative and
    (1, 0) a false negative; every other pair is counted as a false positive.
    """
    true_pos = false_neg = false_pos = true_neg = 0
    for row in range(len(df)):
        actual, guess = df[row], predictDF[row]
        if actual == 1:
            if guess == 1:
                true_pos += 1
            elif guess == 0:
                false_neg += 1
            else:
                false_pos += 1
        elif actual == 0 and guess == 0:
            true_neg += 1
        else:
            false_pos += 1
    first_row = [true_pos, false_neg]
    second_row = [false_pos, true_neg]
    return [first_row, second_row]

# confusion matrix of the test-set predictions, laid out as [[TP, FN], [FP, TN]]
conf_matr = confusion_matrix(test_sub_2_shuffle, predicts_test)


### 3b

### 3c

In [None]:
# put test features, true labels and predicted labels side by side
suggest_columns = ["liveness", "loudness", "isPop", "isPop_predicted"]
df_suggest = pd.DataFrame(
    np.column_stack((test_sub_1_shuffle, test_sub_2_shuffle, predicts_test)),
    columns=suggest_columns,
)

# keep the rows where Pop was predicted although the sample is not Pop
condition1 = df_suggest['isPop'].eq(0)
condition2 = df_suggest['isPop_predicted'].eq(1)
df_suggest_sub = df_suggest.loc[condition1 & condition2]

# attach the track information again by matching on the feature values
# NOTE(review): every df_sub row sharing the same liveness/loudness/isPop
# values is matched, so one sample can yield several tracks - confirm this is intended
merged_df = df_suggest_sub.merge(
    df_sub,
    on=('liveness', 'loudness', 'isPop'),
    how='inner',
)


In [None]:
# plot data with a third class: Classical songs misclassified as Pop

# mark the tracks found in merged_df with the extra class value 2
df_plot = df_sub.copy()
is_misclassified = df_plot['track_id'].isin(merged_df['track_id'])
df_plot.loc[is_misclassified, 'isPop'] = 2

class_colors = {2: 'black', 1: 'red', 0: 'blue'}

plt.figure(figsize=(10, 8))
# ascending class order (0, 1, 2) also fixes the order of the legend entries
for label in sorted(class_colors):
    color = class_colors[label]
    subset = df_plot[df_plot['isPop'] == label]
    plt.scatter(subset['liveness'], subset['loudness'], label=label, color=color, s=10, alpha = 0.8)

# swap the numeric legend entries for readable names
legend = plt.legend()
captions = ('Classical', 'Pop', 'Classical songs misclassified for Pop')
for position, caption in enumerate(captions):
    legend.get_texts()[position].set_text(caption)

plt.title('Liveness vs. Loudness')
plt.xlabel('Liveness')
plt.ylabel('Loudness')

# write the figure to disk
plt.savefig('plots/3c.png')