In [1]:
import keras.layers
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, StandardScaler, MinMaxScaler


In [2]:
TARGET_COLUMN = "Label"


In [3]:
def load_file() -> pd.DataFrame:
    df = pd.read_csv("./dataset.csv", encoding="ISO-8859-1", header=0)
    return df

In [4]:
def remove_comma(df):
    """
    Iterate through all cells in a DataFrame and replace commas with dots.

    Parameters:
    df (pd.DataFrame): The DataFrame to be modified.

    Returns:
    pd.DataFrame: The modified DataFrame with commas replaced by dots.
    """

    # Define a function to replace commas with dots
    def replace_comma(value):
        if isinstance(value, str):
            return value.replace(',', '')
        return value

    # Apply the function to the entire DataFrame
    df = df.map(replace_comma)

    return df

In [33]:
def add_target_column(df: pd.DataFrame) -> pd.DataFrame:
    bins = np.arange(0, 2000, 100)
    print(bins)
    df["All Time Rank"] = df["All Time Rank"].apply(lambda x: float(x.replace(",", ".")))
    df[TARGET_COLUMN] = np.digitize(df["All Time Rank"], bins, right=True)

    return df

In [6]:
def stratified_split_dataframe(df, target_column, train_size=0.5, validation_size=0.15, test_size=0.35,
                               random_state=42):
    """
    Split a DataFrame into stratified training, validation, and test sets.

    Parameters:
    df (pd.DataFrame): The DataFrame to be split.
    target_column (str): The name of the target column for stratification.
    train_size (float): The proportion of the data to include in the train split. Default is 0.7.
    validation_size (float): The proportion of the data to include in the validation split. Default is 0.15.
    test_size (float): The proportion of the data to include in the test split. Default is 0.15.
    random_state (int, optional): Random seed for reproducibility. Default is None.

    Returns:
    tuple: A tuple containing three DataFrames (train, validation, test).
    """
    if train_size + validation_size + test_size != 1.0:
        raise ValueError("train_size, validation_size, and test_size must sum to 1.0")

    # Split the data into train and temp sets
    sss_train_temp = StratifiedShuffleSplit(n_splits=1, test_size=(1.0 - train_size), random_state=random_state)
    train_index, temp_index = next(sss_train_temp.split(df, df[target_column]))
    train_df = df.iloc[train_index]
    temp_df = df.iloc[temp_index]

    # Calculate the proportion of validation and test relative to the temp set
    temp_size = validation_size + test_size
    validation_proportion = validation_size / temp_size

    # Split the temp set into validation and test sets
    sss_validation_test = StratifiedShuffleSplit(n_splits=1, test_size=validation_proportion, random_state=random_state)
    validation_index, test_index = next(sss_validation_test.split(temp_df, temp_df[target_column]))
    validation_df = temp_df.iloc[validation_index]
    test_df = temp_df.iloc[test_index]

    return train_df, validation_df, test_df

In [7]:
ordinal_columns = ['Album Name', 'Artist']  # Replace with your ordinal columns
date_columns = ['Release Date']  # Replace with your date columns
drop_columns = ['ISRC', 'TIDAL Popularity']  # Replace with your columns to drop
drop_na_columns = ["Artist"]

stream_columns = ['Spotify Streams', 'YouTube Views', 'TikTok Views', 'Pandora Streams', 'Soundcloud Streams', ]


def parse_dates(column):
    return pd.to_datetime(column, format='%m/%d/%Y', errors='coerce').astype(np.int64) // 10 ** 9


def transformation_pipeline(df):
    # Convert all columns to numbers if possible
    df = df.apply(pd.to_numeric, errors='ignore')

    # Replace nulls in numerical columns with 0
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col].fillna(0, inplace=True)

    # Convert specified string columns to ordinals
    if ordinal_columns:
        ordinal_encoder = OrdinalEncoder()
        df[ordinal_columns] = ordinal_encoder.fit_transform(df[ordinal_columns])

    # Parse date columns into Unix timestamp
    for col in date_columns:
        df[col] = parse_dates(df.loc[:, col])

    # Drop specified columns
    df.drop(columns=drop_columns, inplace=True)
    df.dropna(subset=drop_na_columns, inplace=True)

    df["Total Streams"] = df[stream_columns].sum(axis=1)

    return df

In [8]:

def split_features_target(df: pd.DataFrame, target_column: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    X = df.drop(columns=[target_column], inplace=False)
    Y = df.loc[:, target_column]

    return X, Y

In [9]:
def remove_id(df):
    return df.drop(columns=['Track'])

In [10]:
from sklearn.manifold import LocallyLinearEmbedding

scaler = StandardScaler()
normalizer = MinMaxScaler()
# exclude_columns = ['Album Name', 'Artist', 'Track']
exclude_columns = ['Track']
pca = PCA(n_components=2)
lle = LocallyLinearEmbedding(n_components=3, n_neighbors=10)


def scale_data(df, train=False):
    columns = [column for column in df.columns if column not in exclude_columns]
    if train:
        df.loc[:, columns] = scaler.fit_transform(df[columns])
        df.loc[:, columns] = normalizer.fit_transform(df[columns])
    #     # df.loc[:, columns] = lle.fit_transform(df[columns])
    else:
        df.loc[:, columns] = scaler.transform(df[columns])
        df.loc[:, columns] = normalizer.transform(df[columns])
    #     # df.loc[:, columns] = lle.transform(df[columns])

In [34]:
df = load_file()
df = df[:2000]
df = remove_comma(df)
df = add_target_column(df)
df = transformation_pipeline(df)

train, validation, test = stratified_split_dataframe(df, target_column=TARGET_COLUMN)
X_train, Y_train = split_features_target(train, TARGET_COLUMN)
X_valid, Y_valid = split_features_target(validation, TARGET_COLUMN)
X_test, Y_test = split_features_target(test, TARGET_COLUMN)

scale_data(X_train, train=True)
scale_data(X_valid)
scale_data(X_test)


[   0  100  200  300  400  500  600  700  800  900 1000 1100 1200 1300
 1400 1500 1600 1700 1800 1900]
99     1
100    2
101    2
102    2
103    2
      ..
195    2
196    2
197    2
198    2
199    2
Name: Label, Length: 101, dtype: int64
[ 0 50 50 50 50 51 51 50 50 50 50 50 51 50 49 50 49 50 51 50 46]


In [35]:
def print_nan_locations(df):
    """
    Print the columns and rows that have NaN values in a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to be checked for NaN values.
    """
    nan_locations = df.isna()

    # Iterate over the DataFrame to find NaN values
    for column in nan_locations.columns:
        nan_rows = nan_locations[column]
        if nan_rows.any():
            print(f"Column '{column}' has NaN values in rows: {list(nan_rows[nan_rows].index)}")


In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

logModel = LogisticRegression(max_iter=2000)

param_dist = {
    'C': np.logspace(-4, 4, 50)  # C values from 10^-4 to 10^4
}

log_model_tuned = RandomizedSearchCV(logModel, param_distributions=param_dist, n_iter=30, cv=10, random_state=42,
                                     n_jobs=-1)
log_model_tuned.fit(remove_id(X_train), Y_train)
Y_pred = log_model_tuned.predict(remove_id(X_test))

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
f1 = f1_score(Y_test, Y_pred, average='weighted')
conf_matrix = confusion_matrix(Y_test, Y_pred)

# Get the best parameters and model
best_params = log_model_tuned.best_params_
print("Best parameters:", best_params)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'C': 6866.488450042998}
Accuracy: 0.7033333333333334
Precision: 0.7125205595058536
Recall: 0.7033333333333334
F1 Score: 0.7009072033361522
Confusion Matrix:
[[15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 12  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3 10  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  2 10  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3  6  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 12  3  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  6  9  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1 11  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  2 11  2  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1 10  4  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  5  7  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  3  8  4  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  2 10  3  0  0  0  0  0  0]
 [ 0  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

svm = SVC(probability=True)

param_dist = {
    'C': np.logspace(-4, 4, 10),  # C values from 10^-4 to 10^2
    'gamma': np.logspace(-6, 6, 10),
    'kernel': ['rbf'],
    'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'cache_size': [1000],
    'coef0': np.logspace(-4, 4, 50)
}

svm_tuned = RandomizedSearchCV(svm, param_distributions=param_dist, n_iter=30, cv=10, random_state=42, n_jobs=-1)
svm_tuned.fit(remove_id(X_train), Y_train)
Y_pred = svm_tuned.predict(remove_id(X_test))

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
f1 = f1_score(Y_test, Y_pred, average='weighted')
conf_matrix = confusion_matrix(Y_test, Y_pred)

# Get the best parameters and model
best_params = svm_tuned.best_params_
print("Best parameters:", best_params)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best parameters: {'kernel': 'rbf', 'gamma': 0.21544346900318823, 'degree': 4, 'coef0': 0.013257113655901081, 'cache_size': 1000, 'C': 1291.5496650148827}
Accuracy: 0.6033333333333334
Precision: 0.6098430817548465
Recall: 0.6033333333333334
F1 Score: 0.602319312289696
Confusion Matrix:
[[12  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4  8  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  4  8  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  3  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  3  6  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  3  9  3  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  6  6  4  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  5  9  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  1  9  3  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  3 10  2  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  3  9  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

decision_tree = DecisionTreeClassifier(max_depth=10, )

param_dist = {
    'min_samples_split': [1, 10, 25, 50, 100],
    'min_samples_leaf': [1, 10, 25, 50, 100]
}

decision_tree_tuned = RandomizedSearchCV(decision_tree, param_distributions=param_dist, n_iter=10, cv=5,
                                         random_state=42, n_jobs=-1)
decision_tree_tuned.fit(remove_id(X_train), Y_train)
Y_pred = decision_tree_tuned.predict(remove_id(X_test))

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
f1 = f1_score(Y_test, Y_pred, average='weighted')
conf_matrix = confusion_matrix(Y_test, Y_pred)

# Get the best parameters and model
best_params = decision_tree_tuned.best_params_
print("Best parameters:", best_params)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best parameters: {'min_samples_split': 10, 'min_samples_leaf': 25}
Accuracy: 0.98
Precision: 0.9816176470588236
Recall: 0.98
F1 Score: 0.9799407996064002
Confusion Matrix:
[[15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 15  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 14  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0]
 [ 0  0  

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.11/site

In [46]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[('lr', log_model_tuned), ('rf', decision_tree_tuned), ('svc', svm_tuned)],
    voting='soft')
voting_clf.fit(remove_id(X_train), Y_train)

Y_pred = voting_clf.predict(remove_id(X_test))

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
f1 = f1_score(Y_test, Y_pred, average='weighted')
conf_matrix = confusion_matrix(Y_test, Y_pred)

# Get the best parameters and model
best_params = decision_tree_tuned.best_params_
print("Best parameters:", best_params)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'min_samples_split': 10, 'min_samples_leaf': 25}
Accuracy: 0.96
Precision: 0.9628063725490197
Recall: 0.96
F1 Score: 0.9595632593727699
Confusion Matrix:
[[15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3 11  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 14  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 16  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 14  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 14  1  0  0  0  0  0  0]
 [ 0  0  

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

random_forest = RandomForestClassifier(max_leaf_nodes=11, n_jobs=-1)

param_dist = {
    'min_samples_split': [10, 25, 50, 100, 250],
    'min_samples_leaf': [10, 25, 50, 100, 250],
    'n_estimators': [5, 10, 25, 50, 100, 250, 500],
    'max_depth': np.linspace(5, 10, 1, dtype=np.int64),
}

random_forest_tuned = RandomizedSearchCV(random_forest, param_distributions=param_dist, n_iter=20, cv=5,
                                         random_state=42, n_jobs=-1)
random_forest_tuned.fit(remove_id(X_train), Y_train)
Y_pred = random_forest_tuned.predict(remove_id(X_test))

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
f1 = f1_score(Y_test, Y_pred, average='weighted')
conf_matrix = confusion_matrix(Y_test, Y_pred)

# Get the best parameters and model
best_params = random_forest_tuned.best_params_
print("Best parameters:", best_params)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best parameters: {'n_estimators': 500, 'min_samples_split': 50, 'min_samples_leaf': 25, 'max_depth': 5}
Accuracy: 0.9276190476190476
Precision: 0.9307794463550767
Recall: 0.9276190476190476
F1 Score: 0.9241025702452331
Confusion Matrix:
[[ 14   0   1   0   0   0   0   0   0   0]
 [  5   6   1   0   3   0   0   0   0   0]
 [  1   1   6   6   0   0   0   0   0   1]
 [  2   0   2   9   1   0   1   0   0   0]
 [  4   0   1   0   7   4   0   0   0   0]
 [  1   0   0   0   0  13   1   0   0   0]
 [  0   0   0   0   0   0  14   1   0   0]
 [  0   0   0   0   0   0   0  14   1   0]
 [  0   0   0   0   0   0   0   0  15   0]
 [  0   0   0   0   0   0   0   0   0 389]]


In [24]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X2D = pca.fit_transform(remove_id(X_train))
print(pca.explained_variance_ratio_)


[0.79733191 0.2026675 ]
['pca0' 'pca1']


In [86]:
from sklearn.metrics import f1_score, recall_score, confusion_matrix, accuracy_score, precision_score

callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)
num_features = X_train.shape[1] - 1
input_layer = keras.layers.Input(shape=[num_features])
hidden1 = keras.layers.Dense(300, activation=tf.keras.activations.elu,
                             kernel_initializer=tf.keras.initializers.HeNormal(42))(input_layer)
dropout1 = keras.layers.Dropout(0.4)(hidden1)
hidden2 = keras.layers.Dense(300, activation=tf.keras.activations.elu,
                             kernel_initializer=tf.keras.initializers.HeNormal(42))(hidden1)
hidden3 = keras.layers.Dense(300, activation=tf.keras.activations.elu,
                             kernel_initializer=tf.keras.initializers.HeNormal(42))(hidden2)
output = keras.layers.Dense(21, activation="softmax")(hidden3)
model = keras.Model(inputs=[input_layer], outputs=[output])
model.compile(loss=tf.losses.sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

history = model.fit((remove_id(X_train)),
                    Y_train, epochs=200, validation_data=(remove_id(X_valid), Y_valid),
                    callbacks=[callback])
test_error = model.evaluate(remove_id(X_test), Y_test)
print(test_error)

Y_pred = model.predict(remove_id(X_test))
Y_pred_classes = np.argmax(Y_pred, axis=1)

accuracy = accuracy_score(Y_test, Y_pred_classes)
precision = precision_score(Y_test, Y_pred_classes, average='weighted')
recall = recall_score(Y_test, Y_pred_classes, average='weighted')
f1 = f1_score(Y_test, Y_pred_classes, average='weighted')
conf_matrix = confusion_matrix(Y_test, Y_pred_classes)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(conf_matrix)




Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
0.5748510360717773
Accuracy: 0.7366666666666667
Precision: 0.777059861363267
Recall: 0.7366666666666667
F1-score: 0.728515240202005
[[13  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  9  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 14  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1 10  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 12  3  0  0 