In [1]:
import tensorflow as tf

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import StandardScaler

from tensorflow.keras import regularizers




In [2]:
# Turn on TensorFlow's NumPy-compatibility mode (tf.experimental.numpy) for the
# whole session. NOTE(review): presumably needed by the custom TF loss below,
# which mixes integer and float tensors — confirm before removing.
tf.experimental.numpy.experimental_enable_numpy_behavior()


In [3]:


# Read the competition training table; the row identifier is not a feature.
df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv').drop(columns='Id')

# Separate the target from the features.
# Despite its name, `test_df` holds the 'Class' labels, and `train_df` the features.
test_df = df['Class']
train_df = df.drop(columns='Class')

# Encode the categorical 'EJ' column numerically: 'A' -> 0, 'B' -> 1.
train_df['EJ'] = train_df['EJ'].map({'A': 0, 'B': 1})

# Fill every missing value with the mean of its column.
train_df = train_df.fillna(train_df.mean())

def remove_var(X_train, isVar, per = 0.1):
    """Split off the highest-variance columns of a DataFrame.

    The ``int(n_columns * per)`` columns with the largest variance are
    identified. Depending on ``isVar`` they are either the only columns kept
    or the columns removed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table. It is never modified; a new frame is returned.
    isVar : bool
        If truthy, return only the high-variance columns. Otherwise return
        every column except the high-variance ones.
    per : float, default 0.1
        Fraction of the columns to treat as "high variance". The resulting
        count is truncated towards zero, so a small ``per`` can select none.

    Returns
    -------
    tuple
        ``(frame, high_variance_columns)`` where ``frame`` is the filtered
        DataFrame and ``high_variance_columns`` is the index of selected
        column names (useful for applying the same filter to other data).
    """
    # Variance of each column, then the names of the `per` fraction with the
    # largest values.
    variances = X_train.var()
    top_var = int(len(variances) * per)
    high_variance_columns = variances.nlargest(top_var).index

    if isVar:
        filtered = X_train[high_variance_columns]
    else:
        # Non-inplace drop: the caller's frame must stay intact so the call is
        # safe to repeat and has no hidden side effects.
        filtered = X_train.drop(columns=high_variance_columns)

    return filtered, high_variance_columns
        
# Drop the highest-variance columns from the features (per=0.1 of the column
# count) and keep their names so the same columns can be dropped from the
# competition test data later.
train_df, high_variance_columns = remove_var(train_df, isVar=False, per=0.1)

# Standardize the data: fit the scaler on the features and replace `train_df`
# with the transformed values.
# NOTE(review): the scaler is fit on all rows, before the train/test split made
# further down — confirm this is intended.

scaler = StandardScaler()

train_df = scaler.fit_transform(train_df)


In [4]:
def balanced_log_loss2(y_true, y_pred):
    """Balanced logarithmic loss for two-class probability predictions.

    The mean negative log-likelihood is computed separately for each class and
    the two means are averaged, so both classes contribute equally whatever
    their prevalence::

        loss = -( sum_{y_i=0} log(p_i0) / N0 + sum_{y_i=1} log(p_i1) / N1 ) / 2

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels, 0 or 1 (any array-like; values are cast to int).
    y_pred : array-like of shape (n_samples, 2)
        Predicted probabilities; column 0 is class 0, column 1 is class 1.
        The input is not modified.

    Returns
    -------
    float
        The balanced log loss. If one of the classes is absent from
        ``y_true`` the division by its zero count makes the result NaN.
    """
    # Work on plain, flat NumPy arrays so Series, lists and column vectors all
    # behave the same way.
    y_true = np.asarray(y_true).astype(int).ravel()
    y_pred = np.asarray(y_pred, dtype=float)

    # Number of observations in each class.
    N0 = np.sum(y_true == 0)
    N1 = np.sum(y_true == 1)

    # Clip away exact 0/1 so the logarithm stays finite, then renormalise each
    # row so the two probabilities sum to one again.
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    y_pred = y_pred / y_pred.sum(axis=1, keepdims=True)

    # Mean log-likelihood of the true class, per class. Dividing by the class
    # count IS the inverse-prevalence weighting; it must be applied only once.
    log_loss_0 = np.sum((1 - y_true) * np.log(y_pred[:, 0])) / N0
    log_loss_1 = np.sum(y_true * np.log(y_pred[:, 1])) / N1

    # Average the two per-class losses.
    balanced_log_loss = -(log_loss_0 + log_loss_1) / 2

    return balanced_log_loss

In [5]:
@tf.function
def balanced_log_loss(y_true, y_pred, beta=1):
    """Class-balanced binary log loss as a TensorFlow graph function.

    The mean log-likelihood is computed separately over the positive and the
    negative examples and the two means are averaged, so each class carries
    the same weight regardless of how many examples it has.

    Parameters
    ----------
    y_true : tensor-like
        Binary labels (0 or 1).
    y_pred : tensor-like
        Predicted probability of the positive class.
        NOTE(review): assumed to have the same shape as ``y_true`` — confirm
        against the caller; no caller is visible in this notebook.
    beta : unused
        Accepted only to keep the existing signature; it has no effect.

    Returns
    -------
    tf.Tensor
        Scalar loss. A class with no examples in the batch contributes 0
        rather than NaN.
    """
    # Clip predicted values to prevent log(0) errors.
    epsilon = tf.keras.backend.epsilon()
    y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)

    # Bring the labels to the prediction dtype so the arithmetic below does not
    # depend on implicit int/float promotion.
    y_true = tf.cast(y_true, y_pred.dtype)

    # Number of positive and negative examples in the batch.
    pos_weight = tf.reduce_sum(y_true)
    zero_weight = tf.cast(tf.size(y_true), y_pred.dtype) - pos_weight

    # Per-class mean log-likelihood. Positives are scored with log(p) and
    # negatives with log(1 - p); divide_no_nan yields 0 for an empty class.
    one_vals = tf.math.divide_no_nan(
        tf.reduce_sum(y_true * tf.math.log(y_pred)), pos_weight
    )
    zero_vals = tf.math.divide_no_nan(
        tf.reduce_sum((1 - y_true) * tf.math.log(1 - y_pred)), zero_weight
    )

    # Average the two class terms and negate to obtain a loss to minimise.
    balanced_loss = - (one_vals + zero_vals) / 2
    return balanced_loss

In [6]:
def create_model(n_features=None):
    """Build the (uncompiled) feed-forward classifier.

    Architecture: Dense(n_features, relu) -> Dropout(0.75) -> Dense(20, relu)
    -> Dropout(0.5) -> Dense(10) -> Dense(2). Every Dense layer except the
    last uses L2 kernel regularisation (0.001). The final layer has no
    activation, so the model outputs two raw logits.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. Defaults to the column count of the
        module-level ``train_df``, which is what the function always used.

    Returns
    -------
    tf.keras.Sequential
        The freshly initialised model.
    """
    if n_features is None:
        n_features = train_df.shape[1]

    model = tf.keras.models.Sequential([
        # Declare the input explicitly; passing `input_shape=` to the first
        # Dense layer is discouraged by Keras and emits a warning.
        tf.keras.Input(shape=(n_features,)),
        tf.keras.layers.Dense(n_features, activation='relu',
                              kernel_regularizer=regularizers.l2(0.001)),
        tf.keras.layers.Dropout(0.75),
        tf.keras.layers.Dense(20, activation='relu',
                              kernel_regularizer=regularizers.l2(0.001)),
        tf.keras.layers.Dropout(0.5),
        # NOTE(review): this layer has no activation (linear) — confirm intent.
        tf.keras.layers.Dense(10, kernel_regularizer=regularizers.l2(0.001)),
        tf.keras.layers.Dense(2),
    ])

    return model

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# (`train_df` holds the features and `test_df` the labels.)
X_train, X_test, y_train, y_test = train_test_split(
    train_df, test_df, test_size=0.2, random_state=4
)

# The network outputs raw logits, hence from_logits=True on the loss.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model = create_model()
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100)

# Append a softmax to the trained network so predictions are class probabilities.
probability_model = tf.keras.Sequential([model, tf.keras.layers.Softmax()])

# Score the held-out rows with the balanced log loss.
probes = np.array(probability_model.predict(X_test))
print(balanced_log_loss2(y_test, probes))




Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4666 - loss: 1.0142
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6204 - loss: 0.7921 
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6882 - loss: 0.7921 
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7873 - loss: 0.6818 
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7707 - loss: 0.6547 
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8131 - loss: 0.6163 
Epoch 7/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7806 - loss: 0.6647 
Epoch 8/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8419 - loss: 0.5398 
Epoch 9/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [8]:
# Load the competition test set; the raw frame is kept so 'Id' is available
# for the submission file.
test_df_1 = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

# Apply the same feature preparation as for the training data: drop the
# identifier, encode 'EJ', and remove the columns dropped during training.
test_ds_pd = test_df_1.drop('Id', axis=1)
test_ds_pd['EJ'] = test_ds_pd['EJ'].map({'A': 0, 'B': 1})
test_ds_pd = test_ds_pd.drop(columns=high_variance_columns)

# Impute missing values with the TRAINING column means stored in the fitted
# scaler, not with means of the test set. This keeps the preprocessing
# identical to what the model was trained on and still works when a test
# column is entirely missing (a test-set mean would then be NaN).
train_means = pd.Series(scaler.mean_, index=test_ds_pd.columns)
test_ds_pd = test_ds_pd.fillna(train_means)

# Scale with the statistics learned from the training data.
test_ds_pd = scaler.transform(test_ds_pd)

# Predict class probabilities for every test row.
probes = np.array(probability_model.predict(test_ds_pd))

# Assemble and write the submission file.
df = pd.DataFrame({'Id': test_df_1.Id, 'class_0': probes[:,0], 'class_1': probes[:,1]})

df.to_csv('/kaggle/working/submission.csv', index=False)

df

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.99729,0.00271
1,010ebe33f668,0.99729,0.00271
2,02fa521e1838,0.99729,0.00271
3,040e15f562a2,0.99729,0.00271
4,046e85c7cc7f,0.99729,0.00271
