In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-dec-2021/train.csv
/kaggle/input/tabular-playground-series-dec-2021/test.csv


In [2]:
from platform import python_version
# Show the version of the Python interpreter running this notebook.
python_version()

'3.7.12'

In [3]:
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold

In [4]:
# Both competition CSVs are read from the same input folder.
DATA_DIR = "/kaggle/input/tabular-playground-series-dec-2021"
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [5]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,0,3189,40,8,30,13,3270,206,234,193,...,0,0,0,0,0,0,0,0,0,1
1,1,3026,182,5,280,29,3270,233,240,106,...,0,0,0,0,0,0,0,0,0,2
2,2,3106,13,7,351,37,2914,208,234,137,...,0,0,0,0,0,0,0,0,0,1
3,3,3022,276,13,192,16,3034,207,238,156,...,0,0,0,0,0,0,0,0,0,2
4,4,2906,186,13,266,22,2916,231,231,154,...,0,0,0,0,0,0,0,0,0,2


In [6]:
# Summary statistics, transposed so that each column becomes one row.
df_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,4000000.0,2000000.0,1154701.0,0.0,999999.75,1999999.5,2999999.25,3999999.0
Elevation,4000000.0,2980.192,289.0482,1773.0,2760.0,2966.0,3217.0,4383.0
Aspect,4000000.0,151.5857,109.9611,-33.0,60.0,123.0,247.0,407.0
Slope,4000000.0,15.09754,8.546731,-3.0,9.0,14.0,20.0,64.0
Horizontal_Distance_To_Hydrology,4000000.0,271.3154,226.5497,-92.0,110.0,213.0,361.0,1602.0
Vertical_Distance_To_Hydrology,4000000.0,51.66262,68.21597,-317.0,4.0,31.0,78.0,647.0
Horizontal_Distance_To_Roadways,4000000.0,1766.642,1315.61,-287.0,822.0,1436.0,2365.0,7666.0
Hillshade_9am,4000000.0,211.8375,30.75996,-4.0,198.0,218.0,233.0,301.0
Hillshade_Noon,4000000.0,221.0614,22.23134,49.0,210.0,224.0,237.0,279.0
Hillshade_3pm,4000000.0,140.8109,43.69864,-53.0,115.0,142.0,169.0,272.0


In [7]:
# Count the missing values in each column.
df_train.isnull().sum()

Id                                    0
Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0


In [8]:
# Distinct target labels, in ascending order.
np.sort(df_train['Cover_Type'].unique())

array([1, 2, 3, 4, 5, 6, 7])

In [9]:
# Every column except the row identifier and the target is a model input.
NON_FEATURE_COLUMNS = ['Id', 'Cover_Type']
feature_names = df_train.columns.drop(NON_FEATURE_COLUMNS)

In [10]:
# Target: the raw Cover_Type labels.  Inputs: the feature columns cast to float.
y = df_train.Cover_Type
X = df_train.loc[:, feature_names].astype(float)

In [11]:
# One-hot encode the target.  The labels run from 1 to 7 and are passed in
# unshifted, so the encoding has 8 columns (column 0 is never set); this is why
# the network ends in 8 softmax units and why an argmax over its output yields
# the Cover_Type label directly.
dummy_y = tf.keras.utils.to_categorical(y)

In [12]:
# Keep the best weights seen so far on disk (weights only, not the full model).
# The fit() calls visible in this notebook pass no validation data, so a
# 'val_accuracy' entry never appears in the epoch logs and, with
# save_best_only=True, the callback would skip every save.  Monitor the
# training 'accuracy' metric that the model is compiled with instead.
checkpoint_filepath = '/kaggle/working/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='accuracy',
    mode='max',
    save_best_only=True)

In [13]:
def baseline_model(load_checkpoint=False):
    """Build and compile the baseline dense softmax classifier.

    The network is Dense(64) -> Dense(32) -> Dense(16), all ReLU, followed by
    a softmax layer with one unit per column of ``dummy_y``.  It is compiled
    with the Adam optimizer and categorical cross-entropy, tracking accuracy.

    Args:
        load_checkpoint: If True, restore weights from ``checkpoint_filepath``
            after compiling.  Defaults to False so that every call starts from
            freshly initialised weights.  This matters because the function is
            used as the ``build_fn`` for cross-validation: restoring weights
            there would warm-start each fold from a model trained on other
            folds' data, and it would fail outright when no checkpoint has
            been written yet.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    n_features = len(feature_names)
    n_outputs = dummy_y.shape[1]

    model = K.models.Sequential([
        K.layers.Dense(64, input_dim=n_features, activation='relu'),
        K.layers.Dense(32, activation='relu'),
        K.layers.Dense(16, activation='relu'),
        K.layers.Dense(n_outputs, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Restoring saved weights is opt-in; see the docstring for why.
    if load_checkpoint:
        model.load_weights(checkpoint_filepath)
    return model

In [14]:
# Wrap the model builder in a scikit-learn style estimator; the options below
# are forwarded by the wrapper when it trains the network.
training_options = {
    'epochs': 5,
    'batch_size': 1024,
    'verbose': 1,
    'callbacks': [model_checkpoint_callback],
}
estimator = KerasClassifier(build_fn=baseline_model, **training_options)

In [15]:
# Five shuffled folds.  The shuffle is seeded so that the same rows land in the
# same folds on every run, which keeps the cross-validation scores comparable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [16]:
# Score the wrapped network on each fold; `results` holds one score per fold.
results = cross_val_score(estimator=estimator, X=X, y=dummy_y, cv=kfold)


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Instantiate the network with the same builder that was used for cross-validation.
model = baseline_model()

In [24]:
# Restore weights from the path the ModelCheckpoint callback is configured to write to.
model.load_weights(checkpoint_filepath)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4c9af43510>

In [26]:
# Train on the complete training set (no validation data is passed here).
model.fit(
    x=X,
    y=dummy_y,
    epochs=10,
    batch_size=1024,
    callbacks=[model_checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4c9ae09590>

In [27]:
# Print the layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_20 (Dense)             (None, 64)                3520      
_________________________________________________________________
dense_21 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_22 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_23 (Dense)             (None, 8)                 136       
Total params: 6,264
Trainable params: 6,264
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Write the trained model to the working directory.
saved_model_path = '/kaggle/working/baseline_model'
model.save(saved_model_path)

2021-12-28 19:17:27.835058: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


In [33]:
# Score the test set with the same float cast that was applied to the training
# features.  A large batch keeps inference over the whole file to far fewer
# steps than predict()'s small default batch size would need.
X_test = df_test[feature_names].astype('float')
prediction = model.predict(X_test, batch_size=1024)

In [34]:
# Take the most probable output column for each row.  The one-hot encoding was
# built from the unshifted 1-based labels, so the column index is itself the
# Cover_Type label and needs no offset.
cover_types_predicted = prediction.argmax(axis=1)
cover_types_predicted

array([2, 2, 2, ..., 2, 1, 3])

In [36]:
# Submission frame: the test Ids alongside the predicted labels.
df_submission = df_test[['Id']].assign(Cover_Type=cover_types_predicted)

In [38]:
# Preview the first five submission rows.
df_submission.head(n=5)

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2


In [39]:
# Write the submission file without the DataFrame index column.
submission_path = 'submission.csv'
df_submission.to_csv(submission_path, index=False)