## Import packages

In [27]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, 
    average_precision_score, confusion_matrix, log_loss, cohen_kappa_score
)

## Load data, reading the serial-number column as the index (`index_col=0`) so it is not used as a feature for classification

In [28]:
# Read both splits the same way: the first CSV column becomes the index
# (index_col=0), so it does not appear among the data columns.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('datasets/train.csv', 'datasets/test.csv')
)

## Clean data

In [29]:
# Preview the first rows of the training set to sanity-check the loaded columns.
train_df.head()

Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,fm,Fm_2,EVENT
0,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1677.0,695.0,0
1,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1561.0,696.0,0
2,6.5,0.17,8.35,749.0,211.0,0.011,0.117,1581.0,696.0,0
3,6.5,0.17,8.35,749.0,211.0,0.011,0.118,1579.0,693.0,0
4,6.5,0.17,8.35,749.0,211.0,0.011,0.118,1567.0,689.0,0


### Columns that have only one unique value aren't useful for classification and should be dropped

In [30]:
# List the distinct values of 'Tp'; a column with a single distinct value
# would be a candidate for dropping.
train_df['Tp'].unique()

array([6.5       , 6.6       , 6.7       , ..., 9.00663063, 8.41352575,
       8.22963054], shape=(31635,))

In [31]:
# Preview the first rows of the test set to compare its columns with the training set.
test_df.head()

Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,fm,Fm_2,EVENT
0,10.1,0.17,8.41,762.0,190.0,0.022,0.106,1818.0,920.0,0
1,10.1,0.18,8.41,762.0,190.0,0.022,0.106,1805.0,927.0,0
2,10.1,0.19,8.41,762.0,189.0,0.022,0.108,1787.0,927.0,0
3,10.1,0.18,8.41,762.0,190.0,0.022,0.108,1790.0,936.0,0
4,10.1,0.18,8.41,762.0,189.0,0.022,0.108,1827.0,924.0,0


In [32]:
# List the distinct values of 'EVENT' in the test set.
# NOTE(review): 'EVENT' is presumably the classification target — confirm.
test_df['EVENT'].unique()

array([0, 1])

## Check datatypes

In [36]:
# Show the dtype of every column in the training set.
train_df.dtypes

Tp       float64
Cl       float64
pH       float64
Redox    float64
Leit     float64
Trueb    float64
Cl_2     float64
fm       float64
Fm_2     float64
EVENT      int64
dtype: object

In [37]:
# Show the dtype of every column in the test set, for comparison with the training set.
test_df.dtypes

Tp       float64
Cl       float64
pH       float64
Redox    float64
Leit     float64
Trueb    float64
Cl_2     float64
fm       float64
Fm_2     float64
EVENT      int64
dtype: object

In [None]:
## For