## Import Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training table into a DataFrame.
# Earlier input files, kept for reference:
# df = pd.read_csv("/home/danielbudi/Collage/comp-gammafest/comp-model/comp-dataset/mapped-v2_imputed_mf5iter.csv")
# df = pd.read_csv("/home/danielbudi/Collage/comp-gammafest/comp-model/comp-dataset/encode data/train.csv")
TRAIN_CSV_PATH = "imputed_mf10iter_drop_outliers.csv"
df = pd.read_csv(TRAIN_CSV_PATH)
df.shape # (rows, columns)

(32629, 39)

In [3]:
# Independent copy of the loaded frame; the "sampling" variant is split into
# features/labels separately from df further down.
df_sampling = df.copy(deep=True)
# Name of the label column that the model predicts.
TARGET_COLUMN = 'DC201'

In [4]:
# Column bookkeeping for the preprocessing steps.
FEATURE_LIST = list(df.columns)
# Columns treated as continuous values; every other non-target column is
# handled as categorical.
NUMERICAL_DATA = ['DC216', 'DC220', 'DC142a']
CATEGORICAL_DATA = [column for column in df.columns if column != TARGET_COLUMN and column not in NUMERICAL_DATA]

# Positional indices of the categorical columns, measured on the feature
# matrix (i.e. with the target column removed). The SMOTENC call in the SMOTE
# section receives feature_sampling, which no longer contains the target, so
# positions taken from the full frame would be shifted by one for every
# column located after the target.
_feature_position = {
    column: position
    for position, column in enumerate(c for c in df.columns if c != TARGET_COLUMN)
}
COLUMN_CATEGORICAL_INDEX = [_feature_position[column] for column in CATEGORICAL_DATA]

## Pre-Processing Data

### Split Feature and Labels

In [5]:
# Split each frame into a feature matrix and a label vector.
# df / df_sampling are deliberately NOT rebound here: the previous
# `df = df.drop(...)` form removed the target from the source frame, so running
# this cell a second time failed with KeyError on df[TARGET_COLUMN].
label_df = df[TARGET_COLUMN]
feature_df = df.drop(columns=[TARGET_COLUMN])

label_sampling = df_sampling[TARGET_COLUMN]
feature_sampling = df_sampling.drop(columns=[TARGET_COLUMN])

### SMOTE

In [6]:
# from collections import Counter
# from imblearn.over_sampling import SMOTENC

# counter = Counter(label_sampling)
# print(counter)

# oversample = SMOTENC(sampling_strategy=0.2,
#                      categorical_features=COLUMN_CATEGORICAL_INDEX,
#                      random_state=42)
# feature_sampling, label_sampling = oversample.fit_resample(feature_sampling, label_sampling)

# counter = Counter(label_sampling)
# print(counter)

### Feature Importance

#### Non-SMOTE

In [7]:
# import matplotlib.pyplot as plt
# from xgboost import XGBClassifier
# from xgboost import plot_importance

# # fit model on training data
# model = XGBClassifier()
# model.fit(feature_df, label_df)
# # feature importance
# print(model.feature_importances_)
# # plot
# fig, ax = plt.subplots(figsize=(10,10))
# plot_importance(model, ax=ax)
# plt.show()

In [8]:
# np.sort(feature_df['DC205'].unique())

In [9]:
# feature_df.nunique()

In [10]:
# feature_df_importance = feature_df[['DC024', 'DC142a', 'DC220', 'DC214', 'DC213', 'DC216',
#                                     'DC205', 'DC235', 'DC270a', 'DC252', 'DC215', 'DC226', 'DC217']]
# feature_df_importance

#### SMOTE

In [11]:
# # fit model on training data
# model = XGBClassifier()
# model.fit(feature_sampling, label_sampling)
# # feature importance
# print(model.feature_importances_)
# # plot
# fig, ax = plt.subplots(figsize=(10,10))
# plot_importance(model, ax=ax)
# plt.show()

In [12]:
# np.sort(feature_sampling['DC205'].unique())

In [13]:
# feature_sampling.nunique()

In [14]:
# feature_sampling_importance = feature_sampling[['DC024', 'DC142a', 'DC220', 'DC214', 'DC213', 'DC216', 'DC109',
#                                                 'DC205', 'DC235', 'DC270a', 'DC252', 'DC215', 'DC226', 'DC230a']]
# feature_sampling_importance

In [15]:
# Read the competition test set, keep the row identifiers aside for the
# submission file, and leave only the remaining columns in test_df.
_test_raw = pd.read_csv('../datasets/test.csv')
test_id = _test_raw['id']
test_df = _test_raw.drop(columns=['id'])

### Normalization

In [16]:
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
scaler = StandardScaler()

# NOTE(review): the scaler and the encoder are fitted on all of
# feature_sampling, i.e. before the train/validation split done in the
# Modelling section, so validation rows influence the fitted statistics.
# Confirm whether that is acceptable for the reported validation scores.

# train set: numeric columns as float, everything else as pandas categoricals
numerical_df_train = feature_sampling[NUMERICAL_DATA].astype(float)
categorical_df_train = feature_sampling[CATEGORICAL_DATA].astype('category')

# normalization (fit on the training rows only).
# The scaler returns a bare ndarray; rebuild the frame with the ORIGINAL row
# index so the axis=1 concat below pairs each numeric row with its own
# categorical row instead of aligning on a fresh 0..n-1 index.
numerical_df_train = pd.DataFrame(
    scaler.fit_transform(numerical_df_train),
    columns=NUMERICAL_DATA,
    index=numerical_df_train.index,
)


# test set: same column handling, reusing the statistics learned on train
numerical_df_test = test_df[NUMERICAL_DATA].astype(float)
categorical_df_test = test_df[CATEGORICAL_DATA].astype('category')

# normalization
numerical_df_test = pd.DataFrame(
    scaler.transform(numerical_df_test),
    columns=NUMERICAL_DATA,
    index=numerical_df_test.index,
)

# Create an instance of the OneHotEncoder
encoder = ce.OneHotEncoder(cols=CATEGORICAL_DATA, use_cat_names=True)

# Fit the encoder on the training data
encoder.fit(categorical_df_train)

# assumes the encoder output keeps the index of its input frame — TODO confirm
one_hot_df_train = encoder.transform(categorical_df_train)
one_hot_df_test = encoder.transform(categorical_df_test)

# Final model inputs: scaled numeric columns followed by the one-hot columns.
merged_df_train = pd.concat([numerical_df_train, one_hot_df_train], axis=1)
merged_df_test = pd.concat([numerical_df_test, one_hot_df_test], axis=1)

In [17]:
# merged_df_train
# merged_df_test

## Modelling

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Shared experiment settings.
RANDOM_SEED = 42
TEST_SIZE = 0.2
# 3-fold stratified splitter; only referenced by the (commented-out) grid search.
FOLD = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

# NOTE(review): this hold-out split is not stratified although the recorded
# report shows an imbalanced target; consider stratify=label_sampling.

# X_train, X_test, y_train, y_test = train_test_split(X_train_norm, label_sampling, test_size=TEST_SIZE, random_state=RANDOM_SEED) # All Normalized
# Active variant: numeric columns normalized, categoricals one-hot encoded.
X_train, X_test, y_train, y_test = train_test_split(
    merged_df_train,
    label_sampling,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)
# X_train, X_test, y_train, y_test = train_test_split(feature_sampling, label_sampling, test_size=TEST_SIZE, random_state=RANDOM_SEED) # No Normalized
# X_train, X_test, y_train, y_test = train_test_split(feature_df, label_df, test_size=TEST_SIZE, random_state=RANDOM_SEED) # No Normalized
# X_train, X_test, y_train, y_test = feature_sampling, feature_df, label_sampling, label_df

### Hyperparameter Tuning

In [19]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# from sklearn.model_selection import cross_val_score

In [20]:
# defining parameter range
# param_grid = {'learning_rate': [0.1, 0.01, 0.001],
#               'min_child_weight':[1, 2, 3],
#               'gamma':[0, 0.1, 0.001]}

# grid = GridSearchCV(estimator=xgb,
#                     param_grid=param_grid,
#                     scoring='f1_micro', n_jobs=-1,
#                     cv=FOLD, verbose=3)
  
# fitting the model for grid search
# grid.fit(X_train, y_train) # Split the training and validation
# grid.fit(X_train_norm, label) # All the training with normalized
# grid.fit(X_resample, label) # All the training with numeric normalized
# grid.fit(feature, label) # All the features no normalized

# print(grid.best_params_)

# grid_predictions = grid.best_estimator_.predict(X_test)
# print(classification_report(y_test, grid_predictions, digits=5))

In [21]:
# Hyper-parameters of the LightGBM classifier used for the submission.
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0, so subsample=0.8 may have no effect here — confirm
# against the LightGBM parameter documentation.
# NOTE(review): the recorded report shows class 1 as the majority class, and
# scale_pos_weight=2 weights that class up further — confirm this is intended.
LGBM_PARAMS = {
    'boosting_type': 'dart',
    'n_estimators': 1000,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 2,
    'num_leaves': 100,
    'random_state': 42,
    'learning_rate': 0.1,
    'min_child_weight': 2,
    'max_depth': 22,
}

# Train on the training part of the split.
lgbm = LGBMClassifier(**LGBM_PARAMS)
lgbm.fit(X_train, y_train)

# Evaluate on the held-out part and print per-class precision/recall/F1.
prediction = lgbm.predict(X_test)
print(classification_report(y_test, prediction, digits=5))

              precision    recall  f1-score   support

           0    0.72691   0.26462   0.38800       684
           1    0.91987   0.98836   0.95288      5842

    accuracy                        0.91250      6526
   macro avg    0.82339   0.62649   0.67044      6526
weighted avg    0.89964   0.91250   0.89368      6526



In [22]:
# Custom Threshold

# from sklearn.metrics import f1_score

# # Make probability predictions on the validation data
# y_prob = lgbm.predict_proba(X_test)[:, 1]

# # Compute the F1 score for different threshold values
# thresholds = np.arange(0, 1.01, 0.01)
# f1_scores = [f1_score(y_test, y_prob > t) for t in thresholds]

# # Find the threshold that gives the highest F1 score
# best_threshold = thresholds[np.argmax(f1_scores)]
# best_f1_score = np.max(f1_scores)

# print(f'Best threshold: {best_threshold}')
# print(f'Best F1 score: {best_f1_score}')

# # Make binary predictions using the best decision threshold
# y_pred = y_prob > best_threshold

# print(classification_report(y_test,y_pred, digits=4))

## Prediction

### Predict

In [23]:
# Predict labels for the preprocessed test set with the fitted LightGBM model.
# The variable keeps the name `grid_predictions` from the earlier grid-search
# variants listed below (kept for reference, not executed):
# grid_predictions = grid.best_estimator_.predict(test_norm)
# grid_predictions = grid.best_estimator_.predict(test_df1)
# grid_predictions = xgb.predict(test_norm)
grid_predictions = lgbm.predict(merged_df_test)
# grid_predictions = model.predict(test_copy)
# Show the last 100 predictions as a quick sanity check.
grid_predictions[-100:]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [24]:
# Custom Threshold

# P_prod_submit = lgbm.predict_proba(test_norm)[:, 1] > best_threshold
# P_prod_submit = grid.best_estimator_.predict(X_test_submit)
# P_prod_submit

In [25]:
# Translate the numeric predictions into the submission label strings:
# 1 -> 'Layak Minum', anything else -> 'Tidak Layak Minum'.
# (To build the submission from the custom-threshold predictions instead,
# iterate over P_prod_submit rather than grid_predictions.)
result = [
    'Layak Minum' if res == 1 else 'Tidak Layak Minum'
    for res in grid_predictions
]

# Submission table: one row per test id with its predicted label.
finish_pd = pd.DataFrame({'id': test_id.values, 'DC201': result})
finish_pd

Unnamed: 0,id,DC201
0,26718,Layak Minum
1,26802,Layak Minum
2,41302,Layak Minum
3,38698,Layak Minum
4,44257,Layak Minum
...,...,...
11985,36943,Layak Minum
11986,33415,Layak Minum
11987,41998,Layak Minum
11988,41567,Layak Minum


In [26]:
# Count how many test rows were assigned to each predicted label.
finish_pd['DC201'].value_counts()

DC201
Layak Minum          11553
Tidak Layak Minum      437
Name: count, dtype: int64

In [27]:
# finish_pd.to_csv('/home/danielbudi/Collage/comp-gammafest/comp-model/comp-dataset/result-file/result-LGBM-one_hot_encode-no_SMOTE-MinMax-Cat_modus_num_imputation.csv', index=False)