# Cloudfight Coding Contest AI 2022

In [None]:
# !pip install pandas
# !pip install matplotlib
# !pip install scikit-learn
# !pip install xgboost
# !pip install lightgbm


In [None]:
# Matrix and plots
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    LabelBinarizer,
    LabelEncoder,
    OneHotEncoder,
    RobustScaler,
)
from sklearn.model_selection import RandomizedSearchCV
# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## Load data

In [None]:
DATA_PATH = r"abalone.data"
# The file is read without a header row, so pandas labels the columns 0..k-1.
df = pd.read_csv(DATA_PATH, header=None)
# Take explicit copies: the preprocessing steps assign into X in place, and
# writing into a slice of df would raise chained-assignment warnings.
X = df.iloc[:, :-1].copy()  # Features: every column except the last
print(X.head())
y = df.iloc[:, -1].copy()  # Target: the last column
print(y.head())


## Preprocessing

### Missing values

In [None]:
# Report the number of missing entries per column before imputing.
print(X.isnull().sum(axis=0))

# Numeric vars: replace missing entries with the column median.
num_idx_cols = X.select_dtypes(include=np.number).columns.tolist()
num_imp = SimpleImputer(missing_values=np.nan, strategy='median')
if num_idx_cols:  # the imputer cannot be fitted on a selection with no columns
    # Select and assign by label; these names are column labels, not positions.
    X[num_idx_cols] = num_imp.fit_transform(X[num_idx_cols])

# Cat vars: replace missing entries with the most frequent value.
cat_idx_cols = X.select_dtypes(include=["object"]).columns.tolist()
# pandas marks gaps read from a CSV as NaN (not None), so NaN is the marker
# the imputer has to look for.
categoricalImputer = SimpleImputer(
    missing_values=np.nan, strategy='most_frequent')
if cat_idx_cols:
    X[cat_idx_cols] = categoricalImputer.fit_transform(X[cat_idx_cols])


### Categorical attributes to numerical

In [None]:
# One-hot encode the categorical columns and append the dummies to X.
enc = OneHotEncoder()
CAT_COLS = [0]
encoded_values = enc.fit_transform(X[CAT_COLS]).toarray()
# Name every dummy "<source column>_<category>". A frame built without names
# gets labels 0..k-1, which can collide with the integer labels of the
# remaining feature columns and leave X with duplicate column labels.
encoded_names = [
    f"{col}_{cat}"
    for col, cats in zip(CAT_COLS, enc.categories_)
    for cat in cats
]
# Reuse X's index so concat lines the rows up one-to-one.
cat_cols_encoded = pd.DataFrame(
    encoded_values, columns=encoded_names, index=X.index)
X = X.drop(columns=CAT_COLS)
# Keep all column labels the same type (str); scikit-learn rejects frames
# whose column names mix strings and non-strings.
X.columns = X.columns.astype(str)
X = pd.concat([X, cat_cols_encoded], axis=1)
print(X)


### Encode the multiclass target

In [None]:
# Map every class label to an integer code in 0..n_classes-1 and keep y 1-D.
# LabelBinarizer would return a 2-D indicator matrix for a multiclass target,
# which single-output classifiers such as SVC do not accept.
y = LabelEncoder().fit_transform(y)

### Scaling

In [None]:
NORMALIZE_COLS = X.columns
# Every column is scaled, so hand the frame to the scaler directly.
# Selecting with X[NORMALIZE_COLS] returns every matching column for every
# requested label, so duplicate column labels would silently repeat features.
# The result is a fresh frame with default integer column labels.
X = pd.DataFrame(RobustScaler().fit_transform(X))


### Feature engineering

In [None]:
# Columns whose powers are appended as extra features, and the exponents used.
VARIABLES = [0,1,2,3,4]
MOMENTS = [2, 3]
# One frame per exponent, each holding the selected columns raised to it.
X_moments = [X[VARIABLES] ** exponent for exponent in MOMENTS]

# Append the power features to the right of the existing ones.
X = pd.concat([X, *X_moments], axis=1)
# Relabel all columns "0".."n-1" so every label is a unique string.
X.columns = list(map(str, range(X.shape[1])))
print(X)

## Model training

In [None]:
# Settings shared by every randomized hyper-parameter search below.
N_ITER: int = 20  # passed as n_iter to each search
CV: int = 4  # passed as cv to each search
RANDOM_STATE: int = 2022  # seed given to the searches and the estimators
N_JOBS: int = 1  # passed as n_jobs to each search
# Filled by the model cells: best estimator -> its best cross-validated score.
best_models: dict = {}

### XGBoost

In [None]:
# Candidate values for each XGBoost hyper-parameter explored by the search.
xgb_params = dict(
    n_estimators=np.arange(10, 210, 10),
    eta=np.arange(0.01, 0.3, 0.01),
    subsample=np.arange(0.5, 1, 0.05),
    colsample_bytree=np.arange(0.5, 1, 0.05),
    max_depth=np.arange(3, 10),
    min_child_weight=np.arange(1, 5, 0.05),
    random_state=[RANDOM_STATE],
)

# Randomized search: N_ITER sampled settings, CV-fold cross-validation,
# ranked by accuracy.
xgb_models = RandomizedSearchCV(
    estimator=XGBClassifier(),
    param_distributions=xgb_params,
    n_iter=N_ITER,
    scoring='accuracy',
    cv=CV,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    verbose=1,
)
xgb_models.fit(X, y)
# Record the winning estimator with its score for the final comparison.
best_models.update({xgb_models.best_estimator_: xgb_models.best_score_})
print(xgb_models.best_score_)


### LGBM

In [None]:
# Candidate values for each LightGBM hyper-parameter explored by the search.
lgbm_params = {"n_estimators": np.arange(10, 210, step=10),
               "learning_rate": np.arange(0.01, 0.3, step=0.01),
               "subsample": np.arange(0.5, 1, step=0.05),
               # LightGBM only applies row subsampling when the bagging
               # frequency is positive; with the default of 0 the "subsample"
               # values above would have no effect.
               "subsample_freq": [1],
               "colsample_bytree": np.arange(0.5, 1, step=0.05),
               "max_depth": np.arange(3, 10, step=1),
               "min_child_weight": np.arange(1, 5, step=0.05),
               "random_state": [RANDOM_STATE]}

# Randomized search: N_ITER sampled settings, CV-fold cross-validation,
# ranked by accuracy.
lgbm_models = RandomizedSearchCV(estimator=LGBMClassifier(), n_jobs=N_JOBS, param_distributions=lgbm_params, n_iter=N_ITER,  verbose=1, cv=CV,
                                 scoring='accuracy', random_state=RANDOM_STATE)
lgbm_models.fit(X, y)
# Record the winning estimator with its score for the final comparison.
best_models[lgbm_models.best_estimator_] = lgbm_models.best_score_
print(lgbm_models.best_score_)


### SVM

In [None]:
# Candidate values for each SVC hyper-parameter explored by the search.
# "epsilon" is deliberately absent: it is a parameter of the regressor (SVR),
# not of SVC, and listing it makes the search fail with "Invalid parameter".
svm_params = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": [1, 2, 3, 4],
    "C": np.arange(1, 100, step=1),
    "random_state": [RANDOM_STATE]
}

# Randomized search: N_ITER sampled settings, CV-fold cross-validation,
# ranked by accuracy.
svm_models = RandomizedSearchCV(estimator=SVC(), n_jobs=N_JOBS, param_distributions=svm_params, n_iter=N_ITER,  verbose=1, cv=CV,
                                scoring='accuracy', random_state=RANDOM_STATE)
svm_models.fit(X, y)
# Record the winning estimator with its score for the final comparison.
best_models[svm_models.best_estimator_] = svm_models.best_score_
print(svm_models.best_score_)


### Random Forest

In [None]:
# Candidate values for each random-forest hyper-parameter explored by the search.
rf_params = dict(
    bootstrap=[True],
    max_depth=np.arange(10, 110, 5),
    max_features=np.arange(0.5, 1, 0.05),
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=np.arange(10, 210, 10),
    random_state=[RANDOM_STATE],
)

# Randomized search: N_ITER sampled settings, CV-fold cross-validation,
# ranked by accuracy.
rf_models = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_params,
    n_iter=N_ITER,
    scoring='accuracy',
    cv=CV,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    verbose=1,
)
rf_models.fit(X, y)
# Record the winning estimator with its score for the final comparison.
best_models.update({rf_models.best_estimator_: rf_models.best_score_})
print(rf_models.best_score_)


## Predictions

In [None]:
# Pick the estimator with the highest score recorded in best_models.
best_model = max(best_models.items(), key=lambda entry: entry[1])[0]
# X_predict = pd.read_csv()
# best_model.predict(X_predict)
# X_predict.to_csv("results.csv",index=False)