# Cloudfight Coding Contest AI 2022

In [3]:
# !pip install pandas
# !pip install matplotlib
# !pip install scikit-learn
# !pip install xgboost
# !pip install lightgbm


In [4]:
# Matrix and plots
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, LabelBinarizer
from sklearn.model_selection import RandomizedSearchCV
# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## Load data

In [48]:
DATA_PATH = r"abalone.data"
df = pd.read_csv(DATA_PATH, header=None)
X = df.iloc[:, 0:-1]  # Get first k-1 cols
print(X.head())
y = df.iloc[:, -1]  # Get last col
print(y.head())


   0      1      2      3       4       5       6      7
0  M  0.455  0.365  0.095  0.5140  0.2245  0.1010  0.150
1  M  0.350  0.265  0.090  0.2255  0.0995  0.0485  0.070
2  F  0.530  0.420  0.135  0.6770  0.2565  0.1415  0.210
3  M  0.440  0.365  0.125  0.5160  0.2155  0.1140  0.155
4  I  0.330  0.255  0.080  0.2050  0.0895  0.0395  0.055
0    15
1     7
2     9
3    10
4     7
Name: 8, dtype: int64


## Preprocessing

### Missing values

In [35]:
print(X.isnull().sum(axis=0))

# # Numeric vars
num_idx_cols = X.select_dtypes(include=np.number).columns.tolist()
num_imp = SimpleImputer(missing_values=np.nan, strategy='median')
num_imp.fit(X.select_dtypes(include=np.number))
X.iloc[:, num_idx_cols] = num_imp.transform(X.select_dtypes(include=np.number))

# Cat vars
cat_idx_cols = X.select_dtypes(include=["object"]).columns.tolist()
categoricalImputer = SimpleImputer(
    missing_values=None, strategy='most_frequent')
categoricalImputer.fit(X.select_dtypes(include=["object"]))
X.iloc[:, cat_idx_cols] = categoricalImputer.transform(
    X.select_dtypes(include=["object"]))


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64


### Categorical attributes to numerical

In [36]:
enc = OneHotEncoder()
CAT_COLS = [0]
cat_cols_encoded = pd.DataFrame(enc.fit_transform(X[CAT_COLS]).toarray())
X = X.drop(columns=CAT_COLS)
X = pd.concat([X, cat_cols_encoded], axis=1)
print(X)


          1      2      3       4       5       6       7    0    1    2
0     0.455  0.365  0.095  0.5140  0.2245  0.1010  0.1500  0.0  0.0  1.0
1     0.350  0.265  0.090  0.2255  0.0995  0.0485  0.0700  0.0  0.0  1.0
2     0.530  0.420  0.135  0.6770  0.2565  0.1415  0.2100  1.0  0.0  0.0
3     0.440  0.365  0.125  0.5160  0.2155  0.1140  0.1550  0.0  0.0  1.0
4     0.330  0.255  0.080  0.2050  0.0895  0.0395  0.0550  0.0  1.0  0.0
...     ...    ...    ...     ...     ...     ...     ...  ...  ...  ...
4172  0.565  0.450  0.165  0.8870  0.3700  0.2390  0.2490  1.0  0.0  0.0
4173  0.590  0.440  0.135  0.9660  0.4390  0.2145  0.2605  0.0  0.0  1.0
4174  0.600  0.475  0.205  1.1760  0.5255  0.2875  0.3080  0.0  0.0  1.0
4175  0.625  0.485  0.150  1.0945  0.5310  0.2610  0.2960  1.0  0.0  0.0
4176  0.710  0.555  0.195  1.9485  0.9455  0.3765  0.4950  0.0  0.0  1.0

[4177 rows x 10 columns]


### Encode multiple class

In [44]:
y = LabelBinarizer().fit_transform(y)

### Scaling

In [38]:
NORMALIZE_COLS = X.columns
X = pd.DataFrame(RobustScaler().fit_transform(X[NORMALIZE_COLS]))


### Feature engineering

In [39]:
VARIABLES = [0,1,2,3,4]
MOMENTS = [2, 3]
X_moments = [X[VARIABLES].pow(m) for m in MOMENTS]

X = pd.concat([X] + X_moments, axis=1)
X.columns = [str(i) for i in range(len(X.columns))]
print(X)

             0    1         2    3    4         5         6         7  \
0    -0.545455  0.0 -0.461538  1.0 -0.9 -0.401265 -0.352848 -0.438871   
1    -1.181818  0.0 -1.230769  1.0 -1.0 -0.806746 -0.748418 -0.768025   
2    -0.090909  0.0 -0.038462  0.0 -0.1 -0.172171 -0.251582 -0.184953   
3    -0.636364  0.0 -0.461538  1.0 -0.3 -0.398454 -0.381329 -0.357367   
4    -1.303030  1.0 -1.307692  0.0 -1.2 -0.835559 -0.780063 -0.824451   
...        ...  ...       ...  ...  ...       ...       ...       ...   
4172  0.121212  0.0  0.192308  0.0  0.5  0.122980  0.107595  0.426332   
4173  0.272727  0.0  0.115385  1.0 -0.1  0.234013  0.325949  0.272727   
4174  0.333333  0.0  0.384615  1.0  1.3  0.529164  0.599684  0.730408   
4175  0.484848  0.0  0.461538  0.0  0.2  0.414617  0.617089  0.564263   
4176  1.000000  0.0  1.000000  1.0  1.1  1.614898  1.928797  1.288401   

             8    9  ...        14   15        16   17    18        19   20  \
0    -0.422111  0.0  ...  0.297521  0.0  0.2

## Model training

In [40]:
N_ITER = 20
CV = 4
RANDOM_STATE = 2022
N_JOBS = 1
best_models = {}

### XGBoost

In [46]:
xgb_params = {"n_estimators": np.arange(10, 210, step=10),
              "eta": np.arange(0.01, 0.3, step=0.01),
              "subsample": np.arange(0.5, 1, step=0.05),
              "colsample_bytree": np.arange(0.5, 1, step=0.05),
              "max_depth": np.arange(3, 10, step=1),
              "min_child_weight": np.arange(1, 5, step=0.05),
              "random_state": [RANDOM_STATE]}

xgb_models = RandomizedSearchCV(estimator=XGBClassifier(), n_jobs=N_JOBS, param_distributions=xgb_params, n_iter=N_ITER,  verbose=1, cv=CV,
                                scoring='accuracy', random_state=RANDOM_STATE)
xgb_models.fit(X, y)
best_models[xgb_models.best_estimator_] = xgb_models.best_score_
print(xgb_models.best_score_)


Fitting 4 folds for each of 20 candidates, totalling 80 fits
1.0


### LGBM

In [None]:
lgbm_params = {"n_estimators": np.arange(10, 210, step=10),
               "learning_rate": np.arange(0.01, 0.3, step=0.01),
               "subsample": np.arange(0.5, 1, step=0.05),
               "colsample_bytree": np.arange(0.5, 1, step=0.05),
               "max_depth": np.arange(3, 10, step=1),
               "min_child_weight": np.arange(1, 5, step=0.05),
               "random_state": [RANDOM_STATE]}

lgbm_models = RandomizedSearchCV(estimator=LGBMClassifier(), n_jobs=N_JOBS, param_distributions=lgbm_params, n_iter=N_ITER,  verbose=1, cv=CV,
                                 scoring='accuracy', random_state=RANDOM_STATE)
lgbm_models.fit(X, y)
best_models[lgbm_models.best_estimator_] = lgbm_models.best_score_
print(lgbm_models.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
-1.553051557144041


### SVM

In [None]:
svm_params = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": [1, 2, 3, 4],
    "epsilon": np.arange(0.001, 1, step=0.001),
    "C": np.arange(1, 100, step=1),
    # "random_state": [RANDOM_STATE]
}

svm_models = RandomizedSearchCV(estimator=SVC(), n_jobs=N_JOBS, param_distributions=svm_params, n_iter=N_ITER,  verbose=1, cv=2,
                                scoring='accuracy', random_state=RANDOM_STATE)
svm_models.fit(X, y)
best_models[svm_models.best_estimator_] = svm_models.best_score_
print(svm_models.best_score_)


Fitting 2 folds for each of 10 candidates, totalling 20 fits
-1.4855499157113923


### Random Forest

In [None]:
rf_params = {
    "bootstrap": [True],
    "max_depth": np.arange(10, 110, step=5),
    "max_features": np.arange(0.5, 1, step=0.05),
    "min_samples_leaf": [1, 2, 4],
    "min_samples_split": [2, 5, 10],
    "n_estimators": np.arange(10, 210, step=10),
    "random_state": [RANDOM_STATE]
}

rf_models = RandomizedSearchCV(estimator=RandomForestClassifier(), n_jobs=N_JOBS, param_distributions=rf_params, n_iter=N_ITER,  verbose=1, cv=2,
                               scoring='accuracy', random_state=RANDOM_STATE)
rf_models.fit(X, y)
best_models[rf_models.best_estimator_] = rf_models.best_score_
print(rf_models.best_score_)


Fitting 2 folds for each of 10 candidates, totalling 20 fits
-1.4981509229564964


## Predictions

In [None]:
best_model = max(best_models, key=best_models.get)
# X_predict = pd.read_csv()
# best_model.predict(X_predict)
# X_predict.to_csv("results.csv",index=False)