# Cloudfight Coding Contest AI 2022

In [112]:
# !pip install pandas
# !pip install matplotlib
# !pip install scikit-learn
# !pip install xgboost
# !pip install lightgbm


In [113]:
# Matrix and plots
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
DATA = r"abalone.data"
SEED = 2022


## Load data

In [114]:
df = pd.read_csv(DATA, header=None)
X = df.iloc[:, 0:-1]  # Get first k-1 cols
print(X.head())
y = df.iloc[:, -1]  # Get last col
print(y.head())


   0      1      2      3       4       5       6      7
0  M  0.455  0.365  0.095  0.5140  0.2245  0.1010  0.150
1  M  0.350  0.265  0.090  0.2255  0.0995  0.0485  0.070
2  F  0.530  0.420  0.135  0.6770  0.2565  0.1415  0.210
3  M  0.440  0.365  0.125  0.5160  0.2155  0.1140  0.155
4  I  0.330  0.255  0.080  0.2050  0.0895  0.0395  0.055
0    15
1     7
2     9
3    10
4     7
Name: 8, dtype: int64


## Preprocessing

### Missing values

In [115]:
# Replace numeric missing values by median
# X.loc[4177] = [None, None, None, None, None, None, None, None]
# numeric_attrs = X.select_dtypes(include=np.number).columns.tolist()
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# imp.fit(X.select_dtypes(include=np.number))
# X.iloc[:, numeric_attrs] = imp.transform(X.select_dtypes(include=np.number))

# Replace categorical missing values by mode


### Categorical attributes to numerical

In [117]:
enc = OneHotEncoder()
CAT_COLS = [0]
cat_cols_encoded = pd.DataFrame(enc.fit_transform(X[CAT_COLS]).toarray())
cat_cols_encoded.columns = ["OH_"+str(i) for i in range(len(cat_cols_encoded.columns))]
X = X.drop(columns=CAT_COLS)
X = pd.concat([X, cat_cols_encoded], axis=1)
print(X)


          1      2      3       4       5       6       7  OH_0  OH_1  OH_2
0     0.455  0.365  0.095  0.5140  0.2245  0.1010  0.1500   0.0   0.0   1.0
1     0.350  0.265  0.090  0.2255  0.0995  0.0485  0.0700   0.0   0.0   1.0
2     0.530  0.420  0.135  0.6770  0.2565  0.1415  0.2100   1.0   0.0   0.0
3     0.440  0.365  0.125  0.5160  0.2155  0.1140  0.1550   0.0   0.0   1.0
4     0.330  0.255  0.080  0.2050  0.0895  0.0395  0.0550   0.0   1.0   0.0
...     ...    ...    ...     ...     ...     ...     ...   ...   ...   ...
4172  0.565  0.450  0.165  0.8870  0.3700  0.2390  0.2490   1.0   0.0   0.0
4173  0.590  0.440  0.135  0.9660  0.4390  0.2145  0.2605   0.0   0.0   1.0
4174  0.600  0.475  0.205  1.1760  0.5255  0.2875  0.3080   0.0   0.0   1.0
4175  0.625  0.485  0.150  1.0945  0.5310  0.2610  0.2960   1.0   0.0   0.0
4176  0.710  0.555  0.195  1.9485  0.9455  0.3765  0.4950   0.0   0.0   1.0

[4177 rows x 10 columns]


Multiple class

In [54]:
y = LabelBinarizer().fit_transform(y)

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=SEED)

Normalize

In [31]:
NORMALIZE_COLS = [1]

X = StandardScaler().fit_transform(X[NORMALIZE_COLS])

#X.mean(axis=0)
#X.std(axis=0)

array([1.])

Add variables

In [51]:
VARIABLES = [1,2]
MOMENTS = [1,2,3]

X_moments = [X[VARIABLES].pow(m) for m in MOMENTS]

X = pd.concat([X] + X_moments, axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,1.1,2.1,1.2,2.2,1.3,2.3
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,0.455,0.365,0.207025,0.133225,0.094196,0.048627
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,0.350,0.265,0.122500,0.070225,0.042875,0.018610
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,0.530,0.420,0.280900,0.176400,0.148877,0.074088
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,0.440,0.365,0.193600,0.133225,0.085184,0.048627
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,0.330,0.255,0.108900,0.065025,0.035937,0.016581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,0.565,0.450,0.319225,0.202500,0.180362,0.091125
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,0.590,0.440,0.348100,0.193600,0.205379,0.085184
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,0.600,0.475,0.360000,0.225625,0.216000,0.107172
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,0.625,0.485,0.390625,0.235225,0.244141,0.114084


## Model training

### XGBoost

In [120]:
# XGBoost
xgb_params = {"random_state": SEED}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
mean_squared_error(y_test, xgb_pred)

6.132771009659706

### LGBM

In [121]:
lgbm_params = {"random_state": SEED}
lgbm_model = LGBMRegressor(**xgb_params)
lgbm_model.fit(X_train, y_train)
lgbm_pred = xgb_model.predict(X_test)
mean_squared_error(y_test, xgb_pred)

6.132771009659706

### Random Forest