In [8]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split

In [9]:
df_test = pd.read_csv('test.csv', index_col=0)
df_train = pd.read_csv('train.csv', index_col=0)

In [10]:
df_train

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,I,1.5250,1.1750,0.3750,28.973189,12.728926,6.647958,8.348928,9
1,I,1.1000,0.8250,0.2750,10.418441,4.521745,2.324659,3.401940,8
2,M,1.3875,1.1125,0.3750,24.777463,11.339800,5.556502,6.662133,9
3,F,1.7000,1.4125,0.5000,50.660556,20.354941,10.991839,14.996885,11
4,I,1.2500,1.0125,0.3375,23.289114,11.977664,4.507570,5.953395,8
...,...,...,...,...,...,...,...,...,...
74046,F,1.6625,1.2625,0.4375,50.660556,20.680960,10.361742,12.332033,10
74047,I,1.0750,0.8625,0.2750,10.446791,4.323299,2.296310,3.543687,6
74048,F,1.4875,1.2000,0.4125,29.483480,12.303683,7.540967,8.079607,10
74049,I,1.2125,0.9625,0.3125,16.768729,8.972617,2.919999,4.280774,8


In [11]:
y = df_train['Age']
X = df_train.drop(columns=['Age'])

X['Length_Diameter_Ratio'] = X['Length'] / X['Diameter']
X['Shucked_Weight_Ratio'] = X['Shucked Weight'] / X['Weight']
X['Viscera_Weight_Ratio'] = X['Viscera Weight'] / X['Weight']
X['Shell_Weight_Ratio'] = X['Shell Weight'] / X['Weight']
X['Shucked_Viscera_Ratio'] = X['Shucked Weight'] / X['Viscera Weight']
X['Shucked_Shell_Ratio'] = X['Shucked Weight'] / X['Shell Weight']
X['Viscera_Shell_Ratio'] = X['Viscera Weight'] / X['Shell Weight']

for col in X.columns:
    if col != 'Sex':
        X[f'{col}_squared'] = X[col] ** 2
        X[f'{col}_cubed'] = X[col] ** 3
        X[f'{col}_log'] = np.log1p(X[col])
        X[f'{col}_sqrt'] = X[col] ** 0.5

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [13]:
X.describe()

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Length_Diameter_Ratio,Shucked_Weight_Ratio,Viscera_Weight_Ratio,...,Shucked_Viscera_Ratio_log,Shucked_Viscera_Ratio_sqrt,Shucked_Shell_Ratio_squared,Shucked_Shell_Ratio_cubed,Shucked_Shell_Ratio_log,Shucked_Shell_Ratio_sqrt,Viscera_Shell_Ratio_squared,Viscera_Shell_Ratio_cubed,Viscera_Shell_Ratio_log,Viscera_Shell_Ratio_sqrt
count,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,...,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0
mean,1.31746,1.024496,0.348089,23.385217,10.10427,5.058386,6.72387,1.292523,0.431529,0.216989,...,1.098946,1.416837,2.399939,4.764462,0.910319,1.219976,0.583986,0.467876,0.557977,0.863846
std,0.287757,0.237396,0.092034,12.648153,5.618025,2.792729,3.584372,0.054403,0.078134,0.030319,...,0.125845,0.140974,4.336299,87.913029,0.122563,0.128721,0.216243,0.300855,0.076234,0.077501
min,0.1875,0.1375,0.0,0.056699,0.028349,0.042524,0.042524,0.486842,0.083333,0.042254,...,0.320451,0.614613,0.026913,0.004415,0.151906,0.405033,0.006612,0.000538,0.078175,0.285153
25%,1.15,0.8875,0.3,13.437663,5.712424,2.8633,3.96893,1.262136,0.398569,0.199109,...,1.019249,1.330832,1.681623,2.180686,0.831506,1.13876,0.440588,0.292448,0.509085,0.81472
50%,1.375,1.075,0.3625,23.799405,9.90815,4.989512,6.931453,1.287879,0.432212,0.215816,...,1.096245,1.411704,2.209797,3.284948,0.910892,1.219237,0.560654,0.4198,0.558912,0.865314
75%,1.5375,1.2,0.4125,32.162508,14.033003,6.988152,9.07184,1.318681,0.463571,0.233641,...,1.173943,1.494898,2.847656,4.80542,0.988611,1.299038,0.694444,0.578704,0.606136,0.912871
max,2.012815,1.6125,2.825,80.101512,42.184056,21.54562,28.491248,3.727273,4.725118,1.32363,...,3.703768,6.292853,612.5625,15160.921875,3.248435,4.974937,10.24,32.768,1.435085,1.788854


In [None]:
cat_cols = ['Sex']
num_cols = list(set(X.columns) - set(cat_cols))

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ('scaling', StandardScaler(), num_cols)
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer),
    ('regression', XGBRegressor(random_state=42))
])

In [19]:
df_train.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
       'Viscera Weight', 'Shell Weight', 'Age'],
      dtype='object')

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Test MAE = {mean_absolute_error(y_test, y_pred):.4f}, MAPE = {mean_absolute_percentage_error(y_test, y_pred):.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'regression__n_estimators': [100, 200, 500],
    'regression__max_depth': [3, 5, 7],
    'regression__learning_rate': [0.01, 0.05, 0.1],
    'regression__subsample': [0.7, 0.8, 0.9],
    'regression__colsample_bytree': [0.6, 0.7, 0.8]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=1)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)


In [17]:
### get IMPOTRANT features 
#### ONLY WHEN RIDGE IN PIPELINE

ridge_model = model.named_steps['regression']
coefficients = ridge_model.coef_

ohe_feature_names = model.named_steps['ohe_and_scaling'].named_transformers_['ohe'].get_feature_names_out(cat_cols)
feature_names = np.concatenate([ohe_feature_names, num_cols])

AttributeError: 'RandomForestRegressor' object has no attribute 'coef_'

In [None]:
feature_importance = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
feature_importance['abs_coefficient'] = feature_importance['coefficient'].abs() # Add absolute value
feature_importance = feature_importance.sort_values(by='abs_coefficient', ascending=False).reset_index(drop=True)
print("Feature Importances:\n", feature_importance)

Feature Importances:
                           feature  coefficient  abs_coefficient
0         Shucked_Shell_Ratio_log    -6.882129         6.882129
1             Shucked_Shell_Ratio     5.465192         5.465192
2                  Shucked Weight    -4.778149         4.778149
3                          Weight     4.454525         4.454525
4     Shucked_Shell_Ratio_squared    -3.032505         3.032505
..                            ...          ...              ...
68  Length_Diameter_Ratio_squared     0.037584         0.037584
69    Viscera_Shell_Ratio_squared    -0.030133         0.030133
70     Length_Diameter_Ratio_sqrt    -0.021516         0.021516
71    Length_Diameter_Ratio_cubed     0.021189         0.021189
72              Shell Weight_sqrt     0.005581         0.005581

[73 rows x 3 columns]


In [None]:
imp_features = feature_importance.feature[(feature_importance['abs_coefficient'] > 0.6) & \
                                          (feature_importance['feature'] != 'Sex_F') &\
                                          (feature_importance['feature'] != 'Sex_M') &\
                                          (feature_importance['feature'] != 'Sex_I')].values

In [None]:
ohe_feature_names

array(['Sex_F', 'Sex_I', 'Sex_M'], dtype=object)

ValueError: ('Lengths must match to compare', (73,), (3,))

In [None]:
model_final = pipeline.fit(X, y)
y_pred_final = model.predict(X_test)

In [None]:
ans = pd.concat([pd.Series(X_test.index), pd.Series(y_pred_final)], axis=1).rename({0: 'yield'}, axis=1)

In [None]:
ans.to_csv('submission.csv', index=False)

In [None]:
ans

Unnamed: 0,id,yield
0,74051,7.735468
1,74052,7.682302
2,74053,10.433072
3,74054,9.556363
4,74055,7.504227
...,...,...
49363,123414,9.370156
49364,123415,7.840062
49365,123416,12.330386
49366,123417,10.023956
