<a href="https://colab.research.google.com/github/alwaysneedhelp/AI-Challenge/blob/main/AI_Challenge_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
model = LGBMRegressor(
    random_state=42,
    min_child_samples=1,
    min_split_gain=0.05,
    min_data_in_bin=1,
    max_bin=511
)


df = pd.read_csv('/content/drive/MyDrive/train_AI_challenge_2.csv')

In [None]:
def mape_points(y_true, y_pred):
    y_safe = np.where(y_true==0, 1e-6, y_true)
    return mean_absolute_percentage_error(y_safe, y_pred)
custom_scorer = make_scorer(mape_points, greater_is_better=True)

In [None]:
param_grid = {
    "num_leaves": [15, 20, 63],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 300, 500],
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=custom_scorer,
    n_jobs=-1,
    verbose=1,
)

In [None]:
x = df.drop(columns=['id', 'cell_id', 'cycle_life'])
y = df['cycle_life']

In [None]:
def preprocess(x):


  # Scale all this
  scaler = StandardScaler()
  x = pd.DataFrame(scaler.fit_transform(x),
                    columns=x.columns)


  # First of all, we should process all 0s, or values that are very close to 0
  def process_0s(df):
    for col in df.columns:
        non_zero = df.loc[df[col].abs() > 1e-8, col]   # keep only "real" non-zeros
        col_mean = non_zero.mean()
        df.loc[df[col].abs() <= 1e-8, col] = col_mean

    return df

  x = process_0s(x)

  # Then remove all constant or near constant values, to reduce repetitive values
  const = [c for c in x.columns if x[c].nunique() <= 1]
  nearconst = [c for c in x.columns if (x[c].value_counts(normalize=True).iloc[0] > 0.98)]
  x = x.drop(columns=list(set(const+nearconst)))


  # Then drop highly correlated features to confuse the model less
  corr = x.corr().abs()
  upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
  to_drop = [col for col in upper.columns if any(upper[col] > 0.9)]
  x = x.drop(columns=to_drop)


  # Adding new features for non-linearity
  for col in x.columns:
    if (x[col] >= 0).all():
        x[col+"_sqrt"] = np.sqrt(x[col])
        x[col+"_log"] = np.log1p(x[col])
    x[col+"_sq"] = x[col] ** 2
    x[col+"_inv"] = np.where(x[col].abs() > 1e-8, 1/x[col], 0)

  # Apply PCA to have a summary consisting of 15 features
  pca = PCA(n_components=15, random_state=42)
  x = pca.fit_transform(x)
  x = pd.DataFrame(
      x,
      columns=[f'PCA_{i+1}' for i in range(x.shape[1])]
  )

  return x
x = preprocess(x)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
grid.fit(x_train, y_train)

model = grid.best_estimator_

y_pred = model.predict(x_test)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 22, number of used features: 15
[LightGBM] [Info] Start training from score 187.090909


In [None]:
np.mean(np.abs((y_train - model.predict(x_train)) / np.where(y_train == 0, 1e-6, y_train))) * 100

np.float64(100.44339106647904)

In [None]:
y_test

Unnamed: 0,cycle_life
9,485
25,96
8,79
21,79
0,14
12,555


In [None]:
y_pred

array([420.33337522, 124.96508035, 375.95563787, 420.33337522,
        99.16106548, 420.33337522])

In [None]:
# grid.fit(x, y)
# model = grid.best_estimator_

In [None]:
y_test_safe = np.where(y_test == 0, 1e-6, y_test)  # avoid division by 0
mape = np.mean(np.abs((y_test - y_pred) / y_test_safe)) * 100  # in %

# Convert formula (notice: divide by 0.20 means 20%)
points = 200 * max(0, 1 - mape/20)

print(f"MAPE: {mape:.2f}%")
print(f"Points: {points:.2f}")

MAPE: 247.34%
Points: 0.00


In [None]:

# test = pd.read_csv("test-1.csv")

# x_test = preprocess(test.drop(columns=['id', 'cell_id']))
# y_pred = model.predict(x_test)

# submission = pd.DataFrame({
#     "id": test["id"],
#     "cycle_life": y_pred
# })
# submission.to_csv("submission.csv", index=False)