> **Reference:**
- https://medium.com/analytics-vidhya/build-data-transformation-pipelines-using-scikit-learn-e36c9d5280e3
- https://www.kaggle.com/datasets/onurgitmez/pokemon-stats-gen-1-9
- ChatGPT my best bro ❤

### Download Dataset

In [None]:
from google.colab import files

# Bind the result to a name instead of leaving it as the cell's last expression:
# the returned dict maps each filename to the file's raw bytes, and echoing it
# would store the contents of kaggle.json (an API key) in the notebook output.
uploaded = files.upload()
print('Uploaded:', ', '.join(uploaded))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Download the onurgitmez/pokemon-stats-gen-1-9 dataset archive from Kaggle.
!kaggle datasets download -d onurgitmez/pokemon-stats-gen-1-9

Downloading pokemon-stats-gen-1-9.zip to /content
  0% 0.00/25.8k [00:00<?, ?B/s]
100% 25.8k/25.8k [00:00<00:00, 2.18MB/s]


In [None]:
!unzip -q pokemon-stats-gen-1-9.zip

### Import Required Libraries

In [None]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# joblib is already imported in the imports cell; just report the version that
# is used to serialize the model later in the notebook.
print(joblib.__version__)

1.3.2


### Data Preprocessing

In [None]:
# Load PokemonStats.csv into a DataFrame and print a summary of its columns,
# non-null counts and dtypes.
df_pokemon = pd.read_csv('PokemonStats.csv')
df_pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       1194 non-null   int64  
 1   Name     1194 non-null   object 
 2   Total    1194 non-null   int64  
 3   HP       1194 non-null   int64  
 4   Attack   1194 non-null   int64  
 5   Defense  1194 non-null   int64  
 6   SpAtk    1194 non-null   int64  
 7   SpDef    1194 non-null   int64  
 8   Speed    1194 non-null   int64  
 9   Type1    1194 non-null   object 
 10  Type2    652 non-null    object 
 11  Height   1194 non-null   float64
 12  Weight   1193 non-null   float64
dtypes: float64(2), int64(8), object(3)
memory usage: 121.4+ KB


In [None]:
# Target is the 'Total' column; the features are every remaining column.
y = df_pokemon['Total']
X = df_pokemon.drop(columns=['Total'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=16,
)

In [None]:
# Names of the integer/float feature columns, leaving out the 'ID' column.
numerical_features = (
    X.select_dtypes(include=['int', 'float'])
    .drop(columns=['ID'])
    .columns
)
numerical_features

Index(['HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed', 'Height',
       'Weight'],
      dtype='object')

In [None]:
# Names of the object-dtype feature columns, leaving out the 'Name' column.
categorical_features = (
    X.select_dtypes(include=['object'])
    .drop(columns=['Name'])
    .columns
)
categorical_features

Index(['Type1', 'Type2'], dtype='object')

In [None]:
# Columns excluded from the model inputs; they are handed to the
# ColumnTransformer's 'drop' step below.
drop_features = ['ID', 'Name']

Instantiate the preprocessing transformers

In [None]:
# Missing categorical values (Type2 is empty for many rows) are replaced by the
# literal string 'None', which the encoder then treats as a category of its own.
categorical_imputer = SimpleImputer(strategy='constant', fill_value='None')
# Missing numeric values are replaced by the column mean learned during fit.
numerical_imputer = SimpleImputer(strategy='mean')
std_scaler = StandardScaler()
# Encode categories unseen during fit as -1 instead of raising, so the fitted
# pipeline can still score a row whose type did not occur in the training split.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

Build the preprocessing pipelines

In [None]:
# Numeric branch: impute missing values first, then standardise.
numerical_transformation = [
    ('imputer', numerical_imputer),
    ('standard_scaler', std_scaler),
]
numerical_pipeline = Pipeline(steps=numerical_transformation)

In [None]:
# Categorical branch: impute missing values first, then ordinal-encode.
categorical_transformation = [
    ('imputer', categorical_imputer),
    ('encoder', ordinal_encoder),
]
categorical_pipeline = Pipeline(steps=categorical_transformation)

In [None]:
# Send each column group to its branch: numeric and categorical columns go
# through their pipelines, the columns in drop_features are discarded, and any
# column not listed here is passed through unchanged.
all_transformation = [
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features),
    ('drop', 'drop', drop_features),
]
preprocessor = ColumnTransformer(
    transformers=all_transformation,
    remainder='passthrough',
)

### Train & evaluate model

In [None]:
# Chain preprocessing and the regressor into one estimator so that both are
# fitted on the training split and are saved/loaded together.
# random_state pins the forest's randomness (same seed as the train/test split),
# making the reported metrics reproducible on a fresh run.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', RandomForestRegressor(random_state=16)),
])
model.fit(X_train, y_train)

Test Model Performance

In [None]:
# Predict 'Total' for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# scikit-learn metrics take their arguments as (y_true, y_pred).
# RMSE is taken as the square root of the MSE rather than via `squared=False`,
# which newer scikit-learn releases deprecate and then remove.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
print('Evaluating model performance...')
print(f'RMSE = {rmse:.2f}')
print(f'MAE = {mae:.2f}')

Evaluating model performance...
RMSE = 25.48
MAE = 17.87


In [None]:
# NOTE(review): this cell repeats the evaluation cell directly above it;
# consider deleting one of the two.
# Arguments follow scikit-learn's (y_true, y_pred) order, and RMSE is the square
# root of the MSE so the code does not rely on the `squared=False` flag that
# newer scikit-learn releases deprecate and then remove.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
print('Evaluating model performance...')
print(f'RMSE = {rmse:.2f}')
print(f'MAE = {mae:.2f}')

Evaluating model performance...
RMSE = 25.48
MAE = 17.87


Save model

In [None]:
# Persist the whole fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(model, 'model_pokemon.joblib')

['model_pokemon.joblib']

Model inference

In [None]:
# Reload the saved pipeline from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_model = joblib.load('model_pokemon.joblib')

In [None]:
# Show the first row as a reference for the hand-built inference input below.
df_pokemon.head(1)

Unnamed: 0,ID,Name,Total,HP,Attack,Defense,SpAtk,SpDef,Speed,Type1,Type2,Height,Weight
0,1,Bulbasaur,318,45,49,49,65,65,45,Grass,Poison,0.7,6.9


In [None]:
# Single-row input frame built from one record. It carries only the model's
# feature columns; 'ID' and 'Name' are omitted because the pipeline drops them.
test_data = pd.DataFrame([
    {
        'HP': 45,
        'Attack': 49,
        'Defense': 49,
        'SpAtk': 65,
        'SpDef': 65,
        'Speed': 45,
        'Height': 0.7,
        'Weight': 6.9,
        'Type1': 'Grass',
        'Type2': 'Poison',
    }
])

# predict returns one value per input row; take the only one.
predictions = loaded_model.predict(test_data)[0]
print("Total power of your pokemon:", predictions)

Total power of your pokemon: 315.97
