# Predi√ß√£o de Bioincrusta√ß√£o Nexus



In [43]:
# @title
import pkg_resources
import sys

def create_requirements_file(filename="requirements.txt"):
    # List of libraries explicitly used in the notebook
    explicit_dependencies = [
        'pandas',
        'numpy',
        'matplotlib',
        'seaborn',
        'tqdm',
        'scikit-learn',
        'xgboost',
        'lightgbm',
        'gdown', # Included if used for data download
        'openpyxl' # For reading/writing excel files, if applicable
    ]

    # Get all installed packages
    installed_packages = {p.project_name.lower(): p for p in pkg_resources.working_set}

    reqs = []
    for dep_name in explicit_dependencies:
        try:
            # Try to get the package, handling case-insensitivity
            package = installed_packages.get(dep_name.lower())
            if package:
                reqs.append(f"{package.project_name}=={package.version}")
            else:
                print(f"Warning: '{dep_name}' specified but not found in environment. Skipping.")
        except Exception as e:
            print(f"Error processing '{dep_name}': {e}. Skipping.")

    with open(filename, "w") as f:
        for r in sorted(reqs):
            f.write(r + "\n")
    print(f"Generated '{filename}' with {len(reqs)} key dependencies.")
    print("Please review the file and add any missing dependencies or remove unnecessary ones.")

create_requirements_file()

Generated 'requirements.txt' with 10 key dependencies.
Please review the file and add any missing dependencies or remove unnecessary ones.


In [44]:
# @title
!pip install -r requirements.txt



In [45]:
# @title
"""
Predi√ß√£o de Bioincrusta√ß√£o - An√°lise Avan√ßada de Fouling


1. Features de tempo ocioso (idle time)
2. Features de velocidade de risco
3. Progress√£o temporal da bioincrusta√ß√£o
4. Valida√ß√£o temporal (n√£o aleat√≥ria)
5. Modelo ensemble (XGBoost, LightGBM, RF, GB)
6. Target baseado em Fouling Rating IMO (0-4)
7. An√°lise de cen√°rios futuros
8. Impacto econ√¥mico realista (5-25% penalty)
9. An√°lise individual por navio da frota
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import zipfile
import os
warnings.filterwarnings('ignore')

# ML imports
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import pickle

# Configura√ß√µes
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Bibliotecas importadas com sucesso!")


Bibliotecas importadas com sucesso!


In [46]:
# @title
import pkg_resources
import sys

def create_requirements_file(filename="requirements.txt"):
    # List of libraries explicitly used in the notebook
    explicit_dependencies = [
        'pandas',
        'numpy',
        'matplotlib',
        'seaborn',
        'tqdm',
        'scikit-learn',
        'xgboost',
        'lightgbm',
        'gdown', # Included if used for data download
        'openpyxl' # For reading/writing excel files, if applicable
    ]

    # Get all installed packages
    installed_packages = {p.project_name.lower(): p for p in pkg_resources.working_set}

    reqs = []
    for dep_name in explicit_dependencies:
        try:
            # Try to get the package, handling case-insensitivity
            package = installed_packages.get(dep_name.lower())
            if package:
                reqs.append(f"{package.project_name}=={package.version}")
            else:
                print(f"Warning: '{dep_name}' specified but not found in environment. Skipping.")
        except Exception as e:
            print(f"Error processing '{dep_name}': {e}. Skipping.")

    # Additionally, check for packages in the current environment that might be implied
    # This part is more general and might pick up extras, but ensures coverage
    # For a more precise list, manually curate explicit_dependencies.

    # Get all direct imports in the current kernel, though this is harder to automate perfectly.
    # For simplicity, we stick to explicit_dependencies here unless a more complex introspection is needed.

    with open(filename, "w") as f:
        for r in sorted(reqs):
            f.write(r + "\n")
    print(f"Generated '{filename}' with {len(reqs)} key dependencies.")
    print("Please review the file and add any missing dependencies or remove unnecessary ones.")

create_requirements_file()


Generated 'requirements.txt' with 10 key dependencies.
Please review the file and add any missing dependencies or remove unnecessary ones.


## 1.5. DOWNLOAD DOS DADOS DO GOOGLE DRIVE (OPCIONAL)

In [47]:
# @title

# Se os dados n√£o existirem localmente, baixar do Google Drive
DOWNLOAD_FROM_DRIVE = True  # Altere para True para baixar do Drive

if DOWNLOAD_FROM_DRIVE:
    try:
        import gdown
        print("\n Baixando dados do Google Drive...")

        folder_url = "https://drive.google.com/drive/folders/1NJrDlremklekCO1NR4Ltm43DZBOCKdW6"
        gdown.download_folder(folder_url, quiet=False, use_cookies=False, output="Hackathon Transpetro")

        print(" Dados baixados com sucesso!")
        BASE_PATH = "Dados Hackathon Transpetro/"
    except ImportError:
        print(" gdown n√£o instalado. Execute: pip install gdown")
        print("   Usando dados locais...")
        BASE_PATH = ""
    except Exception as e:
        print(f" Erro ao baixar do Drive: {e}")
        print("   Usando dados locais...")
        BASE_PATH = ""
else:
    BASE_PATH = ""



 Baixando dados do Google Drive...


Retrieving folder contents


Retrieving folder 1oSLdQSsW0GpFgGoRZF12zFxFNs72P0sH Mais Dados
Processing file 1IzjTamdx1iq2MTi2VkrO6lAYF3s_uSIL AIS_NAVIO TESTE 2 1.csv
Processing file 1rP-GH7HLBMLS-DAQ6st9689wDtgY3YMe AIS_NAVIO TESTE 3 1.csv
Processing file 17PjAApZZCyk_2epri9x-CRyD-1BoUPor Consumo_Validacao 1.CSV
Processing file 1MoajA9gX0OHrEFdGyBRDh5DSXg0H7X-S Dados navios Valida√ß√£o 1.xlsx
Processing file 1xksJEcxznpN_anRavD_0x-4c2t8hQYRN Eventos_Validacao 1.CSV
Processing file 1BNU0xrGH54VYviSBNKGZG5Xkya4CTf0C RESULTADO Valida√ß√£o 1.xlsx
Processing file 17-kgs1RS52wenFcfpHr2ljbkq-4TG2Cf Dados AIS frota TP.zip
Processing file 1_CTM1V1PFN2guPl2ipW-VjdM80i8i7Ll Dados navios Hackathon.xlsx
Processing file 1L-iN3artAlSB3hqC6pQsX58z_9HKbmVw Dicion√°rios de Dados.xlsx
Processing file 1YYp8B3finjq-p53MURKGyPXUQt26ZIuf Manual do Participante.pdf
Processing file 1ikd9AFsF18LZTAW8mRofQhVWaXzN98ek Relatorios IWS.xlsx
Processing file 1XUPl_mDEVjtlM6g19Q-oyaRWB9Gn6a0M ResultadoQueryConsumo.csv
Processing file 1S4iA70w2SapF

Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1IzjTamdx1iq2MTi2VkrO6lAYF3s_uSIL
To: /content/Hackathon Transpetro/Mais Dados/AIS_NAVIO TESTE 2 1.csv
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 988k/988k [00:00<00:00, 12.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rP-GH7HLBMLS-DAQ6st9689wDtgY3YMe
To: /content/Hackathon Transpetro/Mais Dados/AIS_NAVIO TESTE 3 1.csv
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.22M/1.22M [00:00<00:00, 13.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=17PjAApZZCyk_2epri9x-CRyD-1BoUPor
To: /content/Hackathon Transpetro/Mais Dados/Consumo_Validacao 1.CSV
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 191k/191k [00:00<00:00, 4.92MB/s]
Downloading...
From: https://drive.google.com/uc?id=1MoajA9gX0OHrEFdGyBRDh5DSXg0H7X-S
To: /content/Hackathon Transpetro/Mais Dados/Dados navios Valida√ß√£o 1.xlsx
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12.9k/12.9k [00

 Dados baixados com sucesso!



Download completed


## 2. CARREGAMENTO DOS DADOS

In [48]:

# Se n√£o foi definido BASE_PATH no download, usar caminho local
if 'BASE_PATH' not in locals():
    BASE_PATH = "/Users/bryan/Documents/Hackathon_transpetro/"

# Tentar m√∫ltiplos caminhos poss√≠veis
data_paths = [
    BASE_PATH,
    "Hackathon Transpetro/",
    "/content/Hackathon Transpetro/",  # Google Colab
    ""  # Diret√≥rio atual
]

# Encontrar caminho v√°lido
valid_path = None
for path in data_paths:
    if os.path.exists(f"{path}ResultadoQueryEventos.csv"):
        valid_path = path
        break

if valid_path is None:
    print("‚ö†Ô∏è Dados n√£o encontrados. Configure DOWNLOAD_FROM_DRIVE=True ou ajuste BASE_PATH")
    exit(1)

BASE_PATH = valid_path
print(f"\nüìÇ Carregando dados de: {BASE_PATH}")

df_eventos = pd.read_csv(f"{BASE_PATH}ResultadoQueryEventos.csv")
df_consumo = pd.read_csv(f"{BASE_PATH}ResultadoQueryConsumo.csv")
df_navios = pd.read_excel(f"{BASE_PATH}Dados navios Hackathon.xlsx")
df_iws = pd.read_excel(f"{BASE_PATH}Relatorios IWS.xlsx")

print(f" Eventos: {df_eventos.shape}")
print(f" Consumo: {df_consumo.shape}")
print(f" Navios: {df_navios.shape}")
print(f" IWS: {df_iws.shape}")

# Carregar AIS
# Tentar m√∫ltiplos caminhos poss√≠veis
ais_paths = [
    f"{BASE_PATH}Dados AIS frota TP",  # Pasta descompactada
    f"{BASE_PATH}notebooks/Dados Hackathon Transpetro/Dados AIS frota TP.zip",  # ZIP no notebooks
    f"{BASE_PATH}Dados AIS frota TP.zip"  # ZIP na raiz
]

df_ais = pd.DataFrame()
ais_loaded = False

for ais_path in ais_paths:
    if os.path.exists(ais_path):
        if ais_path.endswith('.zip'):
            # √â um ZIP, extrair
            extract_folder = "dados_ais_temp"
            with zipfile.ZipFile(ais_path, "r") as z:
                z.extractall(extract_folder)
            csv_folder = os.path.join(extract_folder, "Dados AIS frota TP")
        else:
            # J√° √© uma pasta
            csv_folder = ais_path

        # Ler CSVs
        all_dfs = []
        for file_name in os.listdir(csv_folder):
            if file_name.lower().endswith(".csv"):
                file_path = os.path.join(csv_folder, file_name)
                df = pd.read_csv(file_path)
                df["ARQUIVO_ORIGEM"] = file_name
                all_dfs.append(df)

        if all_dfs:
            df_ais = pd.concat(all_dfs, ignore_index=True)
            print(f"‚úÖ AIS carregado de {ais_path}: {df_ais.shape}")
            ais_loaded = True
            break

if not ais_loaded:
    print(" Arquivo AIS n√£o encontrado em nenhum caminho")
    print(f"   Tentou: {ais_paths}")



üìÇ Carregando dados de: Hackathon Transpetro/
 Eventos: (50904, 22)
 Consumo: (87737, 3)
 Navios: (21, 8)
 IWS: (29, 14)
‚úÖ AIS carregado de Hackathon Transpetro/Dados AIS frota TP.zip: (415724, 7)


## 2. PR√â-PROCESSAMENTO B√ÅSICO

In [49]:
# @title

print("\n Pr√©-processamento...")

# Padronizar colunas
df_eventos.columns = df_eventos.columns.str.strip()
df_consumo.columns = df_consumo.columns.str.strip()
df_navios.columns = df_navios.columns.str.strip()
df_iws.columns = df_iws.columns.str.strip()

# Parse datetimes
for c in ["startGMTDate", "endGMTDate"]:
    if c in df_eventos.columns:
        df_eventos[c] = pd.to_datetime(df_eventos[c], errors='coerce')

# Renomear SESSION_ID
if "SESSION_ID" in df_consumo.columns:
    df_consumo.rename(columns={"SESSION_ID": "sessionId"}, inplace=True)

# Processar AIS
if not df_ais.empty:
    df_ais.columns = df_ais.columns.str.strip()

    for cand in ["DATAHORA", "DataHora", "datahora", "DATETIME"]:
        if cand in df_ais.columns:
            df_ais['DATETIME'] = pd.to_datetime(df_ais[cand], errors='coerce')
            break

    for vcol in ["VELOCIDADE", "speed", "SOG", "speedGps"]:
        if vcol in df_ais.columns:
            df_ais['speed_kn'] = pd.to_numeric(df_ais[vcol], errors='coerce')
            break

    for cand in ["NOME", "name", "ship", "shipName", "ARQUIVO_ORIGEM"]:
        if cand in df_ais.columns:
            df_ais['shipName_ais'] = df_ais[cand].astype(str)
            break

print("Pr√©-processamento conclu√≠do!")



 Pr√©-processamento...
Pr√©-processamento conclu√≠do!


## 3. AGREGA√á√ÉO AIS POR EVENTO

In [50]:
# @title

def aggregate_ais_by_event(df_eventos, df_ais):
    """Agrega dados AIS para cada evento de navega√ß√£o"""
    agg_rows = []

    if df_eventos.empty or df_ais.empty:
        return pd.DataFrame()

    df_ais['shipName_ais_low'] = df_ais['shipName_ais'].str.lower().str.strip()
    df_eventos['shipName_low'] = df_eventos['shipName'].astype(str).str.lower().str.strip()

    ais_groups = {k: g for k, g in df_ais.groupby('shipName_ais_low')}

    for idx, ev in tqdm(df_eventos.iterrows(), total=len(df_eventos), desc="Agregando AIS"):
        ship = str(ev.get('shipName_low', "")).strip()
        sdt = ev.get('startGMTDate')
        edt = ev.get('endGMTDate')

        if ship == "" or pd.isna(sdt) or pd.isna(edt):
            continue

        ais_g = ais_groups.get(ship)
        if ais_g is None:
            candidates = [k for k in ais_groups.keys() if ship in k or k in ship]
            ais_g = ais_groups.get(candidates[0]) if candidates else None

        if ais_g is None:
            continue

        window = ais_g[(ais_g['DATETIME'] >= sdt) & (ais_g['DATETIME'] <= edt)]

        if window.empty:
            continue

        speed_mean = window['speed_kn'].mean()
        speed_std = window['speed_kn'].std()
        speed_min = window['speed_kn'].min()
        speed_max = window['speed_kn'].max()
        frac_stop = (window['speed_kn'] < 1.5).mean()
        frac_low_speed = (window['speed_kn'] < 5).mean()

        lat_mean = pd.to_numeric(window.get('LATITUDE', window.get('latitude', pd.Series(np.nan))), errors='coerce').mean()
        lon_mean = pd.to_numeric(window.get('LONGITUDE', window.get('longitude', pd.Series(np.nan))), errors='coerce').mean()

        agg_rows.append({
            'sessionId': ev.get('sessionId'),
            'shipName': ev.get('shipName'),
            'startGMTDate': sdt,
            'endGMTDate': edt,
            'duration_h': ev.get('duration'),
            'distance': ev.get('distance'),
            'beaufort': ev.get('beaufortScale'),
            'seaCondition': ev.get('seaCondition'),
            'displacement': ev.get('displacement'),
            'speed_mean': speed_mean,
            'speed_std': speed_std,
            'speed_min': speed_min,
            'speed_max': speed_max,
            'frac_stop': frac_stop,
            'frac_low_speed': frac_low_speed,
            'lat_mean': lat_mean,
            'lon_mean': lon_mean
        })

    return pd.DataFrame(agg_rows)

print("\n Agregando dados AIS...")
df_events_ais = aggregate_ais_by_event(df_eventos, df_ais)
print(f"Eventos com AIS: {df_events_ais.shape}")



 Agregando dados AIS...


Agregando AIS: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50904/50904 [00:59<00:00, 854.72it/s]

Eventos com AIS: (8214, 17)





## 4. FEATURES AVAN√áADAS

In [51]:
# @title

def create_advanced_features(df):
    """Cria features avan√ßadas baseadas em ci√™ncia de bioincrusta√ß√£o"""
    df = df.copy()

    # 1. IDLE TIME FEATURES (CR√çTICO)
    df['idle_time_ratio'] = df['frac_stop'].fillna(0)
    df['idle_days'] = (df['duration_h'] * df['idle_time_ratio'] / 24).fillna(0)
    df['low_speed_days'] = (df['duration_h'] * df['frac_low_speed'] / 24).fillna(0)

    # 2. VELOCITY RISK SCORE (CR√çTICO)
    def velocity_risk(speed):
        if pd.isna(speed):
            return 2
        if speed < 5:
            return 3  # Alto risco
        elif speed < 10:
            return 2  # Risco moderado
        elif speed < 12:
            return 1  # Baixo-moderado
        else:
            return 0  # Baixo risco

    df['velocity_risk'] = df['speed_mean'].apply(velocity_risk)

    # 3. OPERATIONAL PROFILE
    df['operation_continuity'] = 1 - df['idle_time_ratio']

    # 4. LOW SHEAR ZONES EXPOSURE
    df['low_shear_exposure'] = df['idle_days'] * (df['velocity_risk'] + 1)

    # 5. BIOGEOGRAPHIC REGION RISK
    def get_biogeographic_region(lat):
        if pd.isna(lat):
            return 'Unknown'
        if lat > -5:
            return 'Norte'
        elif lat > -15:
            return 'Nordeste'
        else:
            return 'Sudeste-Sul'

    df['bio_region'] = df['lat_mean'].apply(get_biogeographic_region)
    region_risk = {'Norte': 3, 'Nordeste': 2, 'Sudeste-Sul': 1, 'Unknown': 1.5}
    df['region_risk'] = df['bio_region'].map(region_risk)

    # 6. TEMPERATURE PROXY
    df['temp_proxy'] = df['lat_mean'].abs().fillna(15)
    df['temp_risk'] = (15 - df['temp_proxy']).clip(0, 15) / 15

    # 7. SPEED VARIABILITY
    df['speed_variability'] = df['speed_std'] / (df['speed_mean'] + 1)

    print("Features avan√ßadas criadas!")
    return df

if not df_events_ais.empty:
    df_events_ais = create_advanced_features(df_events_ais)


Features avan√ßadas criadas!


## 5. PROCESSAR IWS E CRIAR TARGET

In [52]:
# @title

def process_iws_data(df_iws, df_events_ais):
    """Processa dados de inspe√ß√£o e calcula dias desde √∫ltima limpeza"""
    if df_iws.empty or df_events_ais.empty:
        return df_events_ais

    iw_cols = [c for c in df_iws.columns if 'data' in c.lower()]
    possible_ship_cols = [c for c in df_iws.columns if 'embarca' in c.lower() or 'navio' in c.lower()]

    if not iw_cols or not possible_ship_cols:
        print("Colunas de data/navio n√£o encontradas no IWS")
        return df_events_ais

    date_col = iw_cols[0]
    ship_col = possible_ship_cols[0]

    df_iws['date_iws'] = pd.to_datetime(df_iws[date_col], errors='coerce')
    df_iws['ship_iws'] = df_iws[ship_col].astype(str).str.lower().str.strip()

    median_interval = df_iws.groupby('ship_iws')['date_iws'].apply(
        lambda g: g.sort_values().diff().dt.days.median()
    ).rename('median_interval').reset_index()
    median_interval['median_interval'].fillna(180, inplace=True)

    def days_since_last_clean(row):
        s = str(row['shipName']).lower().strip()
        start = row['startGMTDate']
        if pd.isna(start):
            return np.nan, np.nan

        cleans = df_iws[(df_iws['ship_iws'] == s) & (df_iws['date_iws'] <= start)]
        if cleans.empty:
            return np.nan, np.nan

        last = cleans['date_iws'].max()
        median = median_interval[median_interval['ship_iws'] == s]['median_interval']
        median_val = median.values[0] if not median.empty else 180

        return (start - last).days, median_val

    days_med = []
    median_list = []

    for _, r in tqdm(df_events_ais.iterrows(), total=len(df_events_ais), desc="Calculando dias desde limpeza"):
        d, med = days_since_last_clean(r)
        days_med.append(d)
        median_list.append(med)

    df_events_ais['days_since_clean'] = days_med
    df_events_ais['median_interval'] = median_list

    return df_events_ais

def create_fouling_rating_target(df):
    """
    Cria target baseado em Fouling Rating IMO (0-4)

    Escala IMO MEPC.378(80):
    0: Sem bioincrusta√ß√£o
    1: Microincrusta√ß√£o (biofilme/limo)
    2: Macroincrusta√ß√£o leve (1-15%)
    3: Macroincrusta√ß√£o moderada (16-40%)
    4: Macroincrusta√ß√£o pesada (41-100%)
    """
    df = df.copy()

    def estimate_fouling_rating(row):
        days = row.get('days_since_clean', np.nan)
        velocity_risk = row.get('velocity_risk', 2)
        idle_ratio = row.get('idle_time_ratio', 0)
        temp_risk = row.get('temp_risk', 0.5)
        region_risk = row.get('region_risk', 1.5)

        if pd.isna(days):
            return np.nan

        # Base score por tempo
        if days < 14:
            base_score = 0.3
        elif days < 42:
            base_score = 1.2
        elif days < 90:
            base_score = 2.0
        elif days < 180:
            base_score = 3.0
        else:
            base_score = 3.8

        # Modificadores
        velocity_modifier = velocity_risk * 0.15
        idle_modifier = idle_ratio * 0.4
        temp_modifier = temp_risk * 0.2
        region_modifier = (region_risk - 1.5) * 0.15

        final_score = base_score + velocity_modifier + idle_modifier + temp_modifier + region_modifier

        return np.clip(final_score, 0, 4)

    df['fouling_rating'] = df.apply(estimate_fouling_rating, axis=1)

    # Criar est√°gios
    def get_fouling_stage(days):
        if pd.isna(days):
            return np.nan
        if days < 14:
            return 0
        elif days < 42:
            return 1
        elif days < 90:
            return 2
        else:
            return 3

    df['fouling_stage'] = df['days_since_clean'].apply(get_fouling_stage)

    # Labels categ√≥ricos
    def get_fouling_label(rating):
        if pd.isna(rating):
            return np.nan
        if rating < 1:
            return 'clean'
        elif rating < 2:
            return 'light'
        elif rating < 3:
            return 'moderate'
        else:
            return 'heavy'

    df['fouling_label'] = df['fouling_rating'].apply(get_fouling_label)

    # Risk score combinado
    df['biofouling_risk_score'] = (
        0.4 * (df['days_since_clean'].fillna(90) / 180).clip(0, 1) +
        0.25 * (df['velocity_risk'] / 3) +
        0.2 * df['idle_time_ratio'] +
        0.15 * df['temp_risk']
    ).clip(0, 1)

    print("Fouling Rating target criado!")
    print(f"Distribui√ß√£o:")
    print(df['fouling_rating'].describe())

    return df

print("\n Processando IWS e criando target...")
if not df_events_ais.empty:
    df_events_ais = process_iws_data(df_iws, df_events_ais)
    df_events_ais = create_fouling_rating_target(df_events_ais)



 Processando IWS e criando target...


Calculando dias desde limpeza: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8214/8214 [00:08<00:00, 983.11it/s] 


Fouling Rating target criado!
Distribui√ß√£o:
count    4639.000000
mean        3.635577
std         0.739599
min         0.225000
25%         3.725000
50%         4.000000
75%         4.000000
max         4.000000
Name: fouling_rating, dtype: float64


## 6. MERGE COM CONSUMO E NAVIOS

In [53]:
# @title

print("\nüîó Merging dados...")
if not df_events_ais.empty and 'sessionId' in df_consumo.columns:
    df_cons_sum = df_consumo.groupby('sessionId', as_index=False)['CONSUMED_QUANTITY'].sum()
    df_events_ais = df_events_ais.merge(df_cons_sum, on='sessionId', how='left')

if not df_events_ais.empty and not df_navios.empty:
    shipname_col = [c for c in df_navios.columns if 'nome' in c.lower() or 'name' in c.lower()]
    if shipname_col:
        snc = shipname_col[0]
        df_navios['ship_nav_low'] = df_navios[snc].astype(str).str.lower().str.strip()
        df_events_ais['ship_low'] = df_events_ais['shipName'].astype(str).str.lower().str.strip()
        df_events_ais = df_events_ais.merge(df_navios, left_on='ship_low', right_on='ship_nav_low', how='left')



üîó Merging dados...


## 7. PREPARAR DATASET ML

In [54]:
# @title

features_v2 = [
    'speed_mean', 'speed_std', 'speed_min', 'speed_max',
    'duration_h', 'distance',
    'frac_stop', 'frac_low_speed', 'idle_days', 'low_speed_days',
    'velocity_risk', 'operation_continuity', 'speed_variability',
    'low_shear_exposure', 'biofouling_risk_score',
    'beaufort', 'seaCondition', 'lat_mean', 'lon_mean',
    'temp_proxy', 'temp_risk', 'region_risk',
    'days_since_clean', 'fouling_stage',
    'displacement'
]

if 'CONSUMED_QUANTITY' in df_events_ais.columns:
    features_v2.append('CONSUMED_QUANTITY')

features_available = [f for f in features_v2 if f in df_events_ais.columns]

print(f"\nüìä Features dispon√≠veis: {len(features_available)}")

df_ml = df_events_ais.dropna(subset=['fouling_rating'])[features_available + ['fouling_rating', 'fouling_label', 'startGMTDate', 'shipName']].copy()
df_ml[features_available] = df_ml[features_available].fillna(0)

print(f"‚úÖ Dataset ML: {df_ml.shape}")



üìä Features dispon√≠veis: 26
‚úÖ Dataset ML: (4639, 30)


## 8.  VALIDA√á√ÉO TEMPORAL

In [55]:
# @title

print("\n‚è∞ Preparando valida√ß√£o temporal...")
df_ml_sorted = df_ml.sort_values('startGMTDate').reset_index(drop=True)

X = df_ml_sorted[features_available].values
y_reg = df_ml_sorted['fouling_rating'].values
y_clf = LabelEncoder().fit_transform(df_ml_sorted['fouling_label'].astype(str).values)

split_idx = int(len(df_ml_sorted) * 0.8)

X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y_reg[:split_idx]
y_test = y_reg[split_idx:]

print(f"‚úÖ Treino: {X_train.shape[0]} | Teste: {X_test.shape[0]}")



‚è∞ Preparando valida√ß√£o temporal...
‚úÖ Treino: 3711 | Teste: 928


## 9. üéØ MODELO ENSEMBLE

In [56]:
# @title

print("\n Treinando ensemble...")

models = {
    'XGBoost': xgb.XGBRegressor(
        n_estimators=300, learning_rate=0.03, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, random_state=42, verbosity=0
    ),
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=300, learning_rate=0.03, max_depth=6,
        random_state=42, verbosity=-1
    ),
    'RandomForest': RandomForestRegressor(
        n_estimators=200, max_depth=10, random_state=42, n_jobs=-1
    ),
    'GradientBoosting': GradientBoostingRegressor(
        n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42
    )
}

predictions = {}
model_scores = {}

for name, model in models.items():
    print(f"\nTreinando {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions[name] = y_pred

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    model_scores[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}
    print(f"  MAE: {mae:.4f} | RMSE: {rmse:.4f} | R¬≤: {r2:.4f}")

# Ensemble com pesos
maes = [model_scores[name]['MAE'] for name in models.keys()]
weights = [1/mae for mae in maes]
weights = [w/sum(weights) for w in weights]

y_pred_ensemble = sum(predictions[name] * weight for name, weight in zip(models.keys(), weights))

mae_ensemble = mean_absolute_error(y_test, y_pred_ensemble)
rmse_ensemble = np.sqrt(mean_squared_error(y_test, y_pred_ensemble))
r2_ensemble = r2_score(y_test, y_pred_ensemble)

print(f"\n{'='*60}")
print(" ENSEMBLE:")
print(f"  MAE: {mae_ensemble:.4f}")
print(f"  RMSE: {rmse_ensemble:.4f}")
print(f"  R¬≤: {r2_ensemble:.4f}")
print(f"{'='*60}")



 Treinando ensemble...

Treinando XGBoost...
  MAE: 0.0070 | RMSE: 0.0251 | R¬≤: 0.9992

Treinando LightGBM...
  MAE: 0.0072 | RMSE: 0.0222 | R¬≤: 0.9994

Treinando RandomForest...
  MAE: 0.0046 | RMSE: 0.0195 | R¬≤: 0.9995

Treinando GradientBoosting...
  MAE: 0.0042 | RMSE: 0.0152 | R¬≤: 0.9997

 ENSEMBLE:
  MAE: 0.0048
  RMSE: 0.0170
  R¬≤: 0.9996


## 10.  IMPACTO ECON√îMICO

In [57]:
# @title

def compute_fuel_penalty_from_fouling(fouling_rating, baseline_consumption):
    """Calcula penalidade baseado em IMO (5-25%)"""
    PRICE_PER_TON = 650
    CO2_PER_TON = 3.114

    penalty_map = {0: 0.00, 1: 0.065, 2: 0.10, 3: 0.15, 4: 0.215}

    if fouling_rating <= 0:
        penalty = 0
    elif fouling_rating >= 4:
        penalty = 0.25
    else:
        lower = int(fouling_rating)
        upper = min(lower + 1, 4)
        fraction = fouling_rating - lower
        penalty = penalty_map[lower] + fraction * (penalty_map[upper] - penalty_map[lower])

    extra_fuel = baseline_consumption * penalty

    return {
        'fouling_rating': fouling_rating,
        'fuel_penalty_pct': penalty * 100,
        'extra_fuel_tons_day': extra_fuel,
        'extra_cost_usd_day': extra_fuel * PRICE_PER_TON,
        'extra_cost_usd_month': extra_fuel * PRICE_PER_TON * 30,
        'extra_cost_usd_year': extra_fuel * PRICE_PER_TON * 365,
        'extra_co2_tons_year': extra_fuel * CO2_PER_TON * 365
    }

print("\n Calculando impacto econ√¥mico...")
baseline = 40
all_impacts = [compute_fuel_penalty_from_fouling(pred, baseline) for pred in y_pred_ensemble]
df_impacts = pd.DataFrame(all_impacts)

print(f"Custo Extra M√©dio/Dia: ${df_impacts['extra_cost_usd_day'].mean():,.2f}")
print(f"Custo Extra M√©dio/M√™s: ${df_impacts['extra_cost_usd_month'].mean():,.2f}")
print(f"Custo Extra M√©dio/Ano: ${df_impacts['extra_cost_usd_year'].mean():,.2f}")
print(f"CO2 Extra M√©dio/Ano: {df_impacts['extra_co2_tons_year'].mean():,.2f} tons")



 Calculando impacto econ√¥mico...
Custo Extra M√©dio/Dia: $5,145.31
Custo Extra M√©dio/M√™s: $154,359.24
Custo Extra M√©dio/Ano: $1,878,037.45
CO2 Extra M√©dio/Ano: 8,997.24 tons


## 11.  AN√ÅLISE DE CEN√ÅRIOS

In [58]:
# @title

def simulate_cleaning_scenarios(current_fouling, days_since_clean, baseline=40):
    """Simula cen√°rios de limpeza"""
    CLEANING_COST = 50000
    DOWNTIME_COST = 24 * 5000
    DAYS_AHEAD = 180

    scenarios = {}

    # Cen√°rio 1: N√£o fazer nada
    future_fouling = min(current_fouling + (DAYS_AHEAD / 90), 4.0)
    current_impact = compute_fuel_penalty_from_fouling(current_fouling, baseline)
    future_impact = compute_fuel_penalty_from_fouling(future_fouling, baseline)
    avg_cost = (current_impact['extra_cost_usd_day'] + future_impact['extra_cost_usd_day']) / 2

    scenarios['N√£o Fazer Nada'] = {
        'total_cost': avg_cost * DAYS_AHEAD,
        'final_fouling': future_fouling
    }

    # Cen√°rio 2: Limpar agora
    post_clean = 0.5
    future_clean = min(post_clean + (DAYS_AHEAD / 120), 2.5)
    post_impact = compute_fuel_penalty_from_fouling(post_clean, baseline)
    future_impact_clean = compute_fuel_penalty_from_fouling(future_clean, baseline)
    avg_cost_clean = (post_impact['extra_cost_usd_day'] + future_impact_clean['extra_cost_usd_day']) / 2

    scenarios['Limpar Agora'] = {
        'total_cost': CLEANING_COST + DOWNTIME_COST + (avg_cost_clean * DAYS_AHEAD),
        'final_fouling': future_clean
    }

    return scenarios

print("\n Simulando cen√°rios...")
example_fouling = y_pred_ensemble[0]
example_days = df_ml_sorted.iloc[split_idx]['days_since_clean']

scenarios = simulate_cleaning_scenarios(example_fouling, example_days)

print(f"\nFouling atual: {example_fouling:.2f}")
for name, data in scenarios.items():
    print(f"\n{name}:")
    print(f"  Custo total: ${data['total_cost']:,.2f}")
    print(f"  Fouling final: {data['final_fouling']:.2f}")

best = min(scenarios.items(), key=lambda x: x[1]['total_cost'])[0]
print(f"\n‚úÖ RECOMENDA√á√ÉO: {best}")



 Simulando cen√°rios...

Fouling atual: 4.00

N√£o Fazer Nada:
  Custo total: $1,088,091.23
  Fouling final: 4.00

Limpar Agora:
  Custo total: $480,050.00
  Fouling final: 2.00

‚úÖ RECOMENDA√á√ÉO: Limpar Agora


## 12. SALVAR MODELOS

In [59]:
# @title

print("\n Salvando modelos...")
for name, model in models.items():
    filename = f"model_{name.lower().replace(' ', '_')}_v2.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

metadata = {
    'features': features_available,
    'weights': dict(zip(models.keys(), weights)),
    'mae': mae_ensemble,
    'rmse': rmse_ensemble,
    'r2': r2_ensemble
}

with open('model_metadata_v2.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print(" Modelos salvos")



 Salvando modelos...
 Modelos salvos


In [60]:
# 13. RESUMO

In [61]:
# @title

print("\n" + "="*80)
print(" RESUMO SOLU√á√ÉO DE PREDI√á√ÉO")
print("="*80)

print(f"\n PERFORMANCE:")
print(f"  MAE:  {mae_ensemble:.4f}")
print(f"  RMSE: {rmse_ensemble:.4f}")
print(f"  R¬≤:   {r2_ensemble:.4f}")



 RESUMO SOLU√á√ÉO DE PREDI√á√ÉO

 PERFORMANCE:
  MAE:  0.0048
  RMSE: 0.0170
  R¬≤:   0.9996


## 13. FOULING RATING POR NAVIO (FROTA)

In [62]:
# @title

print("\n" + "="*80)
print("üö¢ FOULING RATING POR NAVIO DA FROTA")
print("="*80)

# Pegar √∫ltimo evento de cada navio
df_ml_sorted_final = df_ml_sorted.copy()

# Verificar se shipName existe, sen√£o usar √≠ndice
if 'shipName' not in df_ml_sorted_final.columns:
    # Adicionar shipName do df_ml original
    df_ml_sorted_final = df_ml_sorted_final.merge(
        df_ml[['startGMTDate', 'shipName']],
        on='startGMTDate',
        how='left'
    )

df_ml_sorted_final['shipName_clean'] = df_ml_sorted_final['shipName'].astype(str).str.strip()

# √öltimo evento de cada navio (mais recente)
ultimos_eventos = df_ml_sorted_final.groupby('shipName_clean').last().reset_index()

# Calcular impacto econ√¥mico para cada navio
resultados_frota = []

for _, navio_data in ultimos_eventos.iterrows():
    ship_name = navio_data['shipName_clean']
    fouling = navio_data['fouling_rating']
    days_clean = navio_data.get('days_since_clean', np.nan)

    # Calcular impacto
    baseline = 40  # tons/dia (ajustar se tiver dados espec√≠ficos)
    impacto = compute_fuel_penalty_from_fouling(fouling, baseline)

    # Classificar por escala IMO
    if fouling < 1:
        classificacao = "0-1: Sem/Micro"
        acao = "‚úÖ OK"
        urgencia = "üü¢"
    elif fouling < 2:
        classificacao = "1-2: Micro"
        acao = "üü° Limpeza Proativa"
        urgencia = "üü°"
    elif fouling < 3:
        classificacao = "2-3: Leve"
        acao = "üü† Limpeza Reativa"
        urgencia = "üü†"
    elif fouling < 4:
        classificacao = "3-4: Moderada"
        acao = "üî¥ Limpeza Urgente"
        urgencia = "üî¥"
    else:
        classificacao = "4: Pesada"
        acao = "üî¥ Limpeza CR√çTICA"
        urgencia = "üî¥"

    resultados_frota.append({
        'Navio': ship_name,
        'Fouling': round(fouling, 2),
        'Classifica√ß√£o': classificacao,
        'Dias Limpeza': int(days_clean) if not pd.isna(days_clean) else 'N/A',
        'Penalidade': f"{impacto['fuel_penalty_pct']:.1f}%",
        'Custo/M√™s': f"${impacto['extra_cost_usd_month']:,.0f}",
        'Custo/Ano': f"${impacto['extra_cost_usd_year']:,.0f}",
        'CO2/Ano': f"{impacto['extra_co2_tons_year']:,.0f}t",
        'A√ß√£o': acao,
        'Urg√™ncia': urgencia
    })

df_frota = pd.DataFrame(resultados_frota)

# Ordenar por Fouling Rating (maior primeiro)
df_frota = df_frota.sort_values('Fouling', ascending=False)

print("\n ESCALA IMO MEPC.378(80):")
print("  0: Sem bioincrusta√ß√£o")
print("  1: Microincrusta√ß√£o (biofilme/lodo/limo)")
print("  2: Macroincrusta√ß√£o leve (1-15% superf√≠cie)")
print("  3: Macroincrusta√ß√£o moderada (16-40% superf√≠cie)")
print("  4: Macroincrusta√ß√£o pesada (41-100% superf√≠cie)")

print("\n" + df_frota.to_string(index=False))

# Estat√≠sticas da frota
print("\n" + "="*80)
print(" ESTAT√çSTICAS DA FROTA")
print("="*80)

fouling_values = df_frota['Fouling'].values
print(f"\n Fouling Rating M√©dio da Frota: {fouling_values.mean():.2f}")
print(f" Fouling Rating M√≠nimo: {fouling_values.min():.2f}")
print(f" Fouling Rating M√°ximo: {fouling_values.max():.2f}")
print(f" Desvio Padr√£o: {fouling_values.std():.2f}")

# Distribui√ß√£o por categoria
clean_count = (fouling_values < 1).sum()
micro_count = ((fouling_values >= 1) & (fouling_values < 2)).sum()
leve_count = ((fouling_values >= 2) & (fouling_values < 3)).sum()
moderada_count = ((fouling_values >= 3) & (fouling_values < 4)).sum()
pesada_count = (fouling_values >= 4).sum()

total_navios = len(fouling_values)

print(f"\n Distribui√ß√£o por Categoria (Escala IMO):")
print(f"   üü¢ 0-1 (Sem/Micro):      {clean_count:2d} navios ({clean_count/total_navios*100:5.1f}%)")
print(f"   üü° 1-2 (Micro):          {micro_count:2d} navios ({micro_count/total_navios*100:5.1f}%)")
print(f"   üü† 2-3 (Leve):           {leve_count:2d} navios ({leve_count/total_navios*100:5.1f}%)")
print(f"   üî¥ 3-4 (Moderada):       {moderada_count:2d} navios ({moderada_count/total_navios*100:5.1f}%)")
print(f"   üî¥ 4   (Pesada):         {pesada_count:2d} navios ({pesada_count/total_navios*100:5.1f}%)")

# Prioriza√ß√£o de a√ß√µes
urgente_count = (fouling_values >= 3).sum()
monitorar_count = ((fouling_values >= 2) & (fouling_values < 3)).sum()
proativa_count = ((fouling_values >= 1) & (fouling_values < 2)).sum()

print(f"\n‚ö†Ô∏è A√á√ïES REQUERIDAS:")
print(f"   üî¥ {urgente_count} navios precisam LIMPEZA URGENTE (Fouling ‚â• 3.0)")
print(f"   üü† {monitorar_count} navios precisam LIMPEZA REATIVA (Fouling 2.0-3.0)")
print(f"   üü° {proativa_count} navios precisam LIMPEZA PROATIVA (Fouling 1.0-2.0)")

# Impacto econ√¥mico total
custo_ano_values = df_frota['Custo/Ano'].str.replace('$', '').str.replace(',', '').astype(float)
co2_ano_values = df_frota['CO2/Ano'].str.replace('t', '').str.replace(',', '').astype(float)

total_custo = custo_ano_values.sum()
total_co2 = co2_ano_values.sum()

print(f"\n IMPACTO ECON√îMICO TOTAL DA FROTA:")
print(f"   Custo Extra Total/Ano: ${total_custo:,.0f}")
print(f"   Custo Extra M√©dio/Navio: ${total_custo/total_navios:,.0f}")
print(f"   CO2 Extra Total/Ano: {total_co2:,.0f} toneladas")
print(f"   CO2 Extra M√©dio/Navio: {total_co2/total_navios:,.0f} toneladas")

# Salvar resultados detalhados
df_frota.to_csv('fouling_por_navio.csv', index=False)
print(f"\n Resultados salvos em: fouling_por_navio.csv")

print("\n" + "="*80)
print("SCRIPT CONCLU√çDO!")
print("="*80)




üö¢ FOULING RATING POR NAVIO DA FROTA

 ESCALA IMO MEPC.378(80):
  0: Sem bioincrusta√ß√£o
  1: Microincrusta√ß√£o (biofilme/lodo/limo)
  2: Macroincrusta√ß√£o leve (1-15% superf√≠cie)
  3: Macroincrusta√ß√£o moderada (16-40% superf√≠cie)
  4: Macroincrusta√ß√£o pesada (41-100% superf√≠cie)

            Navio  Fouling  Classifica√ß√£o  Dias Limpeza Penalidade Custo/M√™s  Custo/Ano CO2/Ano               A√ß√£o Urg√™ncia
       BRUNO LIMA     4.00      4: Pesada           565      25.0%  $195,000 $2,372,500 11,366t  üî¥ Limpeza CR√çTICA        üî¥
   DANIEL PEREIRA     4.00      4: Pesada          1739      25.0%  $195,000 $2,372,500 11,366t  üî¥ Limpeza CR√çTICA        üî¥
    EDUARDO COSTA     4.00      4: Pesada           316      25.0%  $195,000 $2,372,500 11,366t  üî¥ Limpeza CR√çTICA        üî¥
  VICTOR OLIVEIRA     4.00      4: Pesada          1726      25.0%  $195,000 $2,372,500 11,366t  üî¥ Limpeza CR√çTICA        üî¥
MARCOS CAVALCANTI     4.00      4: Pesada          