# GA-2 (Tiered) – Colab Notebook

> This notebook mirrors the original GA-2 code with minimal/no logic changes.
> If your data is on Google Drive, mount Drive and adjust the file path.


In [None]:
# GA-2 (Tiered) – Colab-friendly, minimal changes, includes full pipeline

# Imports (align with GA-1 to keep consistency)
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt

# Load the workover dataset (adjust the path if needed).
# Alternative kept for reference: read the workbook from a mounted Google Drive folder.
# DATASET_PATH = "/content/drive/MyDrive/THESIS/DATASET_THESIS/Variant-3_RIG-6_RS_Beka_2024-2025.xlsx"
# df = pd.read_excel(DATASET_PATH)
# Active source: a relative path, so the workbook must be in the current working directory.
# Columns read later in this file: SURFACE_LATITUDE, SURFACE_LONGITUDE, WELL_ALIAS, BOPD,
# START_DATETIME_JOB, END_DATETIME_JOB and MOVING_TIME.
df = pd.read_excel("Variant-4_RIG-24_WOWS_Mina_2022-2024.xlsx")

# --- Original GA-2 code starts here (kept intact) ---

def calculate_objective(route, matrix, bopd_list, duration_days, use_bopd=True,
                        speed_km_per_day=20):
    """Return the GA fitness of a visiting order (lower is better).

    Each well contributes its execution time: the travel time from the
    previous well in ``route`` plus its own job duration. The first well of
    the route has no travel leg.

    Args:
        route: Sequence of positional well indices in visiting order.
        matrix: Pairwise distances indexed as ``matrix[from_idx][to_idx]``.
        bopd_list: Production figure per well index; only read when
            ``use_bopd`` is true.
        duration_days: Job duration per well index, in days.
        use_bopd: When true, each well's time is divided by its production, so
            time spent on high producers is penalised less. A well with
            production <= 0 makes the whole score infinite. When false, the
            score is the plain sum of execution times.
        speed_km_per_day: Divisor that converts a distance from ``matrix``
            into travel days. Must be non-zero. Defaults to the value that
            used to be hard-coded here.

    Returns:
        The summed score over all wells; ``0`` for an empty route.
    """
    total_score = 0
    previous_idx = None
    for well_idx in route:
        # The first well is the starting point: no travel time is charged.
        if previous_idx is None:
            travel_time = 0
        else:
            travel_time = matrix[previous_idx][well_idx] / speed_km_per_day
        total_time = travel_time + duration_days[well_idx]  # total execution time

        if use_bopd:
            production = bopd_list[well_idx]
            # A non-producing well can never be scored by time-per-barrel.
            score = total_time / production if production > 0 else float('inf')
        else:
            score = total_time

        total_score += score
        previous_idx = well_idx
    return total_score

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula on a sphere of radius 6371 km. Built from
    NumPy ufuncs, so the returned value is a NumPy float for scalar input.
    """
    earth_radius_km = 6371.0
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2.0
    half_dlon = np.radians(lon2 - lon1) / 2.0

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dlat)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_km * central_angle  # distance in km

def create_distance_matrix(locations):
    """Build the square matrix of pairwise haversine distances (km).

    ``locations`` is a sequence of ``(latitude, longitude)`` pairs in degrees.
    Entry ``[i][j]`` is the distance from location ``i`` to location ``j``;
    the diagonal stays at zero.
    """
    count = len(locations)
    distances = np.zeros((count, count))
    for row, (row_lat, row_lon) in enumerate(locations):
        for col, (col_lat, col_lon) in enumerate(locations):
            if row == col:
                # Distance from a well to itself stays 0.
                continue
            distances[row][col] = haversine(row_lat, row_lon, col_lat, col_lon)
    return distances  # in km

# Drop rows with incomplete data (missing coordinates or well alias)
df_clean = df.dropna(subset=['SURFACE_LATITUDE', 'SURFACE_LONGITUDE', 'WELL_ALIAS'])

# Prepare the cleaned inputs: one (lat, lon) pair and one alias per remaining row
locations = list(zip(df_clean['SURFACE_LATITUDE'], df_clean['SURFACE_LONGITUDE']))
well_ids = df_clean['WELL_ALIAS'].values
distance_matrix = create_distance_matrix(locations)
# Label both axes with the well aliases so the matrix is readable
df_matrix = pd.DataFrame(distance_matrix, index=well_ids,
                         columns=well_ids)
# Show the top-left 6×6 corner of the matrix, rounded to 2 decimals
print(df_matrix.round(2).iloc[:6, :6])

# Heatmap of the full well-to-well distance matrix
plt.figure(figsize=(6, 5))
sns.heatmap(df_matrix, cmap='viridis', linewidths=0.5)
plt.title("Heatmap Matriks Jarak antar Sumur")
plt.xlabel('WELL_ALIAS')
plt.ylabel('WELL_ALIAS')
plt.show()

def create_route(n):
    """Return a random visiting order over ``n`` wells that starts at index 0.

    Index 0 is the fixed starting well (the highest-BOPD well when the caller
    has sorted its data by descending BOPD); indices ``1..n-1`` follow in
    random order.

    Args:
        n: Number of wells.

    Returns:
        A list containing every index in ``range(n)`` exactly once, with 0
        first. An empty list when ``n <= 0``, because there is no well 0 to
        start from.
    """
    if n <= 0:
        return []
    route = list(range(1, n))  # every index except 0
    random.shuffle(route)
    return [0] + route

def crossover(p1, p2):
    """Order-preserving crossover of two routes over the same set of wells.

    A random slice ``[start, end)`` is copied from ``p1`` at the same
    positions; the remaining positions are filled left to right with the
    wells of ``p2`` that are not in that slice, in ``p2``'s order.

    Args:
        p1: First parent route (sequence of hashable well indices).
        p2: Second parent route; expected to be a permutation of ``p1``.

    Returns:
        A new list; neither parent is modified. Routes with fewer than two
        wells have no cut points, so a copy of ``p1`` is returned.
    """
    size = len(p1)
    if size < 2:
        # random.sample cannot draw two distinct cut points here.
        return list(p1)

    # Cut points
    start, end = sorted(random.sample(range(size), 2))
    inherited = list(p1[start:end])
    taken = set(inherited)  # O(1) membership instead of scanning the child list
    fill = [gene for gene in p2 if gene not in taken]

    # Positions before the slice take the first `start` fill genes, positions
    # after it take the rest.
    return fill[:start] + inherited + fill[start:]

def mutate(route, rate=0.02):
    """Swap-mutate ``route`` in place and return it.

    Every position is considered once; with probability ``rate`` it is
    swapped with a uniformly chosen position (which may be itself). No
    position is protected, so the first well can be moved too.
    """
    last = len(route) - 1
    for pos in range(len(route)):
        if not random.random() < rate:
            continue
        # Pick the swap partner at random
        partner = random.randint(0, last)
        # Exchange the two positions
        route[pos], route[partner] = route[partner], route[pos]
    return route

def genetic_algorithm(matrix, bopd_list, duration_days, generations=200, pop_size=300, use_bopd=True,
                      elite_size=5, parent_pool_size=20, mutation_rate=0.02):
    """Search for a low-cost visiting order with a simple elitist GA.

    Each generation the population is sorted by fitness, the best
    ``elite_size`` routes are carried over unchanged, and the rest of the next
    generation is bred by crossover plus mutation from parents sampled among
    the best ``parent_pool_size`` routes.

    Args:
        matrix: Square pairwise distance matrix; its length is the number of
            wells.
        bopd_list: Production per well index.
        duration_days: Job duration per well index.
        generations: Number of generations to run.
        pop_size: Number of routes per generation.
        use_bopd: Passed through to ``calculate_objective``.
        elite_size: Routes copied unchanged into the next generation.
        parent_pool_size: How many of the best routes may be picked as
            parents. At least two routes must be available, otherwise
            ``random.sample`` raises ``ValueError``.
        mutation_rate: Per-position swap probability passed to ``mutate``.

    Returns:
        ``(best_route, best_score, best_scores)``. ``best_scores`` holds the
        best fitness recorded at the start of each generation, before
        breeding. For fewer than two wells there is nothing to optimise: the
        only possible route is returned and its score is repeated
        ``generations`` times.
    """
    def fitness(route):
        return calculate_objective(route, matrix, bopd_list, duration_days, use_bopd)

    n_wells = len(matrix)
    if n_wells < 2:
        # Crossover needs two cut points, so handle 0 or 1 wells directly.
        only_route = list(range(n_wells))
        only_score = fitness(only_route) if only_route else 0
        return only_route, only_score, [only_score] * generations

    best_scores = []
    population = [create_route(n_wells) for _ in range(pop_size)]
    for _ in range(generations):
        # Rank the population by fitness (best first)
        population.sort(key=fitness)
        best_scores.append(fitness(population[0]))
        # Elitism: keep the best routes as they are
        next_gen = population[:elite_size]
        parent_pool = population[:parent_pool_size]
        while len(next_gen) < pop_size:
            parent_a, parent_b = random.sample(parent_pool, 2)  # parent selection
            # crossover returns a fresh list, so mutating it cannot touch the elites
            child = mutate(crossover(parent_a, parent_b), rate=mutation_rate)
            next_gen.append(child)
        population = next_gen

    # The last generation was bred but not yet ranked, so pick its best explicitly.
    best_route = min(population, key=fitness)
    best_score = fitness(best_route)

    return best_route, best_score, best_scores

# Inputs for the GA

# Validation: keep only wells with positive BOPD and both surface coordinates present
df_valid = df[(df['BOPD'] > 0) & df['SURFACE_LATITUDE'].notnull() & df['SURFACE_LONGITUDE'].notnull()].copy()

# Add a small random offset (at most 0.0001 degrees per axis) to wells that share
# identical coordinates, to separate them from one another
duplicate_mask = df_valid.duplicated(subset=['SURFACE_LATITUDE', 'SURFACE_LONGITUDE'], keep=False)
n_duplicates = duplicate_mask.sum()
df_valid.loc[duplicate_mask, 'SURFACE_LATITUDE'] += np.random.uniform(-0.0001, 0.0001, size=n_duplicates)
df_valid.loc[duplicate_mask, 'SURFACE_LONGITUDE'] += np.random.uniform(-0.0001, 0.0001, size=n_duplicates)

# Job duration in days for the GA: (end - start) minus MOVING_TIME (read as days).
# Rows where this cannot be computed fall back to 2 days.
df_valid['duration_days'] = (
    (df_valid['END_DATETIME_JOB'] - df_valid['START_DATETIME_JOB'] - pd.to_timedelta(df_valid['MOVING_TIME'], unit='D'))
    .dt.total_seconds() / 86400
).fillna(2)

bopd_list = df_valid['BOPD'].tolist()  # BOPD production per well
duration_days = df_valid['duration_days'].tolist()  # job duration per well
locations = list(zip(df_valid['SURFACE_LATITUDE'], df_valid['SURFACE_LONGITUDE']))  # coordinates

# Tiering based on BOPD
# Compute the quartile thresholds from the BOPD values of the current dataset
q1 = df_valid['BOPD'].quantile(0.25)
q3 = df_valid['BOPD'].quantile(0.75)

# Tier classification with adjustable thresholds
def assign_tier(bopd, lower=None, upper=None):
    """Classify a BOPD value into 'Tier1', 'Tier2' or 'Tier3'.

    Args:
        bopd: Production value to classify.
        lower: Tier 2/3 boundary. Defaults to the module-level ``q1``.
        upper: Tier 1/2 boundary. Defaults to the module-level ``q3``.

    Returns:
        'Tier1' when ``bopd > upper``, 'Tier2' when ``lower < bopd <= upper``,
        otherwise 'Tier3'. Boundary values fall into the lower tier.
    """
    # Defaults are resolved at call time, so they follow the current q1/q3.
    lower = q1 if lower is None else lower
    upper = q3 if upper is None else upper
    if bopd > upper:
        return 'Tier1'
    if bopd > lower:
        return 'Tier2'
    return 'Tier3'

# Build the TIER column from the current BOPD values
df_valid['TIER'] = df_valid['BOPD'].apply(assign_tier)

# Check the number of wells per tier
print(df_valid['TIER'].value_counts())

# Tier Processing
# Tier 1 is not optimised with the GA; it is simply ordered by descending BOPD
df_t1 = df_valid[df_valid['TIER'] == 'Tier1'].sort_values(by='BOPD', ascending=False)

# Tier 2: sorted by descending BOPD and re-indexed, so positional index 0
# (the fixed start of every initial GA route) is the highest producer of the tier
df_t2 = df_valid[df_valid['TIER'] == 'Tier2'].sort_values(by='BOPD', ascending=False).reset_index(drop=True)
locs_t2 = list(zip(df_t2['SURFACE_LATITUDE'], df_t2['SURFACE_LONGITUDE']))
matrix_t2 = create_distance_matrix(locs_t2)
bopd_t2 = df_t2['BOPD'].tolist()
durasi_t2 = df_t2['duration_days'].tolist()

# Genetic Algorithm Tier 2 (use_bopd=True: execution time is weighed against production)
route_t2, score_t2, scores_t2 = genetic_algorithm(matrix_t2, bopd_t2, durasi_t2, use_bopd=True)
# Reorder the Tier 2 rows into the positional order chosen by the GA
df_ga_t2 = df_t2.iloc[route_t2]

# Tier 3: kept in dataset order (no BOPD sort), only re-indexed for positional access
df_t3 = df_valid[df_valid['TIER'] == 'Tier3'].reset_index(drop=True)
locs_t3 = list(zip(df_t3['SURFACE_LATITUDE'], df_t3['SURFACE_LONGITUDE']))
matrix_t3 = create_distance_matrix(locs_t3)
bopd_t3 = df_t3['BOPD'].tolist()
durasi_t3 = df_t3['duration_days'].tolist()

# Genetic Algorithm Tier 3 (use_bopd=False: fitness is total execution time only)
route_t3, score_t3, scores_t3 = genetic_algorithm(matrix_t3, bopd_t3, durasi_t3, use_bopd=False)
# Reorder the Tier 3 rows into the positional order chosen by the GA
df_ga_t3 = df_t3.iloc[route_t3]

# Convergence prints
# scores_*[0] is the best fitness of the initial random population; scores_*[-1] is the
# best fitness recorded at the start of the last generation (before its offspring are bred).
initial_score_t2 = scores_t2[0]
final_score_t2 = scores_t2[-1]
print(f"Score Awal Tier 2 (Generasi ke-0): {initial_score_t2:.2f}")
print(f"Score Akhir Tier 2 (Generasi ke-{len(scores_t2)-1}): {final_score_t2:.2f}")

# Same report for Tier 3 (its scores are plain total times because use_bopd=False)
initial_score_t3 = scores_t3[0]
final_score_t3 = scores_t3[-1]
print(f"Score Awal Tier 3 (Generasi ke-0): {initial_score_t3:.2f}")
print(f"Score Akhir Tier 3 (Generasi ke-{len(scores_t3)-1}): {final_score_t3:.2f}")

# Combine final route (Tier1 + GA Tier2 + GA Tier3) and simulate timing
from math import radians, sin, cos, sqrt, atan2

# Final schedule: Tier 1 (BOPD order), then the GA order of Tier 2, then the GA order of Tier 3.
# ignore_index=True already renumbers the rows 0..n-1, so the trailing reset_index changes nothing.
df_ga_tuning = pd.concat([df_t1, df_ga_t2, df_ga_t3], ignore_index=True).reset_index(drop=True)

# Simulation start: the earliest recorded job start among the valid wells.
t0 = pd.to_datetime(df_valid['START_DATETIME_JOB'].min())

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Scalar version built on the ``math`` module, using a sphere of radius
    6371 km.
    """
    earth_radius_km = 6371.0
    half_dlat = radians(lat2 - lat1) / 2
    half_dlon = radians(lon2 - lon1) / 2
    phi1, phi2 = radians(lat1), radians(lat2)
    # Haversine of the central angle between the two points.
    hav = sin(half_dlat)**2 + cos(phi1) * cos(phi2) * sin(half_dlon)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))
    return earth_radius_km * central_angle

# Rig moving speed used to turn distances into travel days
# (the same 20 km/day the GA objective uses by default).
speed_km_per_day = 20
# Pull the coordinates out once instead of doing a per-row iloc lookup in a loop.
coords = list(zip(df_ga_tuning['SURFACE_LATITUDE'], df_ga_tuning['SURFACE_LONGITUDE']))
# The first well has no travel leg; every later well travels from its predecessor.
travel_times = [0] + [
    haversine_km(lat1, lon1, lat2, lon2) / speed_km_per_day
    for (lat1, lon1), (lat2, lon2) in zip(coords, coords[1:])
]

df_ga_tuning['travel_time'] = travel_times
# Execution time per well = job duration + travel from the previous well
df_ga_tuning['total_time'] = df_ga_tuning['duration_days'] + df_ga_tuning['travel_time']
# Running total of days elapsed once each well is finished
df_ga_tuning['cum_time'] = df_ga_tuning['total_time'].cumsum()
# Simulated date for each well: the start date plus the elapsed days
df_ga_tuning['executed_date'] = t0 + pd.to_timedelta(df_ga_tuning['cum_time'], unit='D')
df_ga_tuning['executed_month'] = df_ga_tuning['executed_date'].dt.to_period('M')
df_ga_tuning['executed_year'] = df_ga_tuning['executed_date'].dt.year

print("Urutan rute hasil kombinasi tier:")
# str.join only accepts strings; WELL_ALIAS is never null-filtered in df_valid,
# so cast first to avoid a TypeError on missing or numeric aliases.
print(" → ".join(df_ga_tuning['WELL_ALIAS'].astype(str).tolist()))
print("Travel time max:", df_ga_tuning['travel_time'].max(), "days")
print("Total waktu eksekusi:", df_ga_tuning['cum_time'].iloc[-1], "days")

# Simple year-wise comparison plots (jobs & production)
import warnings
from pandas.errors import SettingWithCopyWarning
# Silence pandas' SettingWithCopyWarning for the rest of the run
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

# Every calendar year that appears in either the simulated GA schedule or the recorded job starts
years = sorted(set(df_ga_tuning['executed_year'].unique()) | set(df['START_DATETIME_JOB'].dt.year.unique()))

for year in years:
    # Baseline = jobs as recorded in the raw dataset; GA = jobs as scheduled by the simulation
    baseline = df[df['START_DATETIME_JOB'].dt.year == year].copy()
    ga_year = df_ga_tuning[df_ga_tuning['executed_year'] == year].copy()

    # Month_Label is the name shown on the axis; Month_Sort (YYYY-MM) keeps months chronological
    baseline['Month_Label'] = baseline['START_DATETIME_JOB'].dt.strftime('%B')
    ga_year['Month_Label'] = ga_year['executed_date'].dt.strftime('%B')
    baseline['Month_Sort'] = baseline['START_DATETIME_JOB'].dt.to_period('M').astype(str)
    ga_year['Month_Sort'] = ga_year['executed_date'].dt.to_period('M').astype(str)

    # --- Jobs executed per month ---
    jobs_b = baseline.groupby(['Month_Sort', 'Month_Label']).size().reset_index(name='Baseline Jobs')
    jobs_g = ga_year.groupby(['Month_Sort', 'Month_Label']).size().reset_index(name='GA Jobs')
    # Outer merge keeps months present on one side only; the missing side counts as 0
    jobs_compare = pd.merge(jobs_b, jobs_g, on=['Month_Sort', 'Month_Label'], how='outer').fillna(0).sort_values('Month_Sort')

    avg_jobs_baseline = jobs_compare['Baseline Jobs'].mean()
    avg_jobs_ga = jobs_compare['GA Jobs'].mean()
    total_jobs_baseline = jobs_compare['Baseline Jobs'].sum()
    total_jobs_ga = jobs_compare['GA Jobs'].sum()

    ax = jobs_compare.plot(x='Month_Label', kind='bar', stacked=False, figsize=(14,6),
                           color=['steelblue','green'])

    plt.axhline(y=avg_jobs_baseline, color='blue', linestyle='--', linewidth=2,
                label=f'Avg Baseline : {avg_jobs_baseline:.1f} jobs/month')
    plt.axhline(y=avg_jobs_ga, color='darkmagenta', linestyle='--', linewidth=2,
                label=f'Avg GA : {avg_jobs_ga:.1f} jobs/month')

    # Empty proxy lines only add the totals to the legend; their colours match the bars
    plt.plot([], [], color='steelblue', label=f'Baseline (Total: {int(total_jobs_baseline)} jobs)')
    plt.plot([], [], color='green', label=f'GA (Total: {int(total_jobs_ga)} jobs)')

    plt.title('# Execution Rate per Month: Baseline vs Genetic Algorithm (GA-2)')
    plt.ylabel('# Jobs Executed')
    plt.xlabel('Execution Date')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # --- Summed BOPD per month ---
    prod_b = baseline.groupby(['Month_Sort', 'Month_Label'])['BOPD'].sum().reset_index(name='Baseline Prod')
    prod_g = ga_year.groupby(['Month_Sort', 'Month_Label'])['BOPD'].sum().reset_index(name='GA Prod')
    # `on` must be passed as a keyword; calling on([...]) raised NameError
    prod_compare = pd.merge(prod_b, prod_g, on=['Month_Sort', 'Month_Label'], how='outer').fillna(0).sort_values('Month_Sort')

    avg_prod_baseline = prod_compare['Baseline Prod'].mean()
    avg_prod_ga = prod_compare['GA Prod'].mean()
    total_prod_baseline = prod_compare['Baseline Prod'].sum()
    total_prod_ga = prod_compare['GA Prod'].sum()

    plt.figure(figsize=(14,6))
    plt.plot(prod_compare['Month_Label'], prod_compare['Baseline Prod'], marker='o', color='steelblue',
             label=f'Baseline (Total: {int(total_prod_baseline)} BOPD)')
    plt.plot(prod_compare['Month_Label'], prod_compare['GA Prod'], marker='o', color='green',
             label=f'GA (Total: {int(total_prod_ga)} BOPD)')

    plt.axhline(y=avg_prod_baseline, color='blue', linestyle='--', linewidth=2,
                label=f'Avg Baseline Prod : {avg_prod_baseline:.1f} BOPD')
    plt.axhline(y=avg_prod_ga, color='darkmagenta', linestyle='--', linewidth=2,
                label=f'Avg GA Prod : {avg_prod_ga:.1f} BOPD')

    plt.title('Total Oil Recovery per Month: Baseline vs Genetic Algorithm (GA-2)')
    plt.ylabel('Total Oil BOPD')
    plt.xlabel('Execution Date')
    plt.grid(True)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
