In [None]:
import numpy as np
import pandas as pd

# Notebook-wide pandas display settings: show up to 50 columns and allow
# 140 characters of output width before wrapping.
for _option, _value in (("display.max_columns", 50), ("display.width", 140)):
    pd.set_option(_option, _value)

In [None]:
# Load the merged Factorio log export into `df0`, the untouched raw frame that
# the following cells derive everything from.
path = r"merged_factorio.csv"  # works when the CSV sits in the notebook's working directory
# path = r"C:\path\to\merged_factorio.csv"  # use a full path if needed

df0 = pd.read_csv(path)
print("raw shape:", df0.shape)
df0.head(3)

In [None]:
# Collapse the raw log to exactly one row per tick.
#
# GroupBy.last() works column by column and skips NA, so for rows that share a
# tick the result holds, per column, the last non-null value seen in that tick.
# That makes the within-tick row order significant: the sort must be stable,
# otherwise the default quicksort may shuffle rows that share a tick and change
# which value ends up being "last". With kind="stable" the original file order
# is preserved inside every tick.
df = (
    df0.sort_values("tick", kind="stable")
      .groupby("tick", as_index=False)
      .last()
      .copy()
)

print("rows after dedup:", len(df))
print("tick min:", df["tick"].min(), "tick max:", df["tick"].max())
print("columns:", len(df.columns))
# Preview: tick plus whichever of the optional context columns are present.
df[["tick"] + [c for c in ["time_s", "logsim_power", "pollution"] if c in df.columns]].head(5)

In [None]:
# Columns whose name ends in "_count" are treated as the counter columns.
count_cols = [name for name in df.columns if name.endswith("_count")]
print("count cols:", len(count_cols))

# Tick of the first row (in current row order) where at least one counter is
# non-null; stays None when there are no counter columns or no counter values.
first_valid_tick = None
if count_cols:
    any_count_nonnull = df[count_cols].notna().any(axis=1)
    ticks_with_counts = df.loc[any_count_nonnull, "tick"]
    if not ticks_with_counts.empty:
        first_valid_tick = int(ticks_with_counts.iloc[0])

print("first_valid_tick:", first_valid_tick)

# Without any counter data, keep everything from the smallest tick onward.
if first_valid_tick is None:
    core_start = int(df["tick"].min())
else:
    core_start = first_valid_tick

# Drop the rows that precede the first counter observation.
df = df.loc[df["tick"].ge(core_start)].copy()

print("core rows:", len(df), "core_start:", core_start)
df.head(3)

In [None]:
# Turn the "_count" columns into per-row rates plus 0/1 availability flags.
count_cols = [name for name in df.columns if name.endswith("_count")]

# Carry the last seen counter value forward; counters with no value yet become 0.
filled = df[count_cols].ffill().fillna(0)

# Row-to-row counter difference divided by df["delta_min"].
# NOTE(review): "delta_min" is not created in the cells shown here; presumably it
# is the elapsed minutes since the previous row and ships with the CSV - confirm.
# Infinite results (division by zero) and the undefined first-row diff become 0.
per_min = (
    filled.diff()
    .div(df["delta_min"], axis=0)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
per_min.columns = [name.replace("_count", "_per_min") for name in count_cols]

# 1 where the raw counter was actually present in the row, 0 where it was missing.
avail = df[count_cols].notna().astype(int)
avail.columns = [name.replace("_count", "__avail") for name in count_cols]

df = pd.concat([df, avail, per_min], axis=1).copy()

print("per_min cols:", per_min.shape[1])
df[["tick", *per_min.columns[:8]]].head(10)

In [None]:
# Machine-level columns start with "M": "<id>_per_min" holds the rate and
# "<id>_item" holds the item name recorded for that machine in the same row.
per_min_cols = [col for col in df.columns if col.startswith("M") and col.endswith("_per_min")]
item_cols = [col for col in df.columns if col.startswith("M") and col.endswith("_item")]

print("per_min machine cols:", len(per_min_cols))
print("item name cols:", len(item_cols))

# The machine id is the text before the first underscore,
# e.g. M01_item = "Gear" pairs with M01_per_min = its production rate.
machine_ids = sorted(set(col.split("_", 1)[0] for col in per_min_cols))
print("machines found:", len(machine_ids), "sample:", machine_ids[:10])

# Accumulate one "<item>_total_per_min" series per item name, summed over machines.
items_total = {}

for mid in machine_ids:
    item_col, rate_col = f"{mid}_item", f"{mid}_per_min"
    if not (item_col in df.columns and rate_col in df.columns):
        continue

    # Missing item names are grouped under "UNKNOWN"; missing rates count as 0.
    item_names = df[item_col].fillna("UNKNOWN")
    rates = df[rate_col].fillna(0)

    # For every item this machine was ever labelled with, add the machine's rate
    # on the rows carrying that label and 0 on all other rows.
    for it in item_names.unique():
        key = f"{it}_total_per_min"
        contribution = rates.where(item_names == it, 0.0)
        items_total[key] = items_total[key] + contribution if key in items_total else contribution

items_df = pd.DataFrame(items_total)
df = pd.concat([df, items_df], axis=1).copy()

print("item_total columns:", items_df.shape[1])
df[["tick", *items_df.columns[:10]]].head(5)

In [None]:
# Select the per-item rate columns by their "_total_per_min" suffix.
# The aggregate column "throughput_total_per_min" that this notebook adds to `df`
# further down carries the same suffix; exclude it explicitly so that re-running
# this cell after the feature cell does not treat the total as an item (which
# would double-count throughput and distort max/std/idxmax).
item_rate_cols = [
    c for c in df.columns
    if c.endswith("_total_per_min") and c != "throughput_total_per_min"
]
print("item_rate_cols:", len(item_rate_cols))

# Some columns may only represent the "UNKNOWN" item; they are kept on purpose.
items = df[item_rate_cols].fillna(0)
# Summary statistics per item, highest mean rate first.
items.describe().T.sort_values("mean", ascending=False).head(15)

In [None]:
# Row-level features derived from the per-item rate matrix `items`.
w = 5  # rolling window (5 samples)

# "logsim_power" is optional; every use below goes through this flag.
has_power = "logsim_power" in df.columns

new = pd.DataFrame(index=df.index)
new["throughput_total_per_min"] = items.sum(axis=1)

# Zero power becomes NaN so the ratio below yields NaN instead of +/-inf.
power = df["logsim_power"].replace(0, np.nan) if has_power else np.nan

new["efficiency"] = new["throughput_total_per_min"] / power

# Moving averages over the last `w` rows; min_periods=1 keeps the first rows defined.
new["throughput_ma"] = new["throughput_total_per_min"].rolling(w, min_periods=1).mean()
new["power_ma"] = df["logsim_power"].rolling(w, min_periods=1).mean() if has_power else np.nan
new["eff_ma"] = new["efficiency"].rolling(w, min_periods=1).mean()

# Concentration measures: largest item rate, its share of the total (NaN when the
# total is 0), and the spread of rates across items.
new["max_item_rate"] = items.max(axis=1)
new["share_max"] = new["max_item_rate"] / new["throughput_total_per_min"].replace(0, np.nan)
new["imbalance_std"] = items.std(axis=1)

df = pd.concat([df, new], axis=1).copy()

# Preview only columns that exist: selecting "logsim_power" unconditionally would
# raise KeyError in exactly the no-power case handled above.
preview_cols = ["tick", "logsim_power", "throughput_total_per_min", "efficiency", "share_max", "imbalance_std"]
df[[c for c in preview_cols if c in df.columns]].head(10)

In [None]:
# Item with the highest rate in each row, and that rate.
# idxmax returns the first column label when all values tie, so a row in which
# every item rate is 0 would be assigned an arbitrary "dominant" item and skew
# any later per-item counts. Rows without a positive rate get NaN instead.
dominant_rate = items.max(axis=1)
df["dominant_item"] = items.idxmax(axis=1).where(dominant_rate > 0)
df["dominant_rate"] = dominant_rate

df[["tick","throughput_total_per_min","dominant_item","dominant_rate"]].head(15)

In [None]:
# Flag candidate bottleneck rows and compute a rank-based score.

# z-score of the smoothed efficiency (population std, infinities ignored).
eff = df["eff_ma"].replace([np.inf, -np.inf], np.nan)
eff_mean = eff.mean()
eff_std = eff.std(ddof=0)

# Divide by 1.0 when the std is zero or NaN so the z-score stays finite.
df["eff_z"] = (eff - eff_mean) / (eff_std if eff_std and eff_std > 0 else 1.0)

# Rule 1: efficiency more than 2 std below the mean.
df["bn_low_eff"] = df["eff_z"] < -2
# Rule 2: a single item accounts for more than 60% of total throughput.
df["bn_high_imbalance"] = df["share_max"] > 0.60

# score: power high, throughput low, imbalance high
# "power_ma" exists even when no power data was logged (then it is all NaN), so a
# column-presence check alone never reaches the fallback and the NaN ranks would
# turn every score into NaN. Use the neutral rank 0 whenever there is no usable
# power value.
if "power_ma" in df.columns and df["power_ma"].notna().any():
    p_rank = df["power_ma"].rank(pct=True)
else:
    p_rank = 0
t_rank = df["throughput_ma"].rank(pct=True)
i_rank = df["imbalance_std"].rank(pct=True)

df["bottleneck_score"] = (p_rank - t_rank + i_rank)

bn = df[df["bn_low_eff"] | df["bn_high_imbalance"]].copy()
print("bottleneck rows:", len(bn))

# Preview only columns that exist ("logsim_power" is optional in this notebook).
bn_preview_cols = ["tick","logsim_power","throughput_total_per_min","eff_ma","eff_z","share_max","imbalance_std","dominant_item","dominant_rate","bottleneck_score"]
bn[[c for c in bn_preview_cols if c in bn.columns]].head(25)

In [None]:
# The 20 flagged rows with the highest bottleneck score.
top_bn = bn.sort_values("bottleneck_score", ascending=False).head(20)
# "logsim_power" is optional elsewhere in this notebook, so show only the
# columns that are actually present instead of failing with KeyError.
top_bn_cols = ["tick","logsim_power","throughput_total_per_min","eff_ma","share_max","imbalance_std","dominant_item","dominant_rate","bottleneck_score"]
top_bn[[c for c in top_bn_cols if c in top_bn.columns]]

In [None]:
# How often each item is the dominant one among flagged rows (15 most frequent).
bn["dominant_item"].value_counts().iloc[:15]

In [None]:
# Baseline: rows flagged by neither bottleneck rule.
normal = df[~df["bn_low_eff"] & ~df["bn_high_imbalance"]].copy()

# Share of rows in which each item is dominant, within each group.
bn_items = bn["dominant_item"].value_counts(normalize=True)
normal_items = normal["dominant_item"].value_counts(normalize=True)

# Side-by-side shares; an item absent from one group gets share 0 there.
compare = (
    pd.concat([bn_items, normal_items], axis=1)
    .set_axis(["bottleneck_share", "normal_share"], axis=1)
    .fillna(0)
)
# Lift = bottleneck share / normal share; the 1e-9 terms avoid division by zero.
compare["lift"] = (compare["bottleneck_share"] + 1e-9) / (compare["normal_share"] + 1e-9)

compare.sort_values("lift", ascending=False).head(20)

In [None]:
# Mean dominant rate per item during bottleneck rows, highest first.
dom_mean_bn = bn.groupby("dominant_item")["dominant_rate"].mean().sort_values(ascending=False)

# Mean dominant rate of the same items during normal rows.
dom_mean_normal = normal.groupby("dominant_item")["dominant_rate"].mean()

# One row per item; an item missing from one group gets a mean rate of 0 there.
rec = (
    pd.concat([dom_mean_bn, dom_mean_normal], axis=1)
    .set_axis(["mean_rate_bn", "mean_rate_normal"], axis=1)
    .fillna(0)
)

# Suggestion: percentage by which the bottleneck-time rate exceeds the normal
# rate. Left as NaN when the item has no positive normal-time rate to compare to.
baseline = rec["mean_rate_normal"]
rec["suggested_capacity_increase_pct"] = (
    100 * (rec["mean_rate_bn"] - baseline) / baseline
).where(baseline > 0)

rec.sort_values("suggested_capacity_increase_pct", ascending=False).head(20)