In [None]:
import tarfile

# Extract the tar.gz file.
# The context manager closes the archive even when extraction raises, which the
# previous manual open()/close() pair did not guarantee.
# NOTE(review): extractall() trusts the member paths stored in the archive; for
# archives from an untrusted source, pass an extraction filter on Python
# versions that support it.
with tarfile.open('batch_instance.tar.gz', 'r:gz') as tar:
    tar.extractall(path='extracted_batch_instance')


In [None]:
import tarfile

# Open the tar.gz file and extract roughly the first 20% of its members.
with tarfile.open(r'D:\AliBaba\batch_instance.tar.gz', 'r:gz') as tar:
    members = tar.getmembers()           # Every entry recorded in the archive
    total = len(members)

    # int() truncates toward zero, so an archive with fewer than 5 members
    # would yield limit == 0 and nothing would be extracted. Guarantee at
    # least one member for a non-empty archive; an empty archive stays at 0.
    limit = max(1, int(total * 0.2)) if total else 0

    # Members are taken in archive order, not sampled at random.
    selected_members = members[:limit]
    tar.extractall(path='extracted_batch_instance', members=selected_members)


In [None]:
import pandas as pd

# Schema for the CSV: these names are applied to the columns positionally.
columns = [
    "instance_name",
    "task_name",
    "job_name",
    "task_type",
    "status",
    "start_time",
    "end_time",
    "machine_id",
    "seq_no",
    "total_seq_no",
    "cpu_avg",
    "cpu_max",
    "mem_avg",
    "mem_max",
]

# header=None spells out what passing `names` alone already implies: the first
# line of the file is treated as data, not as a header row.
df = pd.read_csv(
    "extracted_batch_instance/batch_instance.csv",
    header=None,
    names=columns,
)
    

In [147]:
import pandas as pd

# File path
csv_path = "extracted_batch_instance/batch_instance.csv"

# Maximum number of leading rows to load. This is a fixed cap, not a fraction
# of the file: the file's total row count is never computed here.
N_ROWS = 1_000_000

# Column names for the CSV (since no header)
columns = [
    "instance_name", "task_name", "job_name", "task_type", "status",
    "start_time", "end_time", "machine_id", "seq_no", "total_seq_no",
    "cpu_avg", "cpu_max", "mem_avg", "mem_max"
]

# Read only the first N_ROWS rows of the file
df = pd.read_csv(csv_path, names=columns, nrows=N_ROWS)


In [127]:
# Preview the first 1000 rows (the notebook display may truncate what is shown).
df.head(n=1000)

Unnamed: 0,instance_name,task_name,job_name,task_type,status,start_time,end_time,machine_id,seq_no,total_seq_no,cpu_avg,cpu_max,mem_avg,mem_max
0,ins_74901673,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,j_217,10,Terminated,673795,673797,m_2637,1,1,13.0,16.0,0.02,0.02
1,ins_815802872,M1,j_1527,1,Terminated,158478,158520,m_3430,1,1,3.0,19.0,0.13,0.18
2,ins_564677701,M1,j_2014,1,Terminated,372602,372616,m_1910,1,1,87.0,116.0,0.04,0.05
3,ins_257566161,M1,j_2014,1,Terminated,372602,372615,m_2485,1,1,91.0,123.0,0.05,0.05
4,ins_688679908,M1,j_2014,1,Terminated,372602,372615,m_993,1,1,93.0,141.0,0.05,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ins_207872151,M7_6,j_2745,1,Running,99680,100280,m_2892,1,1,,,,
996,ins_1181722370,M7_6,j_2745,1,Running,99680,100280,m_2937,1,1,,,,
997,ins_1255440598,M7_6,j_2745,1,Running,99680,100280,m_713,1,1,,,,
998,ins_902221200,M7_6,j_2745,1,Running,99680,100280,m_1303,1,1,,,,


In [148]:
# Distinct values found in the status column (an array of values, despite the
# variable name; it is not a count).
unique_count = df.loc[:, "status"].unique()
print(unique_count)


['Terminated' 'Running' 'Failed' 'Interrupted']


In [149]:
import pandas as pd

# Cast both timestamp columns to integers, one column at a time.
for col in ("start_time", "end_time"):
    df[col] = df[col].astype(int)




In [150]:
# Number of rows per status value.
df["status"].value_counts()

status
Terminated     991565
Running          7467
Failed            902
Interrupted        66
Name: count, dtype: int64

In [151]:
# Elapsed time per row: end timestamp minus start timestamp.
df["duration"] = df["end_time"] - df["start_time"]

# Keep only rows whose end is not earlier than their start.
df = df.loc[df["duration"] >= 0]

In [152]:
# Number of rows per status value after the duration filter.
df["status"].value_counts()


status
Terminated    991565
Running         7225
Failed           876
Name: count, dtype: int64

In [153]:
# Draw a seeded random sample of up to 600 rows from every status group.
#
# The groups are iterated explicitly instead of going through groupby.apply:
# each sub-frame produced by iteration still carries the 'status' column, and
# this avoids the deprecation warning recent pandas versions emit when the
# applied callable operates on the grouping column.
# Each group is sampled with the same seed (42), as before, and the pieces are
# concatenated in group-key order with their original index labels.
# min() caps the request at the group size, so a status with fewer than 600
# rows contributes all of its rows instead of raising ValueError.
parts = [
    group.sample(n=min(len(group), 600), random_state=42)
    for _, group in df.groupby('status')
]
# pd.concat rejects an empty list; fall back to an empty frame with df's columns.
sample_df = pd.concat(parts) if parts else df.iloc[0:0]






In [154]:
# Number of sampled rows per status value.
sample_df["status"].value_counts()


status
Failed        600
Running       600
Terminated    600
Name: count, dtype: int64

In [155]:
# Total number of rows in the sample.
sample_df.shape[0]

1800

In [156]:
# Preview the first five sampled rows.
sample_df.head(n=5)


Unnamed: 0,instance_name,task_name,job_name,task_type,status,start_time,end_time,machine_id,seq_no,total_seq_no,cpu_avg,cpu_max,mem_avg,mem_max,duration
159147,ins_671387916,M3_2,j_231121,1,Failed,338001,338006,m_1718,1,1,,,,,5
159440,ins_1297654664,M3_2,j_231121,1,Failed,338000,338009,m_1464,1,1,,,,,9
486559,ins_1015276430,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,j_723207,10,Failed,213126,213127,m_1665,1,1,,,,,1
159477,ins_191198371,M3_2,j_231121,1,Failed,338000,338006,m_3846,1,1,,,,,6
158995,ins_63632486,M3_2,j_231121,1,Failed,338001,338008,m_1250,1,1,,,,,7


In [157]:
import plotly.express as px

# Columns to expose in the hover tooltip; every entry is explicitly enabled.
hover_columns = dict.fromkeys(
    [
        "instance_name",
        "task_name",
        "job_name",
        "status",
        "machine_id",
        "mem_avg",
        "cpu_avg",
        "duration",
    ],
    True,
)

# One bar per row of the sample, coloured by status.
fig = px.bar(
    sample_df,
    x="instance_name",
    y="duration",
    color="status",
    title="Instance Duration by Status",
    hover_data=hover_columns,
    category_orders=dict(status=["Terminated", "Running", "Failed", "Interrupted"]),
)

# Layout tuning: axis titles, bar ordering, figure size, and pan-by-drag.
fig.update_layout(
    xaxis_title="Instance Name",
    yaxis_title="Duration (s)",
    xaxis=dict(categoryorder="total descending"),
    height=600,
    margin={"l": 40, "r": 40, "t": 60, "b": 100},
    xaxis_tickangle=45,
    showlegend=True,
    dragmode="pan",  # dragging pans the view instead of zooming
    legend_title_text="Status",
)

# Range slider under the x-axis so the bars can be scrolled through.
fig.update_xaxes(
    rangeslider_visible=True,
    tickmode="auto",
    tickfont={"size": 10},
)

fig.show()
