In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:




# Load the event log from disk. When the CSV is missing, build a reproducible
# synthetic log instead so the remaining cells can still be executed.
print("Loading event log data...")
try:
    event_log = pd.read_csv('proceset.csv')
    print(f"Successfully loaded {len(event_log)} events.")
    print("Sample data:")
    print(event_log.head())
except FileNotFoundError:
    print("Error: proceset.csv not found. Using sample data for demonstration.")

    # Fixed seed so the synthetic log is identical on every run.
    np.random.seed(42)
    case_ids = [f"CASE_{i}" for i in range(1, 101)]
    activities = ["Register", "Review", "Validate", "Approve", "Reject", "Notify"]

    data = []
    for case_id in case_ids:
        # 3 to 7 events per case (the upper bound of randint is exclusive).
        num_activities = np.random.randint(3, 8)
        # Each case starts on a random day within 120 days of 2025-01-01.
        start_time = pd.Timestamp('2025-01-01') + pd.Timedelta(days=np.random.randint(0, 120))

        for _ in range(num_activities):
            activity = np.random.choice(activities)
            # Each event happens 0 to 4 days after the previous one.
            timestamp = start_time + pd.Timedelta(days=np.random.randint(0, 5))
            resource = f"RESOURCE_{np.random.randint(1, 6)}"
            start_time = timestamp

            # Column names mirror the ones the rest of the notebook reads
            # ('CaseID', 'Activity', 'Timestamp'); the feature-extraction cell
            # looks for a lower-case 'resource' column, so that name is kept.
            data.append({
                'CaseID': case_id,
                'Activity': activity,
                'Timestamp': timestamp,
                'resource': resource
            })

    event_log = pd.DataFrame(data)
    print("Created sample event log data:")
    print(event_log.head())

# Whichever branch ran, make sure 'Timestamp' holds real datetimes so that
# durations can be computed by subtraction later on.
if not pd.api.types.is_datetime64_any_dtype(event_log['Timestamp']):
    event_log['Timestamp'] = pd.to_datetime(event_log['Timestamp'])


Loading event log data...
Successfully loaded 75 events.
Sample data:
   CaseID                       Activity          InitialStatus  \
0       1       Access to the DAP portal                  Start   
1       1      Selection of the position               Accessed   
2       1  Start preparing the documents      Position Selected   
3       1             Submit Application     Documents Prepared   
4       1     Receiving the applications  Application Submitted   

             FinalStatus                       ProcessFlow       Timestamp  
0               Accessed                  Start of process  3/30/2025 8:00  
1      Position Selected      Applicant selects a position  3/30/2025 8:05  
2     Documents Prepared       Documents preparation phase  3/30/2025 8:30  
3  Application Submitted         Submission of application  3/30/2025 9:00  
4   Application Received  Institution receives application  3/30/2025 9:30  


In [None]:

# Build one row of summary features per case from the event log.
print("\nExtracting case-level features...")

# All time-based features are expressed in days.
SECONDS_PER_DAY = 60 * 60 * 24

case_features = []

for case_id, case_df in event_log.groupby('CaseID'):
    # A stable sort keeps events that share a timestamp in their logged order;
    # the default quicksort gives no such guarantee, which could change the
    # transition count below from run to run.
    case_df = case_df.sort_values('Timestamp', kind='stable')

    # Case duration: span between the first and last event.
    start_time = case_df['Timestamp'].min()
    end_time = case_df['Timestamp'].max()
    duration = (end_time - start_time).total_seconds() / SECONDS_PER_DAY

    num_events = len(case_df)
    num_unique_activities = case_df['Activity'].nunique()

    # The resource column is optional; report 0 when the log does not have it.
    num_resources = case_df['resource'].nunique() if 'resource' in case_df.columns else 0

    timestamps = case_df['Timestamp'].tolist()
    activities = case_df['Activity'].tolist()

    # Mean gap between consecutive events (0 for single-event cases).
    if num_events > 1:
        time_diffs = [
            (later - earlier).total_seconds() / SECONDS_PER_DAY
            for earlier, later in zip(timestamps, timestamps[1:])
        ]
        avg_time_between = sum(time_diffs) / len(time_diffs)
    else:
        avg_time_between = 0

    # Flag cases containing an activity named exactly "Reject".
    has_reject = 1 if "Reject" in case_df['Activity'].values else 0

    # Number of consecutive event pairs whose activity differs.
    activity_transitions = sum(
        1 for previous, current in zip(activities, activities[1:]) if current != previous
    )

    case_features.append({
        'CaseID': case_id,
        'duration': duration,
        'num_events': num_events,
        'num_unique_activities': num_unique_activities,
        'num_resources': num_resources,
        'avg_time_between': avg_time_between,
        'has_reject': has_reject,
        'activity_transitions': activity_transitions
    })

case_features_df = pd.DataFrame(case_features)
print("Created case features:")
print(case_features_df.head())
print(f"Total cases: {len(case_features_df)}")


Extracting case-level features...
Created case features:
   CaseID  duration  num_events  num_unique_activities  num_resources  \
0       1  3.333333          12                     12              0   
1       2  0.107639           7                      7              0   
2       3  0.111111           8                      8              0   
3       4  3.291667          12                     12              0   
4       5  1.125000          11                     10              0   

   avg_time_between  has_reject  activity_transitions  
0          0.303030           0                    11  
1          0.017940           0                     6  
2          0.015873           0                     7  
3          0.299242           0                    11  
4          0.112500           0                    10  
Total cases: 8


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt


# Select the numeric inputs for clustering. 'CaseID' is an identifier, not a
# measurement, and 'cluster' is only present when this cell is re-run after a
# previous fit; neither may influence the distances, so both are dropped.
features = (
    case_features_df
    .drop(columns=['CaseID', 'cluster'], errors='ignore')
    .select_dtypes(include=[np.number])
)

# Standardise so that no feature dominates the Euclidean distance by scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# K-Means cannot use more clusters than there are samples.
max_k = min(10, X_scaled.shape[0])
k_range = range(1, max_k + 1)

# Inertia (within-cluster sum of squares) for every candidate k.
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Save the elbow curve to disk rather than displaying it inline.
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.savefig('elbow_method.png')
plt.close()

if len(inertia) > 2:
    inertia_diff = np.diff(inertia)
    inertia_diff2 = np.diff(inertia_diff)
    # The elbow is the sharpest bend of the curve, i.e. the LARGEST second
    # difference. Entry j of inertia_diff2 is centred on k = j + 2 because
    # inertia[0] belongs to k = 1. (Taking the smallest absolute value instead
    # would select the flattest stretch and over-estimate k.)
    optimal_k = int(np.argmax(inertia_diff2)) + 2
else:
    # Too few candidate values of k to measure curvature.
    optimal_k = 1

print(f"Estimated optimal number of clusters (k): {optimal_k}")

print(f"Applying K-Means with k={optimal_k}...")
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = final_kmeans.fit_predict(X_scaled)

# Attach the label of each case to the feature table.
case_features_df['cluster'] = clusters


Estimated optimal number of clusters (k): 6
Applying K-Means with k=6...


In [None]:

# Elbow search and final K-Means fit on X_scaled, which must already have been
# produced by an earlier cell (it is not defined here).

# K-Means cannot use more clusters than there are samples.
max_k = min(10, X_scaled.shape[0])
k_range = range(1, max_k + 1)

# Inertia (within-cluster sum of squares) for every candidate k.
inertia = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Save the elbow curve to disk rather than displaying it inline.
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.savefig('elbow_method.png')
plt.close()

if len(inertia) > 2:
    inertia_diff = np.diff(inertia)
    inertia_diff2 = np.diff(inertia_diff)
    # The elbow is where the curve bends most sharply, i.e. where the second
    # difference is largest; entry j is centred on k = j + 2. Minimising the
    # absolute second difference would pick the flattest part of the curve
    # and therefore too many clusters.
    optimal_k = int(np.argmax(inertia_diff2)) + 2
else:
    # Too few candidate values of k to measure curvature.
    optimal_k = 1

print(f"Estimated optimal number of clusters (k): {optimal_k}")

print(f"Applying K-Means with k={optimal_k}...")
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = final_kmeans.fit_predict(X_scaled)

# Attach the label of each case to the feature table.
case_features_df['cluster'] = clusters


Estimated optimal number of clusters (k): 6
Applying K-Means with k=6...


In [None]:

# Columns summarised per cluster; later cells reuse this list via `features`.
features = [
    'duration',
    'num_events',
    'num_unique_activities',
    'avg_time_between',
]

# Per-cluster mean of each selected feature, indexed by cluster label.
grouped_by_cluster = case_features_df.groupby('cluster')
cluster_stats = grouped_by_cluster[features].mean()

print("\nCluster centers (mean values):", cluster_stats, sep="\n")



Cluster centers (mean values):
         duration  num_events  num_unique_activities  avg_time_between
cluster                                                               
0        0.109375         7.5                    7.5          0.016906
1        3.312500        12.0                   12.0          0.301136
2        0.006944         3.0                    3.0          0.003472
3        1.125000        11.0                   10.0          0.112500
4        2.166667        10.0                   10.0          0.240741
5        3.125000        12.0                   12.0          0.284091


In [None]:

# Summarise the clusters numerically, then save three diagnostic figures.
# Relies on `features` (list of column names) and `optimal_k` from earlier cells.
print("\nAnalyzing cluster characteristics...")

# Mean of every summary feature per cluster.
cluster_stats = case_features_df.groupby('cluster')[features].mean()
print("\nCluster centers (mean values):")
print(cluster_stats)

# Cluster sizes, ordered by cluster label.
cluster_counts = case_features_df['cluster'].value_counts().sort_index()
print("\nNumber of cases in each cluster:")
print(cluster_counts)

feature1 = 'duration'  # x-axis
feature2 = 'num_events'  # y-axis

# --- 2-D scatter plot -------------------------------------------------------
# Every figure is created with an explicit handle and closed through it, so no
# empty figure is left open to be rendered as "<Figure ... with 0 Axes>".
fig_scatter, ax_scatter = plt.subplots(figsize=(10, 8))
scatter = ax_scatter.scatter(
    case_features_df[feature1],
    case_features_df[feature2],
    c=case_features_df['cluster'],
    cmap='viridis',
    alpha=0.7,
    s=100
)
fig_scatter.colorbar(scatter, ax=ax_scatter, label='Cluster')
ax_scatter.set_xlabel(feature1)
ax_scatter.set_ylabel(feature2)
ax_scatter.set_title(f'K-Means Clustering of Process Cases (k={optimal_k})')
ax_scatter.grid(True, alpha=0.3)
fig_scatter.savefig('cluster_visualization_2d.png')
plt.close(fig_scatter)

# --- Pair plot --------------------------------------------------------------
vis_features = ['duration', 'num_events', 'num_unique_activities', 'avg_time_between']
vis_df = case_features_df[vis_features + ['cluster']].copy()

# Treat the label as categorical so each cluster gets a discrete colour.
vis_df['cluster'] = vis_df['cluster'].astype('category')

# sns.pairplot builds its own figure, so none is created beforehand; the grid
# object is used to title, save and close exactly that figure.
pair_plot = sns.pairplot(vis_df, hue='cluster', palette='viridis', corner=True)
pair_plot.figure.suptitle(f'Pairwise Relationships in Clusters (k={optimal_k})', y=1.02)
pair_plot.savefig('cluster_pairplot.png')
plt.close(pair_plot.figure)

# --- Heatmap of cluster means -----------------------------------------------
fig_heat, ax_heat = plt.subplots(figsize=(12, 8))
sns.heatmap(cluster_stats, annot=True, cmap='YlGnBu', fmt='.2f', ax=ax_heat)
ax_heat.set_title(f'Cluster Centers Heatmap (k={optimal_k})')
fig_heat.savefig('cluster_centers_heatmap.png')
plt.close(fig_heat)



Analyzing cluster characteristics...

Cluster centers (mean values):
         duration  num_events  num_unique_activities  avg_time_between
cluster                                                               
0        0.109375         7.5                    7.5          0.016906
1        3.312500        12.0                   12.0          0.301136
2        0.006944         3.0                    3.0          0.003472
3        1.125000        11.0                   10.0          0.112500
4        2.166667        10.0                   10.0          0.240741
5        3.125000        12.0                   12.0          0.284091

Number of cases in each cluster:
cluster
0    2
1    2
2    1
3    1
4    1
5    1
Name: count, dtype: int64


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

In [None]:

# Print a per-cluster report, then export the labelled data to CSV.
# Relies on `features` (list of column names) defined in an earlier cell.
print("\nDetailed analysis of each cluster:")

# Iterate over the labels actually present in the table rather than assuming
# they are exactly 0..optimal_k-1, so the report always matches the data.
for cluster_id in sorted(case_features_df['cluster'].unique()):
    cluster_cases = case_features_df[case_features_df['cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} Analysis:")
    print(f"  Number of cases: {len(cluster_cases)}")
    print(f"  Percentage of total: {len(cluster_cases) / len(case_features_df) * 100:.2f}%")

    for feature in features:
        mean_val = cluster_cases[feature].mean()
        # The sample standard deviation is undefined for a single case
        # (pandas returns NaN); say so explicitly instead of printing "nan".
        if len(cluster_cases) > 1:
            std_text = f"{cluster_cases[feature].std():.2f}"
        else:
            std_text = "n/a"
        print(f"  {feature}: mean={mean_val:.2f}, std={std_text}")

    # Show at most five case identifiers as examples.
    sample_cases = cluster_cases['CaseID'].values[:5]
    print(f"  Sample cases: {', '.join(map(str, sample_cases))}")

print("\nAdding cluster information to the original event log...")

# Look-up table from case identifier to its cluster label.
case_cluster_map = dict(zip(case_features_df['CaseID'], case_features_df['cluster']))

# Every event inherits the cluster of the case it belongs to.
event_log['cluster'] = event_log['CaseID'].map(case_cluster_map)

event_log.to_csv('event_log_with_clusters.csv', index=False)
print("Enhanced event log saved as 'event_log_with_clusters.csv'")

case_features_df.to_csv('case_features_with_clusters.csv', index=False)
print("Case features with clusters saved as 'case_features_with_clusters.csv'")

print("\nClustering analysis complete!")


Detailed analysis of each cluster:

Cluster 0 Analysis:
  Number of cases: 2
  Percentage of total: 25.00%
  duration: mean=0.11, std=0.00
  num_events: mean=7.50, std=0.71
  num_unique_activities: mean=7.50, std=0.71
  avg_time_between: mean=0.02, std=0.00
  Sample cases: 2, 3

Cluster 1 Analysis:
  Number of cases: 2
  Percentage of total: 25.00%
  duration: mean=3.31, std=0.03
  num_events: mean=12.00, std=0.00
  num_unique_activities: mean=12.00, std=0.00
  avg_time_between: mean=0.30, std=0.00
  Sample cases: 1, 4

Cluster 2 Analysis:
  Number of cases: 1
  Percentage of total: 12.50%
  duration: mean=0.01, std=nan
  num_events: mean=3.00, std=nan
  num_unique_activities: mean=3.00, std=nan
  avg_time_between: mean=0.00, std=nan
  Sample cases: 8

Cluster 3 Analysis:
  Number of cases: 1
  Percentage of total: 12.50%
  duration: mean=1.12, std=nan
  num_events: mean=11.00, std=nan
  num_unique_activities: mean=10.00, std=nan
  avg_time_between: mean=0.11, std=nan
  Sample cases: 