# Mid-Project Presented By Amany Arafa
# Dataset used: ***50k Bug Dataset***

### Step 1: Import the necessary libraries

In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

# Notebook-wide seaborn theme: default figure size 9x9 and fonts scaled by 1.2.
# set_theme() is the documented entry point; sns.set() is only a legacy alias
# for it that the seaborn docs say may be removed in the future.
sns.set_theme(rc={'figure.figsize': [9, 9]}, font_scale=1.2)

### Step 2: Load the *50k Bug Dataset*

In [44]:
# Read the raw bug dataset from its CSV file; the path is relative to the
# directory the notebook is run from.
DATA_PATH = 'data/bug_dataset_50k.csv'
df = pd.read_csv(DATA_PATH)

# Bare last expression: renders the loaded frame as this cell's output.
df

Unnamed: 0,bug_id,title,description,error_code,bug_category,bug_domain,tech_stack,severity,environment,developer_role,root_cause,suggested_fix,explanation,created_at
0,BUG_000001,API Bug detected in system,This issue relates to a api bug occurring in t...,403.0,API Bug,Mobile,Spring Boot,High,Development,Full-Stack Developer,Misconfiguration or logic issue related to api...,Review and fix the api bug according to best p...,This bug requires a full-stack developer due t...,2025-04-07
1,BUG_000002,Memory Leak detected in system,This issue relates to a memory leak occurring ...,500.0,Memory Leak,Data,GCP,Medium,Production,Backend Developer,Misconfiguration or logic issue related to mem...,Review and fix the memory leak according to be...,This bug requires a backend developer due to i...,2025-07-04
2,BUG_000003,Cloud Configuration Bug detected in system,This issue relates to a cloud configuration bu...,404.0,Cloud Configuration Bug,DevOps,Django,Medium,Production,Mobile Developer,Misconfiguration or logic issue related to clo...,Review and fix the cloud configuration bug acc...,This bug requires a mobile developer due to it...,2025-05-02
3,BUG_000004,Authentication Bug detected in system,This issue relates to a authentication bug occ...,404.0,Authentication Bug,Backend Systems,Flask,Medium,Production,Mobile Developer,Misconfiguration or logic issue related to aut...,Review and fix the authentication bug accordin...,This bug requires a mobile developer due to it...,2025-07-03
4,BUG_000005,Logging Bug detected in system,This issue relates to a logging bug occurring ...,503.0,Logging Bug,Mobile,Flask,High,Staging,Backend Developer,Misconfiguration or logic issue related to log...,Review and fix the logging bug according to be...,This bug requires a backend developer due to i...,2025-09-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,BUG_049996,CI/CD Bug detected in system,This issue relates to a ci/cd bug occurring in...,500.0,CI/CD Bug,DevOps,Vue,Critical,Staging,Security Engineer,Misconfiguration or logic issue related to ci/...,Review and fix the ci/cd bug according to best...,This bug requires a security engineer due to i...,2025-11-17
49996,BUG_049997,Memory Leak detected in system,This issue relates to a memory leak occurring ...,500.0,Memory Leak,Web Development,MySQL,High,Production,Frontend Developer,Misconfiguration or logic issue related to mem...,Review and fix the memory leak according to be...,This bug requires a frontend developer due to ...,2025-11-18
49997,BUG_049998,Database Bug detected in system,This issue relates to a database bug occurring...,403.0,Database Bug,Data,AWS,Low,Production,Data Engineer,Misconfiguration or logic issue related to dat...,Review and fix the database bug according to b...,This bug requires a data engineer due to its n...,2025-09-26
49998,BUG_049999,Authentication Bug detected in system,This issue relates to a authentication bug occ...,404.0,Authentication Bug,Cloud,AWS,High,Development,Security Engineer,Misconfiguration or logic issue related to aut...,Review and fix the authentication bug accordin...,This bug requires a security engineer due to i...,2025-05-03


### Step 3: Distinguish the features

In [45]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bug_id          50000 non-null  object 
 1   title           50000 non-null  object 
 2   description     50000 non-null  object 
 3   error_code      43812 non-null  float64
 4   bug_category    50000 non-null  object 
 5   bug_domain      50000 non-null  object 
 6   tech_stack      50000 non-null  object 
 7   severity        50000 non-null  object 
 8   environment     50000 non-null  object 
 9   developer_role  50000 non-null  object 
 10  root_cause      50000 non-null  object 
 11  suggested_fix   50000 non-null  object 
 12  explanation     50000 non-null  object 
 13  created_at      50000 non-null  object 
dtypes: float64(1), object(13)
memory usage: 5.3+ MB


### Step 4: Data Cleaning

#### Fix error_code

In [46]:
# List every distinct error code, one per line (a missing code prints as nan).
distinct_codes = df['error_code'].unique()
for code in distinct_codes:
    print(code)

# Number of rows without an error code; shown as the cell's output value.
df["error_code"].isna().sum()


403.0
500.0
404.0
503.0
502.0
401.0
nan
400.0


np.int64(6188)

In [47]:
# Move error_code from float64 to pandas' nullable integer dtype, so the codes
# are whole numbers while rows without a code stay missing.
nullable_codes = df["error_code"].astype("Int64")
df["error_code"] = nullable_codes

# Re-inspect the schema to confirm the new dtype.
df.info()

# Missing-value count after the cast; shown as the cell's output value.
df["error_code"].isna().sum()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   bug_id          50000 non-null  object
 1   title           50000 non-null  object
 2   description     50000 non-null  object
 3   error_code      43812 non-null  Int64 
 4   bug_category    50000 non-null  object
 5   bug_domain      50000 non-null  object
 6   tech_stack      50000 non-null  object
 7   severity        50000 non-null  object
 8   environment     50000 non-null  object
 9   developer_role  50000 non-null  object
 10  root_cause      50000 non-null  object
 11  suggested_fix   50000 non-null  object
 12  explanation     50000 non-null  object
 13  created_at      50000 non-null  object
dtypes: Int64(1), object(13)
memory usage: 5.4+ MB


np.int64(6188)

### bug_id and created_at

- `bug_id` is a unique identifier and is excluded from the analytical features; it is only converted to text.
- `description` is converted to text as well.
- `created_at` is converted to datetime, and the table is sorted on `created_at`.

In [48]:
# Identifier and free-text columns: store them with pandas' dedicated string dtype.
df['bug_id'] = df['bug_id'].astype('string')
df['description'] = df['description'].astype('string')

# Parse the creation date so it can be sorted and summarised as a real datetime.
df['created_at'] = pd.to_datetime(df['created_at'])

# Put the table in chronological order, as this section's text states.
# kind='stable' keeps rows that share a date in their existing relative order
# (so re-running the cell is a no-op), and ignore_index=True rebuilds a clean
# 0..n-1 index for the reordered rows.
df = df.sort_values('created_at', kind='stable', ignore_index=True)

# Confirm the new dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   bug_id          50000 non-null  string        
 1   title           50000 non-null  object        
 2   description     50000 non-null  string        
 3   error_code      43812 non-null  Int64         
 4   bug_category    50000 non-null  object        
 5   bug_domain      50000 non-null  object        
 6   tech_stack      50000 non-null  object        
 7   severity        50000 non-null  object        
 8   environment     50000 non-null  object        
 9   developer_role  50000 non-null  object        
 10  root_cause      50000 non-null  object        
 11  suggested_fix   50000 non-null  object        
 12  explanation     50000 non-null  object        
 13  created_at      50000 non-null  datetime64[ns]
dtypes: Int64(1), datetime64[ns](1), object(10), string(2)


### Severity Cleaning

In [49]:
# Show the raw severity labels, one per line, before any normalisation.
raw_levels = df['severity'].unique()
for level in raw_levels:
    print(level)

High
Medium
Critical
Low


In [50]:
# Normalise the severity labels: cast to the string dtype, trim surrounding
# whitespace and lower-case, so spelling variants collapse to one label.
severity_text = df["severity"].astype("string")
df["severity"] = severity_text.str.strip().str.lower()

# Print the distinct labels that remain after normalisation.
for level in df['severity'].unique():
    print(level)

# Re-inspect the schema; severity should now be reported with the string dtype.
df.info()

high
medium
critical
low
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   bug_id          50000 non-null  string        
 1   title           50000 non-null  object        
 2   description     50000 non-null  string        
 3   error_code      43812 non-null  Int64         
 4   bug_category    50000 non-null  object        
 5   bug_domain      50000 non-null  object        
 6   tech_stack      50000 non-null  object        
 7   severity        50000 non-null  string        
 8   environment     50000 non-null  object        
 9   developer_role  50000 non-null  object        
 10  root_cause      50000 non-null  object        
 11  suggested_fix   50000 non-null  object        
 12  explanation     50000 non-null  object        
 13  created_at      50000 non-null  datetime64[ns]
dtypes: Int64(1), datetime64[ns](1

In [51]:
# Canonical severity vocabulary: each known synonym collapses onto one of the
# four levels low / medium / high / critical.
severity_map = {
    "low": "low",
    "minor": "low",
    "medium": "medium",
    "moderate": "medium",
    "high": "high",
    "critical": "critical",
    "blocker": "critical"
}

severity_canonical = df["severity"].map(severity_map)

# Series.map() yields a missing value for any label that is not a key of the
# dict, which would silently discard that row's severity. Check for such labels
# *before* overwriting the column, so a failure leaves df untouched.
unmapped_labels = df.loc[
    severity_canonical.isna() & df["severity"].notna(), "severity"
].unique()
assert len(unmapped_labels) == 0, (
    f"Unmapped severity labels: {list(unmapped_labels)}"
)

df["severity"] = severity_canonical

# Row count per canonical severity level; shown as the cell's output.
df["severity"].value_counts()

severity
low         12628
high        12535
critical    12432
medium      12405
Name: count, dtype: int64

In [52]:
# Summary statistics for the non-object columns (error_code and created_at in
# the output below).
df.describe()

Unnamed: 0,error_code,created_at
count,43812.0,50000
mean,445.129394,2025-08-03 15:09:07.776000
min,400.0,2025-02-02 00:00:00
25%,401.0,2025-05-04 00:00:00
50%,404.0,2025-08-04 00:00:00
75%,502.0,2025-11-03 00:00:00
max,503.0,2026-02-02 00:00:00
std,49.398821,


## Where do most bugs fall?

In [54]:
# Count the bugs in each severity level, then order the result by label
# (alphabetically) instead of by frequency.
counts_by_frequency = df["severity"].value_counts()
severity_counts = counts_by_frequency.sort_index()

# Bare last expression: renders the counts as the cell output.
severity_counts
# Streamlit rendering of the same counts, left disabled in the notebook:
# st.bar_chart(severity_counts)

severity
critical    12432
high        12535
low         12628
medium      12405
Name: count, dtype: int64