# Mid-Project Presented By Amany Arafa
# Database used is ***50k Bug Dataset***

### Step 1: Import necessary library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(rc={'figure.figsize': [9, 9]}, font_scale=1.2)

### Step 2: Load *50k Bug Dataset* 

In [2]:
df = pd.read_csv('data/bug_dataset_50k.csv')
df

Unnamed: 0,bug_id,title,description,error_code,bug_category,bug_domain,tech_stack,severity,environment,developer_role,root_cause,suggested_fix,explanation,created_at
0,BUG_000001,API Bug detected in system,This issue relates to a api bug occurring in t...,403.0,API Bug,Mobile,Spring Boot,High,Development,Full-Stack Developer,Misconfiguration or logic issue related to api...,Review and fix the api bug according to best p...,This bug requires a full-stack developer due t...,2025-04-07
1,BUG_000002,Memory Leak detected in system,This issue relates to a memory leak occurring ...,500.0,Memory Leak,Data,GCP,Medium,Production,Backend Developer,Misconfiguration or logic issue related to mem...,Review and fix the memory leak according to be...,This bug requires a backend developer due to i...,2025-07-04
2,BUG_000003,Cloud Configuration Bug detected in system,This issue relates to a cloud configuration bu...,404.0,Cloud Configuration Bug,DevOps,Django,Medium,Production,Mobile Developer,Misconfiguration or logic issue related to clo...,Review and fix the cloud configuration bug acc...,This bug requires a mobile developer due to it...,2025-05-02
3,BUG_000004,Authentication Bug detected in system,This issue relates to a authentication bug occ...,404.0,Authentication Bug,Backend Systems,Flask,Medium,Production,Mobile Developer,Misconfiguration or logic issue related to aut...,Review and fix the authentication bug accordin...,This bug requires a mobile developer due to it...,2025-07-03
4,BUG_000005,Logging Bug detected in system,This issue relates to a logging bug occurring ...,503.0,Logging Bug,Mobile,Flask,High,Staging,Backend Developer,Misconfiguration or logic issue related to log...,Review and fix the logging bug according to be...,This bug requires a backend developer due to i...,2025-09-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,BUG_049996,CI/CD Bug detected in system,This issue relates to a ci/cd bug occurring in...,500.0,CI/CD Bug,DevOps,Vue,Critical,Staging,Security Engineer,Misconfiguration or logic issue related to ci/...,Review and fix the ci/cd bug according to best...,This bug requires a security engineer due to i...,2025-11-17
49996,BUG_049997,Memory Leak detected in system,This issue relates to a memory leak occurring ...,500.0,Memory Leak,Web Development,MySQL,High,Production,Frontend Developer,Misconfiguration or logic issue related to mem...,Review and fix the memory leak according to be...,This bug requires a frontend developer due to ...,2025-11-18
49997,BUG_049998,Database Bug detected in system,This issue relates to a database bug occurring...,403.0,Database Bug,Data,AWS,Low,Production,Data Engineer,Misconfiguration or logic issue related to dat...,Review and fix the database bug according to b...,This bug requires a data engineer due to its n...,2025-09-26
49998,BUG_049999,Authentication Bug detected in system,This issue relates to a authentication bug occ...,404.0,Authentication Bug,Cloud,AWS,High,Development,Security Engineer,Misconfiguration or logic issue related to aut...,Review and fix the authentication bug accordin...,This bug requires a security engineer due to i...,2025-05-03


### Step 3: Distinguish the features

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bug_id          50000 non-null  object 
 1   title           50000 non-null  object 
 2   description     50000 non-null  object 
 3   error_code      43812 non-null  float64
 4   bug_category    50000 non-null  object 
 5   bug_domain      50000 non-null  object 
 6   tech_stack      50000 non-null  object 
 7   severity        50000 non-null  object 
 8   environment     50000 non-null  object 
 9   developer_role  50000 non-null  object 
 10  root_cause      50000 non-null  object 
 11  suggested_fix   50000 non-null  object 
 12  explanation     50000 non-null  object 
 13  created_at      50000 non-null  object 
dtypes: float64(1), object(13)
memory usage: 5.3+ MB


### Step 4: Data Cleaning

#### Fix error_code

In [4]:
for error_code in df['error_code'].unique():
    print(error_code)

403.0
500.0
404.0
503.0
502.0
401.0
nan
400.0


### bug_id
### bug_id is a unique identifier and excluded from analytical features.
### Convert it only to text

In [None]:
df["bug_id"] = df["bug_id"].astype("string")
for bug_id in df['bug_id'].unique():
    print(bug_id)

BUG_000001
BUG_000002
BUG_000003
BUG_000004
BUG_000005
BUG_000006
BUG_000007
BUG_000008
BUG_000009
BUG_000010
BUG_000011
BUG_000012
BUG_000013
BUG_000014
BUG_000015
BUG_000016
BUG_000017
BUG_000018
BUG_000019
BUG_000020
BUG_000021
BUG_000022
BUG_000023
BUG_000024
BUG_000025
BUG_000026
BUG_000027
BUG_000028
BUG_000029
BUG_000030
BUG_000031
BUG_000032
BUG_000033
BUG_000034
BUG_000035
BUG_000036
BUG_000037
BUG_000038
BUG_000039
BUG_000040
BUG_000041
BUG_000042
BUG_000043
BUG_000044
BUG_000045
BUG_000046
BUG_000047
BUG_000048
BUG_000049
BUG_000050
BUG_000051
BUG_000052
BUG_000053
BUG_000054
BUG_000055
BUG_000056
BUG_000057
BUG_000058
BUG_000059
BUG_000060
BUG_000061
BUG_000062
BUG_000063
BUG_000064
BUG_000065
BUG_000066
BUG_000067
BUG_000068
BUG_000069
BUG_000070
BUG_000071
BUG_000072
BUG_000073
BUG_000074
BUG_000075
BUG_000076
BUG_000077
BUG_000078
BUG_000079
BUG_000080
BUG_000081
BUG_000082
BUG_000083
BUG_000084
BUG_000085
BUG_000086
BUG_000087
BUG_000088
BUG_000089
BUG_000090
BUG_000091