<a href="https://colab.research.google.com/github/ZulfiiaDitto/BalancingStrategiesForImageDataset/blob/main/data_breach_investigation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this notebook is to analyze data breaches reported in the last 24 months that are currently under investigation.

The data is publicly available at https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf.

The data was manually downloaded as a CSV file on 11/5/2024.


In [1]:
# Make the project folder the working directory so relative file paths resolve against it.
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted at
# /content/drive before this cell runs; confirm on a fresh runtime.
%cd "/content/drive/MyDrive/Colab Notebooks/data breach analysis"

/content/drive/MyDrive/Colab Notebooks/data breach analysis


In [2]:
import pandas as pd
import plotly.express as px

# Load the breach report CSV (path is relative to the current working directory).
breach = pd.read_csv("breach_report.csv")
# Preview the first rows to check the available columns.
breach.head()


Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description
0,"Potomac Medical Aesthetics, LLC",MD,Healthcare Provider,2876,11/01/2024,Unauthorized Access/Disclosure,Email,No,
1,Regence BlueCross BlueShield,OR,Health Plan,610,10/29/2024,Unauthorized Access/Disclosure,Paper/Films,Yes,
2,Family Medical Center,MD,Healthcare Provider,2100,10/29/2024,Hacking/IT Incident,Network Server,No,
3,BrightStar Care,IL,Healthcare Provider,1187,10/28/2024,Hacking/IT Incident,Email,No,
4,Mystic Valley Elder Services,MA,Healthcare Provider,85133,10/28/2024,Hacking/IT Incident,Network Server,No,


In [3]:
# (rows, columns) of the loaded report.
breach.shape
# we have 857 data breach incidents in the past 24 months

(857, 9)

In [4]:
# Sum of 'Individuals Affected' over all rows of the report.
breach['Individuals Affected'].sum()
# in total, 276,130,221 individuals were reported as affected
# (summed per report, so a person affected by several breaches is counted each time)

276130221

In [5]:
# Number of reports per breach type.
breach['Type of Breach'].value_counts()
# the most common type of breach is Hacking/IT Incident

Unnamed: 0_level_0,count
Type of Breach,Unnamed: 1_level_1
Hacking/IT Incident,745
Unauthorized Access/Disclosure,94
Theft,10
Loss,5
Improper Disposal,3


In [6]:
# Horizontal bar chart: number of reports for each breach type.
type_breach = (
    breach['Type of Breach']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)
type_breach.columns = ['Type of Breach', 'count']

fig = px.bar(
    type_breach,
    x='count',
    y='Type of Breach',
    title="Count of the different breach types",
)
# Centre the title, drop the y-axis title and fix the figure size.
fig.update_layout(title_x=0.5, yaxis_title=None, width=800, height=600)
# Outline every bar in black and print its count outside the bar.
fig.update_traces(
    marker=dict(line=dict(color='black', width=1)),
    text=type_breach['count'],
    textposition='outside',
)
fig.show()

In [7]:
# Which covered entity type appears most often among the reported breaches?
covered_entity_counts = (
    breach['Covered Entity Type']
    .value_counts()
    .reset_index()
)
# Give the two columns explicit, readable names.
covered_entity_counts.columns = ['Covered Entity Type', 'Count']

# Pie chart with one slice per covered entity type.
fig = px.pie(
    covered_entity_counts,
    values='Count',
    names='Covered Entity Type',
    title='Distribution of Covered Entity Types',
)

fig.update_layout(title_x=0.5)
# Thin black border around every slice.
fig.update_traces(marker=dict(line=dict(color='black', width=1)))
fig.show()
# the most common covered entity type is Healthcare Provider

In [8]:
# Let's put the number of affected individuals on a map of the states.

# Total individuals affected per state, largest first.
affected_by_state = (
    breach
    .groupby('State', as_index=False)['Individuals Affected']
    .sum()
    .sort_values(by='Individuals Affected', ascending=False)
)

# Choropleth limited to the USA, coloured by the per-state total.
fig = px.choropleth(
    affected_by_state,
    locations='State',
    locationmode='USA-states',
    scope='usa',
    color='Individuals Affected',
    color_continuous_scale='Viridis',
    hover_name='State',
    hover_data=['Individuals Affected'],
    title='Affected Individuals  by State',
)

fig.update_layout(title_x=0.5)
fig.show()
# we see that MN has been affected the most over the past 24 months

In [9]:
# Let's take a deeper dive into MN.

# Keep only the rows whose State is MN.
breach_mn = breach.loc[breach['State'] == 'MN']

# Individuals affected per covered entity in MN, largest first.
(
    breach_mn
    .groupby(['Name of Covered Entity'])['Individuals Affected']
    .sum()
    .sort_values(ascending=False)
)
# you can see that Change Healthcare, Inc. had one of the biggest data breaches of the past 24 months

Unnamed: 0_level_0,Individuals Affected
Name of Covered Entity,Unnamed: 1_level_1
"Change Healthcare, Inc.",100000000
"MNGI Digestive Health, PA",767670
Radius Global Solutions,632204
Consulting Radiologists LTD.,583824
"PDG, P.A. dba Park Dental",238667
Clay County Social Services,123807
CCM Health,84329
Fraser Child and Family Center,64131
"Dental Specialists of Minnesota, PLLC dba The Dental Specialists",38442
Children's Health Care,24183


In [10]:
# Let's look at the total number of affected individuals on a month-year scale.

# Parse the submission date with an explicit format: the CSV stores dates as
# MM/DD/YYYY (e.g. 11/01/2024), and spelling the format out avoids relying on
# format inference (month-first vs. day-first) and fails loudly on stray values.
breach['Breach Submission Date'] = pd.to_datetime(
    breach['Breach Submission Date'], format='%m/%d/%Y'
)
# 'YYYY-MM' string key used to group the reports by calendar month.
breach['year-month'] = breach['Breach Submission Date'].dt.strftime('%Y-%m')

In [11]:
# Total individuals affected in each submission month.
affected_ind = (
    breach
    .groupby('year-month', as_index=False)['Individuals Affected']
    .sum()
)

# Line chart of the monthly totals.
fig = px.line(
    affected_ind,
    x='year-month',
    y='Individuals Affected',
    title='Sum of Affected Individuals by month-year',
    labels={
        'year-month': 'Year-Month',
        'Individuals Affected': 'Total Individuals Affected',
    },
)
# Centre the title and fix the figure size.
fig.update_layout(title_x=0.5, width=800, height=600)
fig.show()

In [12]:
# Mean number of individuals affected per report, for each submission month.
affected_ind_average = (
    breach
    .groupby('year-month', as_index=False)['Individuals Affected']
    .mean()
)
affected_ind_average

Unnamed: 0,year-month,Individuals Affected
0,2022-03,34775.0
1,2022-05,1868831.0
2,2022-06,22011.0
3,2022-09,19020.0
4,2022-10,25536.0
5,2022-11,412828.5
6,2022-12,132140.8
7,2023-01,22069.38
8,2023-02,454263.3
9,2023-03,120231.4


In [13]:
# Line chart of the monthly mean number of individuals affected per report.
fig = px.line(
    affected_ind_average,
    x='year-month',
    y='Individuals Affected',
    title='Average of Affected Individuals by month-year',
    labels={
        'year-month': 'Year-Month',
        'Individuals Affected': 'Average Individuals Affected',
    },
)
# Centre the title and fix the figure size.
fig.update_layout(title_x=0.5, width=800, height=600)
fig.show()

In [14]:
# Overall mean of 'Individuals Affected' across all rows (all months pooled).
breach['Individuals Affected'].mean()

322205.62543757295

In [15]:
# Let's look at the number of incidents per year-month.

# Count rows per month with size() so that a report with a missing entity name is
# still counted (count() on 'Name of Covered Entity' skips rows where it is null).
incidents_count = (
    breach
    .groupby('year-month')
    .size()
    .reset_index(name='Incidents Count')
)

# Line chart of the monthly number of reported incidents.
fig = px.line(
    incidents_count,
    x='year-month',
    y='Incidents Count',
    title='Count of Data Breach Incidents per Year-Month',
    labels={'year-month': 'Year-Month', 'Incidents Count': 'Total Incidents Count'},
)
# Centre the title and fix the figure size.
fig.update_layout(title_x=0.5, width=800, height=600)
fig.show()

# we can see that there is definitely an increase in data breaches starting January 2023

In [20]:
# We already know the most common type of breach is Hacking/IT Incident,
# but let's see whether a pattern shows up over the past 24 months.

# One row per (year-month, breach type) pair; the number of reports is in 'count'.
incidents_count = (
    breach
    .groupby('year-month', as_index=False)['Type of Breach']
    .value_counts()
)

fig = px.line(
    incidents_count,
    x='year-month',
    y='count',
    color='Type of Breach',
    title='Timeline of Breach Types Over Time',
    # Label keys must match the column names exactly: the y column is 'count'
    # (lowercase), so a 'Count' key is ignored and the axis stays labelled 'count'.
    labels={'year-month': 'Year-Month', 'count': 'Number of Incidents'},
    line_shape='linear',
)

fig.update_layout(
    title_x=0.5,
    width=800,
    height=600,
    # Anchor the legend's top-left corner at the top-left of the plot area.
    legend=dict(
        x=0,
        y=1,
        traceorder='normal',
        orientation='v',
        xanchor='left',
        yanchor='top',
        bordercolor='black',  # Black outline for the legend
        borderwidth=1,
    ),
)
fig.show()
# as we can see, the increase in data breaches in Jan 2023 is the result
# of an increase in Hacking/IT incidents




In [17]:
# Let's look at the most common locations of breached information.

# Five most frequent values of 'Location of Breached Information'.
location = (
    breach['Location of Breached Information']
    .value_counts()
    .nlargest(5)
    .reset_index()
)
location.columns = ['Location of Breached Information', 'Count']

fig = px.bar(
    location,
    x='Count',
    y='Location of Breached Information',
    title="Five most common breach location for past 24 months",
)
# Centre the title, drop the y-axis title and fix the figure size.
fig.update_layout(title_x=0.5, yaxis_title=None, width=800, height=600)
# Outline every bar in black and print its count outside the bar.
fig.update_traces(
    marker=dict(line=dict(color='black', width=1)),
    text=location['Count'],
    textposition='outside',
)
fig.show()

In [26]:
# Monthly number of reports for a fixed set of breach locations.
# Only rows whose location is exactly one of these values are kept.
tracked_locations = ['Network Server', 'Email', 'Paper/Films', "Electronic Medical Record"]

# One row per (year-month, location) pair; the number of reports is in 'count'.
incidents_count = (
    breach
    .loc[breach['Location of Breached Information'].isin(tracked_locations)]
    .groupby('year-month', as_index=False)['Location of Breached Information']
    .value_counts()
)

fig = px.line(
    incidents_count,
    x='year-month',
    y='count',
    color='Location of Breached Information',
    title='Count of Breach Location Over Time',
    # Label keys must match the column names exactly: the y column is 'count'
    # (lowercase), and it holds a number of incidents, not a location.
    labels={'year-month': 'Year-Month', 'count': 'Number of Incidents'},
    line_shape='linear',
)

fig.update_layout(
    title_x=0.5,
    width=1000,
    height=600,
    # Place the legend to the right of the plot area (x > 1), aligned with its top.
    legend=dict(
        x=1.10,
        y=1,
        traceorder='normal',
        orientation='v',
        xanchor='left',
        yanchor='top',
        bordercolor='black',  # Black outline for the legend
        borderwidth=1,
    ),
)
fig.show()



