In [1]:
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from collections import Counter
from sklearn.cluster import KMeans
from gensim.utils import tokenize

In [2]:
df = pd.read_csv("~/Desktop/Making Algorithms/Assignment 1/washington_breaches.csv")

In [3]:
df

Unnamed: 0,Date Reported,Organization Name,Date of Breach,Number of Washingtonians Affected,Information Compromised
0,9/9/24,"Kemper Sports Management, LLC",4/1/24,1929,Name; Social Security Number; Driver's License...
1,8/13/24,Kootenai Health,2/22/24,94530,Name; Social Security Number; Driver's License...
2,8/5/24,QuoteWizard,5/26/24,717781,Name; Driver's License or Washington ID Card N...
3,8/3/24,Change Healthcare Inc.,2/17/24,501,Name; Social Security Number; Driver's License...
4,7/26/24,"HealthEquity, Inc.",3/9/24,90185,Name; Social Security Number; Full Date of Bir...
...,...,...,...,...,...
1357,9/18/15,"Molina Healthcare of Washington, Inc.",3/26/15,7767,Name; Medical Information; Other
1358,9/9/15,The Lifetime Healthcare Companies,12/23/13,Unknown,Unknown
1359,8/21/15,"Olympia Hotel Management, LLC",,30,Name; Financial & Banking Information
1360,8/18/15,Web.com Group,,1595,Name; Financial & Banking Information; Other


In [4]:
## How many times an organization has appeared on the data breach since it's inception 
### Seatttle HA= public housing for low-income, elderly, and disabled residents 

df['Organization Name'].value_counts()

Organization Name
Seattle Housing Authority                   3
Premera Blue Cross                          3
Rite Aid Corporation                        3
Maximus, Inc.                               3
Fred Hutchinson Cancer Center               3
                                           ..
Savannah College of Art and Design, Inc.    1
University of Washington/UW Medicine        1
CORE Cashless                               1
City of Tucson, AZ                          1
Sterling BackCheck                          1
Name: count, Length: 1302, dtype: int64

In [5]:
sha_= df[df['Organization Name'] == 'Seattle Housing Authority']
sha_

Unnamed: 0,Date Reported,Organization Name,Date of Breach,Number of Washingtonians Affected,Information Compromised
191,11/10/23,Seattle Housing Authority,8/9/23,32007,Name; Social Security Number; Financial & Bank...
1096,2/22/19,Seattle Housing Authority,1/7/19,720,Name; Financial & Banking Information
1247,4/25/17,Seattle Housing Authority,,1500,Unknown


In [6]:
Premera_Cross= df[df['Organization Name'] == 'Premera Blue Cross']
Premera_Cross

Unnamed: 0,Date Reported,Organization Name,Date of Breach,Number of Washingtonians Affected,Information Compromised
311,7/10/23,Premera Blue Cross,5/31/23,33237,Name; Full Date of Birth; Health Insurance Pol...
378,3/5/23,Premera Blue Cross,1/28/23,36211,Name; Full Date of Birth; Health Insurance Pol...
500,7/18/22,Premera Blue Cross,4/27/22,1303,Name; Health Insurance Policy or ID Number; Me...


In [7]:
## Attempt #1
#tokenizing information compromised... potential issue is with "'s" #=====df['tokens'] = df['Information Compromised'].apply(lambda x: list(tokenize(x, lower=True)))

## Attempt #2 (fixed) 
df['Tokens'] = df['Information Compromised'].apply(lambda x: x.split(";"))
df

Unnamed: 0,Date Reported,Organization Name,Date of Breach,Number of Washingtonians Affected,Information Compromised,Tokens
0,9/9/24,"Kemper Sports Management, LLC",4/1/24,1929,Name; Social Security Number; Driver's License...,"[Name, Social Security Number, Driver's Lice..."
1,8/13/24,Kootenai Health,2/22/24,94530,Name; Social Security Number; Driver's License...,"[Name, Social Security Number, Driver's Lice..."
2,8/5/24,QuoteWizard,5/26/24,717781,Name; Driver's License or Washington ID Card N...,"[Name, Driver's License or Washington ID Card..."
3,8/3/24,Change Healthcare Inc.,2/17/24,501,Name; Social Security Number; Driver's License...,"[Name, Social Security Number, Driver's Lice..."
4,7/26/24,"HealthEquity, Inc.",3/9/24,90185,Name; Social Security Number; Full Date of Bir...,"[Name, Social Security Number, Full Date of ..."
...,...,...,...,...,...,...
1357,9/18/15,"Molina Healthcare of Washington, Inc.",3/26/15,7767,Name; Medical Information; Other,"[Name, Medical Information, Other]"
1358,9/9/15,The Lifetime Healthcare Companies,12/23/13,Unknown,Unknown,[Unknown]
1359,8/21/15,"Olympia Hotel Management, LLC",,30,Name; Financial & Banking Information,"[Name, Financial & Banking Information]"
1360,8/18/15,Web.com Group,,1595,Name; Financial & Banking Information; Other,"[Name, Financial & Banking Information, Other]"


In [8]:
model = Word2Vec(df['Tokens'].values)

In [9]:
print(model)

Word2Vec<vocab=20, vector_size=100, alpha=0.025>


In [10]:
words = list(model.wv.key_to_index)
words

['Name',
 ' Full Date of Birth',
 ' Social Security Number',
 ' Other',
 ' Financial & Banking Information',
 ' Medical Information',
 " Driver's License or Washington ID Card Number",
 ' Health Insurance Policy or ID Number',
 ' Passport Number',
 ' Email Address and Password/Security Question Answers',
 ' Username and Password/Security Question Answers',
 ' Student ID Number',
 ' Military ID Number',
 'Financial & Banking Information',
 ' Biometric Data',
 ' Unique Private Key (e.g. used to authenticate or sign an electronic record)',
 'Unknown',
 'Other',
 'Social Security Number',
 'Username and Password/Security Question Answers']

In [11]:
for data in words:
    df[data] = df['Information Compromised'].apply(lambda x: 1 if data in x else 0)

In [12]:
counts_per_organization = df.groupby('Organization Name')[words].sum()
counts_per_organization

Unnamed: 0_level_0,Name,Full Date of Birth,Social Security Number,Other,Financial & Banking Information,Medical Information,Driver's License or Washington ID Card Number,Health Insurance Policy or ID Number,Passport Number,Email Address and Password/Security Question Answers,Username and Password/Security Question Answers,Student ID Number,Military ID Number,Financial & Banking Information,Biometric Data,Unique Private Key (e.g. used to authenticate or sign an electronic record),Unknown,Other,Social Security Number,Username and Password/Security Question Answers
Organization Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1st Source Bank,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
"20/20 Eye Care Network, Inc.",1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
21st Century Oncology,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0
32 Bar Blues LLC,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5th Avenue Theatre Association,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"loanDepot.com, LLC",1,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0
"mscripts, LLC",1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
prAna,1,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1
"talentReef, Inc.",1,0,1,1,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0
