# Classification Notebook

In [9]:
%pylab inline
import kagglehub
import pylab
import sklearn
import pandas as pd
from sklearn.utils import Bunch
from scipy.io import loadmat
import os

# Cybersecurity data: fetch the Kaggle dataset and get the local folder holding its files
path = kagglehub.dataset_download("atharvasoundankar/global-cybersecurity-threats-2015-2024")

print("Path to dataset files:", path)

# Load the dataset and preview the first few rows.
# os.listdir returns entries in arbitrary order and the folder may contain
# non-CSV files, so pick the CSV explicitly instead of trusting files[0].
files = sorted(os.listdir(path))
print("Files in dataset folder:", files)
csv_files = [name for name in files if name.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in dataset folder: {path}")
file_path = os.path.join(path, csv_files[0])
df = pd.read_csv(file_path)
# NOTE: a bare expression is only rendered when it is the last one in its cell.
df.head()

# Inspect the schema and check for missing values.
# Only the last expression in a cell is rendered automatically, so the
# per-column null counts are printed explicitly; otherwise this check
# would produce no visible output.
df.info()
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Count attacks for every (Year, Target Industry) pair; the resulting
# 'AttackCount' column is what the threat-level labels are derived from.
group_keys = ['Year', 'Target Industry']
attacks_per_group = df.groupby(group_keys).size()
industry_threats = attacks_per_group.reset_index(name='AttackCount')
# To find threat levels, count the number of attacks for each (Year, Target Industry) pair,
# then map each count to one of the threat levels (Low / Medium / High).
def label_threat_level(count, low_max=50, medium_max=200):
    """Map an attack count to a categorical threat level.

    Parameters
    ----------
    count : number
        Number of attacks in the group being labelled.
    low_max : number, optional
        Largest count (inclusive) still labelled 'Low'. Defaults to 50.
    medium_max : number, optional
        Largest count (inclusive) still labelled 'Medium'. Defaults to 200.

    Returns
    -------
    str
        'Low' if ``count <= low_max``, 'Medium' if ``count <= medium_max``,
        otherwise 'High'.
    """
    # Guard clauses: both upper bounds are inclusive.
    if count <= low_max:
        return 'Low'
    if count <= medium_max:
        return 'Medium'
    return 'High'

# Label every (Year, Target Industry) group according to its attack count
level_per_group = industry_threats['AttackCount'].map(label_threat_level)
industry_threats['ThreatLevel'] = level_per_group
industry_threats.head()

# Attach each group's threat label to the individual incident rows
join_keys = ['Year', 'Target Industry']
labels_to_attach = industry_threats[join_keys + ['ThreatLevel']]
df = df.merge(labels_to_attach, how='left', on=join_keys)
df.head()


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


Path to dataset files: /kaggle/input/global-cybersecurity-threats-2015-2024
Files in dataset folder: ['Global_Cybersecurity_Threats_2015-2024.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Country                              3000 non-null   object 
 1   Year                                 3000 non-null   int64  
 2   Attack Type                          3000 non-null   object 
 3   Target Industry                      3000 non-null   object 
 4   Financial Loss (in Million $)        3000 non-null   float64
 5   Number of Affected Users             3000 non-null   int64  
 6   Attack Source                        3000 non-null   object 
 7   Security Vulnerability Type          3000 non-null   object 
 8   Defense Mechanism Used               3000 non-null   object 
 9   Incident Resol

Unnamed: 0,Country,Year,Attack Type,Target Industry,Financial Loss (in Million $),Number of Affected Users,Attack Source,Security Vulnerability Type,Defense Mechanism Used,Incident Resolution Time (in Hours),ThreatLevel
0,China,2019,Phishing,Education,80.53,773169,Hacker Group,Unpatched Software,VPN,63,Low
1,China,2019,Ransomware,Retail,62.19,295961,Hacker Group,Unpatched Software,Firewall,71,Low
2,India,2017,Man-in-the-Middle,IT,38.65,605895,Hacker Group,Weak Passwords,VPN,20,Medium
3,UK,2024,Ransomware,Telecommunications,41.44,659320,Nation-state,Social Engineering,AI-based Detection,7,Low
4,Germany,2018,Man-in-the-Middle,IT,74.41,810682,Insider,Social Engineering,VPN,68,Low
