In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the attack log into a DataFrame and preview the first five records.
df = pd.read_csv("cybersecurity_attacks.csv")
df.head(5)

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Proxy Information,Firewall Logs,IDS/IPS Alerts,Log Source
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",150.9.97.135,Log Data,,Server
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",,Log Data,,Firewall
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",114.133.48.179,Log Data,Alert Data,Firewall
3,2023-07-02 10:38:46,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,Blocked,Medium,Fateh Kibe,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ...,Segment B,"Jaunpur, Rajasthan",,,Alert Data,Firewall
4,2023-07-16 13:11:07,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,Blocked,Low,Dhanush Chad,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...,Segment C,"Anantapur, Tripura",149.6.110.119,,Alert Data,Firewall


In [3]:
# Missing values per column, columns with the most gaps listed first.
df.isna().sum().sort_values(ascending=False)

IDS/IPS Alerts            20050
Malware Indicators        20000
Firewall Logs             19961
Proxy Information         19851
Attack Type                   0
Geo-location Data             0
Network Segment               0
Device Information            0
User Information              0
Severity Level                0
Action Taken                  0
Attack Signature              0
Timestamp                     0
Source IP Address             0
Anomaly Scores                0
Payload Data                  0
Traffic Type                  0
Packet Type                   0
Packet Length                 0
Protocol                      0
Destination Port              0
Source Port                   0
Destination IP Address        0
Log Source                    0
dtype: int64

In [4]:
# Each row of the DataFrame is treated as one attack, so the row count is the attack count.
num_attacks = df.shape[0]

num_attacks

40000

In [5]:
# Peek at the first ten distinct 'Geo-location Data' values to see whether they name countries.
pd.unique(df['Geo-location Data'])[:10]

array(['Jamshedpur, Sikkim', 'Bilaspur, Nagaland', 'Bokaro, Rajasthan',
       'Jaunpur, Rajasthan', 'Anantapur, Tripura',
       'Aurangabad, Meghalaya', 'Eluru, Manipur',
       'Phagwara, Andhra Pradesh', 'Ambala, Tripura', 'Rampur, Mizoram'],
      dtype=object)

In [6]:
# Use the 'Geo-location Data' column to find where most attacks occur.
# The sampled values of this column look like "City, State" strings
# (e.g. 'Jamshedpur, Sikkim'), not countries, so the most frequent value is
# the most-attacked location, not a country.
most_attacks_location = df['Geo-location Data'].value_counts().idxmax()

# Backward-compatible alias: the original (misleading) name is kept in case
# other cells refer to it.
most_attacks_country = most_attacks_location

most_attacks_location

'Ghaziabad, Meghalaya'

In [7]:
# Collect the distinct categories recorded in the "Attack Type" column.
attack_types = pd.unique(df["Attack Type"])
attack_types

array(['Malware', 'DDoS', 'Intrusion'], dtype=object)

In [8]:
# Inspect up to ten distinct 'Alerts/Warnings' values to learn how the column is encoded.
pd.unique(df['Alerts/Warnings'])[:10]

array([nan, 'Alert Triggered'], dtype=object)

In [9]:
# Look at the first ten distinct users on rows where 'Alerts/Warnings' is populated,
# to see whether 'User Information' relates to alerts in any obvious way.
df.loc[df['Alerts/Warnings'].notna(), 'User Information'].unique()[:10]

array(['Himmat Karpe', 'Fateh Kibe', 'Dhanush Chad', 'Yuvaan Dubey',
       'Zaina Iyer', 'Mishti Chaudhuri', 'Hunar Sem', 'Vaibhav Kala',
       'Inaaya  Soman', 'Shaan Subramaniam'], dtype=object)

In [10]:
# Break down the rows that triggered an alert by their severity level.
alert_severity = df.loc[
    df['Alerts/Warnings'] == 'Alert Triggered', 'Severity Level'
].value_counts()

alert_severity

Severity Level
Medium    6682
High      6682
Low       6569
Name: count, dtype: int64

In [11]:
# Break down the rows that triggered an alert by network protocol.
alert_protocol = df['Protocol'][df['Alerts/Warnings'] == 'Alert Triggered'].value_counts()

alert_protocol

Protocol
ICMP    6728
UDP     6663
TCP     6542
Name: count, dtype: int64

In [12]:
# Summary statistics of 'Packet Length', restricted to rows where an alert was triggered.
alert_packet_length = df.loc[
    df['Alerts/Warnings'] == 'Alert Triggered', 'Packet Length'
].describe()

alert_packet_length

count    19933.000000
mean       778.208699
std        415.710721
min         64.000000
25%        417.000000
50%        778.000000
75%       1138.000000
max       1500.000000
Name: Packet Length, dtype: float64

In [13]:
# Tally how often each 'Traffic Type' value appears across the recorded attack rows.
traffic_types_attacked = df.loc[:, 'Traffic Type'].value_counts()

traffic_types_attacked

Traffic Type
DNS     13376
HTTP    13360
FTP     13264
Name: count, dtype: int64

In [14]:
# Parse 'Timestamp' into datetimes and store the parsed values back on the frame.
timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = timestamps

# Derive the hour of day for every record as a new 'Hour' column.
df['Hour'] = timestamps.dt.hour

# The hour with the highest record count is the hour with the most attacks.
hour_counts = df['Hour'].value_counts()
most_attacks_hour = hour_counts.idxmax()

most_attacks_hour

13