### Preamble
Scrapy Packet Manipulation

In [80]:
from scapy.all import sniff
from scapy.all import rdpcap
import plotly.express as px
import plotly.graph_objects as go

In [24]:
packets = sniff(count=25)

print(type(packets))
print(f'Found {len(packets)} packets.')
print(packets)

<class 'scapy.plist.PacketList'>
Found 25 packets.
<Sniffed: TCP:0 UDP:25 ICMP:0 Other:0>


In [25]:
first_packet = packets[0]
print(type(first_packet))

<class 'scapy.layers.l2.Ether'>


In [35]:
for packet in packets[:5]:
    print("\n" + "=" * 50)
    print(f"Packet {packets.index(packet) + 1}:")
    print("=" * 50)
    packet.show()


Packet 1:
###[ Ethernet ]###
  dst       = 0e:12:f6:8c:ab:5e
  src       = 20:f3:75:b3:81:c0
  type      = IPv4
###[ IP ]###
     version   = 4
     ihl       = 5
     tos       = 0x0
     len       = 1210
     id        = 26874
     flags     = DF
     frag      = 0
     ttl       = 50
     proto     = udp
     chksum    = 0xf9f3
     src       = 66.22.222.125
     dst       = 192.168.0.9
     \options   \
###[ UDP ]###
        sport     = 50024
        dport     = 61905
        len       = 1190
        chksum    = 0xb75b
###[ Raw ]###
           load      = b'\x90g<}\xd1\xbb\xe3\x87\x00\x02\x84*\xbe\xde\x00\x03\xbfd\x8a\x89\xbf\xea\x87\xff\x0c\xe0\xca\x87_\x91\x81\x87\xd6\xcc\x8b\x0fV\xc9\xf8\xb5\x80\xea\xbd\t\xd0\xc3\xa0-j+\xd4X\x01=H;\xeb\x1f\xba\xaff\x0c\x1b\x88\r\xb7\xba\xab\xb5\xbdS\xf8h\xd7\x14^\xf2@;6\xeb\xbe\xde$\x87p\x00\x003\xc6\x89\x8d\x9f|h? G8\xc8+q\xaf\xea\xbc\xd0\x97\x02\xf8t\xb1\xeaL\x8d\xe4\x8bu<~M{\xfc\x1c\xc0ky\x13\xfc\x13\x1b\x0c\x8f\xf4\xe8\xb0\xcb\x0b\xc2\xc0]\

### Statistical Analysis

In [None]:
packets = rdpcap('packets.pcap')

print(f'Found {len(packets)} packets.')

Found 62 packets.


In [102]:
import pandas as pd
from scapy.all import rdpcap, IP, TCP, UDP

packets = rdpcap('packets.pcap')

packet_rows = []

for packet in packets:
    try:
        if packet.haslayer(IP):
            src_ip = packet[IP].src
            dst_ip = packet[IP].dst
            
            if packet.haslayer(TCP):
                src_port = packet[TCP].sport
                dst_port = packet[TCP].dport
                protocol = "TCP"
                payload = bytes(packet[TCP].payload) if packet[TCP].payload else b''
            elif packet.haslayer(UDP):
                src_port = packet[UDP].sport
                dst_port = packet[UDP].dport
                protocol = "UDP"
                payload = bytes(packet[UDP].payload) if packet[UDP].payload else b''
            else:
                src_port = None
                dst_port = None
                protocol = f"Other ({packet[IP].proto})"
                payload = b''
            
            payload_length = len(packet)
            timestamp = float(packet.time)
            
            packet_rows.append({
                'src_ip': src_ip,
                'dst_ip': dst_ip,
                'src_port': src_port, 
                'dst_port': dst_port,
                'protocol': protocol,
                'payload_length': payload_length,
                'payload': payload,  
                'timestamp': timestamp
            })
    except Exception as e:
        print(f"Skipping packet due to error: {e}")
        continue

packet_data = pd.DataFrame(packet_rows)
packet_data.head()

Unnamed: 0,src_ip,dst_ip,src_port,dst_port,protocol,payload_length,payload,timestamp
0,10.1.10.53,84.54.22.33,53,53,UDP,975,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
1,84.54.22.33,10.1.10.53,53,53,UDP,98,b'\x00\x0c\x81\x80\x00\x01\x00\x01\x00\x00\x00...,1532199000.0
2,10.1.10.53,84.54.22.33,53,53,UDP,989,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
3,84.54.22.33,10.1.10.53,53,53,UDP,98,b'\x00\x0c\x81\x80\x00\x01\x00\x01\x00\x00\x00...,1532199000.0
4,10.1.10.53,84.54.22.33,53,53,UDP,1026,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0


In [74]:
import json
print(json.dumps(json.loads(packet_data.iloc[0].to_json()), indent=2))


{
  "src_ip": "10.1.10.53",
  "dst_ip": "84.54.22.33",
  "src_port": 53,
  "dst_port": 53,
  "protocol": "UDP",
  "payload_length": 975,
  "timestamp": 1532199330.917674
}


#### Statistics

In [75]:
print('Unique source IPs: \n', packet_data['src_ip'].unique())
print('Unique destination IPs: \n', packet_data['dst_ip'].unique())

Unique source IPs: 
 ['10.1.10.53' '84.54.22.33' '75.75.75.75']
Unique destination IPs: 
 ['84.54.22.33' '10.1.10.53' '75.75.75.75']


In [76]:
src_ip_frequency = packet_data['src_ip'].value_counts()
dst_ip_frequency = packet_data['dst_ip'].value_counts()

frequency_df = pd.DataFrame({
    'src_ip': src_ip_frequency,
    'dst_ip': dst_ip_frequency
})

frequency_df

Unnamed: 0,src_ip,dst_ip
10.1.10.53,31,31
84.54.22.33,29,29
75.75.75.75,2,2


In [78]:
most_frequent_src_ip = src_ip_frequency.idxmax()

frequent_src_records = packet_data[packet_data['src_ip'] == most_frequent_src_ip]
print('Most frequent source IP:', most_frequent_src_ip)

most_frequent_dst_ip = frequent_src_records['dst_ip'].value_counts().idxmax()
print(f'Most frequent destination IP for {most_frequent_src_ip}: {most_frequent_dst_ip}')

dest_ports = frequent_src_records['dst_port'].unique()
print(f'Destination Ports from {most_frequent_src_ip}: {dest_ports}')

origin_ports = frequent_src_records['src_port'].unique()
print(f'Origin ports from {most_frequent_dst_ip}: {origin_ports}')

Most frequent source IP: 10.1.10.53
Most frequent destination IP for 10.1.10.53: 84.54.22.33
Destination Ports from 10.1.10.53: [53]
Origin ports from 84.54.22.33: [   53 15812 23903]


El puerto más común es el `53`. 

Dicho puerto se utiliza para servicios del Sistema de Nombres de Dominio (DNS). DNS es un protocolo que traduce nombres de dominio legibles por humanos (como example.com) en direcciones IP que las computadoras utilizan para identificarse entre sí en la red.

Es probable que se trate de un servidor DNS que maneja solicitudes de resolución de nombres. Este patrón es típico del tráfico DNS donde los clientes envían solicitudes a un servidor DNS, y el servidor envía respuestas de vuelta.

Source IP vs Total Payload

In [87]:
src_ip_payload = packet_data.groupby('src_ip')['payload_length'].sum().reset_index()
src_ip_payload = src_ip_payload.sort_values('payload_length', ascending=True)

fig_a = px.bar(
    src_ip_payload, 
    x='payload_length', 
    y='src_ip', 
    orientation='h',
    title='Total Payload Sent by Source IP',
    labels={'payload_length': 'Total Payload (bytes)', 'src_ip': 'Source IP'},
    color='payload_length',
    color_continuous_scale='Viridis'
)
fig_a.update_layout(height=500, width=800)
fig_a.show()

Destination IP vs Total Payload

In [88]:
dst_ip_payload = packet_data.groupby('dst_ip')['payload_length'].sum().reset_index()
dst_ip_payload = dst_ip_payload.sort_values('payload_length', ascending=True)

fig_b = px.bar(
    dst_ip_payload, 
    x='payload_length', 
    y='dst_ip', 
    orientation='h',
    title='Total Payload Received by Destination IP',
    labels={'payload_length': 'Total Payload (bytes)', 'dst_ip': 'Destination IP'},
    color='payload_length',
    color_continuous_scale='Viridis'
)
fig_b.update_layout(height=500, width=800)
fig_b.show() 

Source Port vs Total Payload

In [90]:
src_port_payload = packet_data.groupby('src_port')['payload_length'].sum().reset_index()
src_port_payload = src_port_payload.sort_values('payload_length', ascending=True)
src_port_payload['src_port'] = src_port_payload['src_port'].astype(str)  # Convert ports to strings for better display

fig_c = px.bar(
    src_port_payload, 
    x='payload_length', 
    y='src_port', 
    orientation='h',
    title='Total Payload Sent by Source Port',
    labels={'payload_length': 'Total Payload (bytes)', 'src_port': 'Source Port'},
    color='payload_length',
    color_continuous_scale='Viridis'
)
fig_c.update_layout(height=600, width=800)
fig_c.show()  # This displays the plot in the noteboo

Destination Port vs Total Payload

In [91]:
dst_port_payload = packet_data.groupby('dst_port')['payload_length'].sum().reset_index()
dst_port_payload = dst_port_payload.sort_values('payload_length', ascending=True)
dst_port_payload['dst_port'] = dst_port_payload['dst_port'].astype(str)  # Convert ports to strings for better display

fig_d = px.bar(
    dst_port_payload, 
    x='payload_length', 
    y='dst_port', 
    orientation='h',
    title='Total Payload Received by Destination Port',
    labels={'payload_length': 'Total Payload (bytes)', 'dst_port': 'Destination Port'},
    color='payload_length',
    color_continuous_scale='Viridis'
)
fig_d.update_layout(height=600, width=800)
fig_d.show()

e. Time vs Payload for most frequent source IP

In [None]:
src_ip_counts = packet_data['src_ip'].value_counts()
most_frequent_src_ip = src_ip_counts.index[0]

most_freq_ip_data = packet_data[packet_data['src_ip'] == most_frequent_src_ip].copy()
most_freq_ip_data['time'] = pd.to_datetime(most_freq_ip_data['timestamp'], unit='s')

time_span = (most_freq_ip_data['timestamp'].max() - most_freq_ip_data['timestamp'].min())
if time_span > 3600: 
    freq = '10min'  
elif time_span > 600:  
    freq = '1min'
elif time_span > 60:  
    freq = '10s'
else:
    freq = '1s'

print(f'Frequency used: {freq}')

most_freq_ip_data['time_group'] = most_freq_ip_data['time'].dt.floor(freq)

time_grouped_data = most_freq_ip_data.groupby('time_group')['payload_length'].sum().reset_index()

fig_e = px.bar(
    time_grouped_data, 
    x='time_group', 
    y='payload_length',
    title=f'Payload over Time for Most Frequent Source IP ({most_frequent_src_ip})',
    labels={'payload_length': 'Total Payload (bytes)', 'time_group': 'Time'},
    color='payload_length',
    color_continuous_scale='Viridis'
)

fig_e.update_layout(
    height=500, 
    width=1000,
    xaxis_title="Time",
    yaxis_title="Total Payload (bytes)",
    bargap=0.05  
)

fig_e.update_xaxes(
    tickformat="%H:%M:%S",
    tickangle=-45
)

fig_e.show()


Frequency used: 1s


El análisis del tráfico de red revela que la dirección IP `10.1.10.53` está enviando una gran cantidad de datos, aproximadamente 30 mil bytes, hacia la IP `84.54.22.33`, utilizando principalmente el puerto 53, que es típico para el tráfico DNS. Este volumen de datos, junto con un pico de 12,223 bytes en un solo segundo a las 18:55:41, resulta inusual para consultas DNS estándar, las cuales normalmente involucran paquetes pequeños de pocos cientos de bytes. 

La comunicación constante de 800 a 1000 bytes por segundo, combinada con este pico, sugiere que podría estar ocurriendo algo más allá de simples consultas DNS, como un posible caso de DNS tunneling, una técnica que codifica datos en solicitudes DNS para transferir información de manera encubierta. Este comportamiento no es común para el tráfico DNS típico y podría indicar una actividad maliciosa, como la exfiltración de datos o una comunicación con un servidor de comando y control. 



In [103]:
# a. DF with only connections from most frequent source IP
src_ip_frequency = packet_data['src_ip'].value_counts()
most_frequent_src_ip = src_ip_frequency.idxmax()  # 10.1.10.53
df_most_frequent_ip = packet_data[packet_data['src_ip'] == most_frequent_src_ip]

df_most_frequent_ip.head()


Unnamed: 0,src_ip,dst_ip,src_port,dst_port,protocol,payload_length,payload,timestamp
0,10.1.10.53,84.54.22.33,53,53,UDP,975,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
2,10.1.10.53,84.54.22.33,53,53,UDP,989,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
4,10.1.10.53,84.54.22.33,53,53,UDP,1026,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
6,10.1.10.53,84.54.22.33,53,53,UDP,1012,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
8,10.1.10.53,84.54.22.33,53,53,UDP,1017,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0


In [104]:
# b. New DF with columns src, dst and sum of payload grouped by dst
df_grouped = df_most_frequent_ip[['src_ip', 'dst_ip', 'payload', 'payload_length']].rename(
    columns={'src_ip': 'src', 'dst_ip': 'dst'}
).groupby('dst').agg({'src': 'first', 'payload_length': 'sum', 'payload': list})


df_grouped.head()


Unnamed: 0_level_0,src,payload_length,payload
dst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75.75.75.75,10.1.10.53,158,[b'\xda\xd1\x01\x00\x00\x01\x00\x00\x00\x00\x0...
84.54.22.33,10.1.10.53,29197,[b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x0...


In [107]:

# c. Get the destination IP with most exchanged bytes with the most frequent IP
suspicious_ip = df_grouped['payload_length'].idxmax()  # 84.54.22.33

# d. Create DF with the conversation between the most frequent IP and the suspicious IP
conversation_df = packet_data[
    ((packet_data['src_ip'] == most_frequent_src_ip) & (packet_data['dst_ip'] == suspicious_ip)) |
    ((packet_data['src_ip'] == suspicious_ip) & (packet_data['dst_ip'] == most_frequent_src_ip))
].sort_values('timestamp')


conversation_df.head()


Unnamed: 0,src_ip,dst_ip,src_port,dst_port,protocol,payload_length,payload,timestamp
0,10.1.10.53,84.54.22.33,53,53,UDP,975,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
1,84.54.22.33,10.1.10.53,53,53,UDP,98,b'\x00\x0c\x81\x80\x00\x01\x00\x01\x00\x00\x00...,1532199000.0
2,10.1.10.53,84.54.22.33,53,53,UDP,989,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0
3,84.54.22.33,10.1.10.53,53,53,UDP,98,b'\x00\x0c\x81\x80\x00\x01\x00\x01\x00\x00\x00...,1532199000.0
4,10.1.10.53,84.54.22.33,53,53,UDP,1026,b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00...,1532199000.0


In [118]:
# e. Get the payloads from the conversation DF and add them to an array
payloads_array = conversation_df['payload'].tolist()

# f. Show the content of the array
for i, payload in enumerate(payloads_array[:5]):  # Show first 5 for brevity
    print(f"Packet {i+1} payload prefix: {payload[:50]}")  # Show first 50 bytes

Packet 1 payload prefix: b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00\x00\x06google\x03com\x00\x00\x1c\x00\x01\xef\xbf\xbdPNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01b'
Packet 2 payload prefix: b'\x00\x0c\x81\x80\x00\x01\x00\x01\x00\x00\x00\x00\x06google\x03com\x00\x00\x1c\x00\x01\xc0\x0c\x00\x1c\x00\x01\x00\x00\x01+\x00\x10&\x07\xf8\xb0@\x05\x08\x07\x00\x00'
Packet 3 payload prefix: b'\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00\x00\x06google\x03com\x00\x00\x1c\x00\x01:\xef\xbf\xbdle:\xc7\xa9\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\x0c\xef\xbf\xbd'
Packet 4 payload prefix: b'\x00\x0c\x81\x80\x00\x01\x00\x01\x00\x00\x00\x00\x06google\x03com\x00\x00\x1c\x00\x01\xc0\x0c\x00\x1c\x00\x01\x00\x00\x01)\x00\x10&\x07\xf8\xb0@\x05\x08\x07\x00\x00'
Packet 5 payload prefix: b"\x00\x0c\x01\x00\x00\x01\x00\x00\x00\x00\x00\x00\x06google\x03com\x00\x00\x1c\x00\x01\xef\xbf\xbd\xef\xbf\xbd^n\xef\xbf\xbd''\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"


In [117]:
def analyze_dns_payload(payload, packet_num):
    analysis = []
    suspicion_score = 0
    
    is_dns_query = payload.startswith(b'\x00\x0c\x01\x00')
    is_dns_response = payload.startswith(b'\x00\x0c\x81\x80')
    
    if is_dns_query:
        analysis.append(f"DNS Query for 'google.com'")
    elif is_dns_response:
        analysis.append(f"DNS Response for 'google.com'")
    else:
        analysis.append("Not a standard DNS packet")
        suspicion_score += 2
    
    signatures = {
        b'PNG\r\n\x1a\n': "PNG image file",
        b'JFIF': "JPEG image file",
        b'PK\x03\x04': "ZIP archive",
        b'%PDF': "PDF document",
        b'MZ': "Windows executable",
        b'GIF8': "GIF image",
        b'IHDR': "PNG image data chunk",
        b'IDAT': "PNG image data chunk",
        b'IEND': "PNG image end marker"
    }
    
    for sig, desc in signatures.items():
        if sig in payload:
            analysis.append(f"Contains {desc} signature")
            suspicion_score += 5 
            
    if len(payload) > 300:
        analysis.append(f"Unusually large DNS payload ({len(payload)} bytes)")
        suspicion_score += 3
    elif len(payload) > 100:
        analysis.append(f"Larger than normal DNS payload ({len(payload)} bytes)")
        suspicion_score += 1
    
    binary_patterns = sum(1 for b in payload if b > 127)
    binary_ratio = binary_patterns / len(payload) if payload else 0
    if binary_ratio > 0.3:
        analysis.append(f"High ratio of binary data ({binary_ratio:.2f})")
        suspicion_score += 2
    
    try:
        text_content = payload.decode('ascii', errors='ignore')
        text_sample = text_content.replace('\x00', '').strip()
        if len(text_sample) > 20: 
            preview = text_sample[:30] + "..." if len(text_sample) > 30 else text_sample
            analysis.append(f"Contains text data: '{preview}'")
            suspicion_score += 1
    except:
        pass
    
    return {
        "packet_num": packet_num,
        "analysis": analysis,
        "suspicion_score": suspicion_score,
        "payload_size": len(payload)
    }

analysis_results = []
for i, payload in enumerate(payloads_array):
    result = analyze_dns_payload(payload, i+1)
    analysis_results.append(result)

sorted_results = sorted(analysis_results, key=lambda x: x["suspicion_score"], reverse=True)

print("Top 10 Most Suspicious DNS Packets\n" + "="*70)
for result in sorted_results[:10]:
    print(f"Packet #{result['packet_num']} - Suspicion Score: {result['suspicion_score']}")
    print(f"Size: {result['payload_size']} bytes")
    print("Findings:")
    for finding in result["analysis"]:
        print(f"  - {finding}")
    print("-"*70)

print("\nOverall Analysis Summary\n" + "="*70)
total_packets = len(analysis_results)
suspicious_packets = sum(1 for r in analysis_results if r["suspicion_score"] > 3)
print(f"Total packets analyzed: {total_packets}")
print(f"Suspicious packets: {suspicious_packets} ({suspicious_packets/total_packets*100:.1f}%)")

png_packets = sum(1 for r in analysis_results if any("PNG" in a for a in r["analysis"]))
large_packets = sum(1 for r in analysis_results if r["payload_size"] > 300)

print(f"Packets containing PNG data: {png_packets}")
print(f"Unusually large packets: {large_packets}")

Top 10 Most Suspicious DNS Packets
Packet #1 - Suspicion Score: 21
Size: 933 bytes
Findings:
  - DNS Query for 'google.com'
  - Contains PNG image file signature
  - Contains PNG image data chunk signature
  - Contains PNG image data chunk signature
  - Unusually large DNS payload (933 bytes)
  - High ratio of binary data (0.66)
  - Contains text data: 'googlecomPNG

IHDR...'
----------------------------------------------------------------------
Packet #57 - Suspicion Score: 11
Size: 560 bytes
Findings:
  - DNS Query for 'google.com'
  - Contains PNG image end marker signature
  - Unusually large DNS payload (560 bytes)
  - High ratio of binary data (0.67)
  - Contains text data: 'googlecomw[y4Q9c}...'
----------------------------------------------------------------------
Packet #3 - Suspicion Score: 6
Size: 947 bytes
Findings:
  - DNS Query for 'google.com'
  - Unusually large DNS payload (947 bytes)
  - High ratio of binary data (0.67)
  - Contains text data: '

Se desarrolló un sistema de puntuación para identificar paquetes sospechosos basado en múltiples factores: tamaño del payload, presencia de firmas de archivos conocidos, patrones binarios inusuales y contenido de texto. Este enfoque permitió clasificar y ordenar los paquetes según su nivel de sospecha.

Los resultados muestran evidencia de exfiltración de datos mediante técnicas de DNS tunneling, donde se está transmitiendo una imagen PNG fragmentada a través de consultas DNS. Este comportamiento es altamente sospechoso porque: 1) los paquetes DNS tienen un tamaño anormalmente grande (933-1017 bytes cuando deberían ser mucho menores), 2) contienen firmas de archivo PNG y bloques de datos de imagen que no deberían estar presentes en tráfico DNS legítimo, 3) muestran una alta proporción de datos binarios (66-73%), y 4) las consultas siempre se dirigen al mismo dominio (google.com) con información no relacionada con resolución de nombres. Esta técnica es utilizada típicamente por atacantes para evadir controles de seguridad de red y extraer información de manera encubierta.