In [13]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from user_agents import parse
from urllib.parse import urlparse
import re
from sklearn.impute import SimpleImputer

# Funções de Transformação

# 1. Extrair a porta do host e remover o campo 'host'
def extract_port(df):
    df['port'] = df['host'].apply(lambda x: urlparse(f"http://{x}").port)
    return df.drop(columns=['host'])

# 2. Extrair informações de segurança da URL
def extract_url_info(df):
    def detect_patterns(text):
        return {
            "sql_injection": bool(re.search(r"\b(SELECT|DROP|INSERT|UPDATE|DELETE)\b\s*[\(\)=;'\"-]", text, re.IGNORECASE)),
            "xss_attack": bool(re.search(r"(<script>|alert|onload)", text, re.IGNORECASE)),
            "path_traversal": bool(re.search(r"(\.\./|%2e%2e)", text, re.IGNORECASE)),
            "hex_encoding": bool(re.search(r"%27|%3B|%3D|%22", text, re.IGNORECASE)),
        }

    # Aplicar a detecção de padrões tanto na URL quanto no content
    attack_flags_url = df['URL'].apply(lambda url: pd.Series(detect_patterns(url)))
    attack_flags_content = df['content'].apply(
        lambda content: pd.Series(detect_patterns(content)) if pd.notna(content) else pd.Series({
            "sql_injection": False,
            "xss_attack": False,
            "path_traversal": False,
            "hex_encoding": False
        })
    )

    # Combinar os resultados das colunas URL e content
    df['sql_injection'] = (attack_flags_url['sql_injection'] | attack_flags_content['sql_injection']).astype(int)
    df['xss_attack'] = (attack_flags_url['xss_attack'] | attack_flags_content['xss_attack']).astype(int)
    df['path_traversal'] = (attack_flags_url['path_traversal'] | attack_flags_content['path_traversal']).astype(int)
    df['hex_encoding'] = (attack_flags_url['hex_encoding'] | attack_flags_content['hex_encoding']).astype(int)

    # Contar parâmetros na URL
    df['param_count'] = df['URL'].apply(lambda url: len(urlparse(url).query.split('&')) if urlparse(url).query else 0)
    
    return df.drop(columns=['URL'])

def accept_transformation(df):
    df['accept_present'] = df['Accept'].notna().astype(int)
    return df.drop(columns=['Accept'])

def content_transformation(df):
    df = df.copy()
    df['content'] = df['content'].fillna('')

    # Calcular o comprimento do conteúdo
    df['content_length'] = df['content'].apply(len)
    
    # Contar o número de parâmetros no conteúdo
    df['param_count_content'] = df['content'].apply(
        lambda x: len(urlparse(x).query.split('&')) if urlparse(x).query else 0
    )
    
    return df.drop(columns=['content'])
    
def method_one_hot(df):
    encoder = OneHotEncoder(sparse_output=False, drop='if_binary')
    encoded = encoder.fit_transform(df[['Method']])
    encoded_df = pd.DataFrame(
        encoded, 
        columns=encoder.get_feature_names_out(['Method']),
        index=df.index
    )
    return pd.concat([df.drop(columns=['Method']), encoded_df], axis=1)

def bool_to_int(df):
    for column in df.columns:
        if df[column].dtype == bool:
            df[column] = df[column].astype(int)
    return df

def drop_cols(df):
    # return df.drop(columns=['path_traversal', 'param_count_content'])
    return df.drop(columns=['port'])


# Pipeline de Transformação
preprocess_pipeline = Pipeline([
    ('select_columns', FunctionTransformer(lambda df: df[['Method','URL', 'content', 'Accept']])),
    # ('extract_port', FunctionTransformer(extract_port)),
    ('extract_url_info', FunctionTransformer(extract_url_info)),
    ('accept_transformation', FunctionTransformer(accept_transformation)),
    ('content_transformation', FunctionTransformer(content_transformation)),
    ('method_encoding', FunctionTransformer(method_one_hot)),
    ('bool_to_int', FunctionTransformer(bool_to_int)),
    # ('drop_cols', FunctionTransformer(drop_cols)),
])


In [14]:
df = pd.read_csv('data/logs.csv')

In [15]:
df.columns

Index(['Method', 'URL', 'content', 'Accept', 'host', 'classification'], dtype='object')

In [16]:
X = df.drop(columns=['classification'])
y = df['classification']

X_transformed = preprocess_pipeline.fit_transform(X)


In [17]:
X_transformed.head()

Unnamed: 0,sql_injection,xss_attack,path_traversal,hex_encoding,param_count,accept_present,content_length,param_count_content,Method_GET,Method_POST,Method_PUT
0,0,0,0,0,0,1,0,0,1.0,0.0,0.0
1,0,0,0,0,5,1,0,0,1.0,0.0,0.0
2,0,0,0,0,0,1,68,0,0.0,1.0,0.0
3,0,0,0,0,5,1,0,0,1.0,0.0,0.0
4,0,0,0,0,0,1,63,0,0.0,1.0,0.0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [6]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [7]:
y_pred = model.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print(f"Acurácia: {accuracy_score(y_test, y_pred)}")
print(f"Precisão: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")

Acurácia: 0.776331527474847
Precisão: 0.7999369383572442
Recall: 0.7265177548682703


In [19]:
#value_counts in percent
df['classification'].value_counts(normalize=True)

classification
0    0.506578
1    0.493422
Name: proportion, dtype: float64

In [18]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [12, 14, 16, 18, 20, 22],
    'min_samples_split': [8, 10, 12, 14],
    'n_estimators': [4, 6, 8, 10, 12],
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

{'max_depth': 20, 'min_samples_split': 12, 'n_estimators': 10}


In [19]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Acurácia: {accuracy_score(y_test, y_pred)}")
print(f"Precisão: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")

Acurácia: 0.7782311967916696
Precisão: 0.8019224708477781
Recall: 0.7286655211912944


In [22]:
# calculate feature importance
feature_importances = best_model.feature_importances_
features = X_train.columns

importances = pd.Series(feature_importances, index=features).sort_values(ascending=False)
print(importances)

content_length         0.487535
param_count            0.176336
hex_encoding           0.156163
xss_attack             0.070797
Method_POST            0.044944
Method_GET             0.025467
path_traversal         0.025140
accept_present         0.008593
Method_PUT             0.004930
sql_injection          0.000095
param_count_content    0.000000
dtype: float64


In [47]:
from urllib.parse import urlparse, parse_qs

# Exemplo de dados do log
logs = [
    '192.168.100.107 "GET / HTTP/1.1" "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" "-" "192.168.100.121"',
    '192.168.100.107 "GET /icons/openlogo-75.png HTTP/1.1" "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8" "-" "192.168.100.121"',
    '192.168.100.107 "GET /favicon.ico HTTP/1.1" "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8" "-" "192.168.100.121"',
    '192.168.100.107 "GET / HTTP/1.1" "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" "-" "192.168.100.121"',
    '192.168.100.107 "GET /publico/anadir.jsp?id=2&nombre=Jam%C3%B3n+Ib%C3%A9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+--&B1=A%C3%B1adir+al+carrito HTTP/1.1" "*/*" "-" "192.168.100.121"',
    '192.168.100.107 "GET /publico/autenticar.jsp?modo=entrar&login=%3Cscript%3Ealert%28%27XSS%27%29%3C%2Fscript%3E&pwd=password&remember=on&B1=Entrar HTTP/1.1" "*/*" "-" "192.168.100.121"',
    '192.168.100.107 "GET /publico/caracteristicas.jsp?id=..%2F..%2Fetc%2Fpasswd HTTP/1.1" "*/*" "-" "192.168.100.121"',
    '192.168.100.107 "GET /publico/registro.jsp?modo=registro&login=user&password=pass&nombre=test&direccion=127.0.0.1%3B+ls+-la%3B&ciudad=City&cp=12345&provincia=TestProvince&ntc=1234567890123456&B1=Registrar HTTP/1.1" "*/*" "-" "192.168.100.121"'
]

def logs_to_df(logs):
    
    # Expressão regular para extrair dados do log
    pattern = r'(\S+) "(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH) ([^ ]+) HTTP/[^"]+" "([^"]+)" "-" "([^"]+)"'

    # Dados extraídos
    data = []

    for log in logs:
        match = re.search(pattern, log)
        if match:
            ip = match.group(1)
            method = match.group(2)
            url = match.group(3)
            accept = match.group(4)
            host = match.group(5)
            
            # Parse URL to separate path and query parameters
            parsed_url = urlparse(url)
            path = parsed_url.path
            query_params = parse_qs(parsed_url.query)
            
            # Convert query parameters to a string format for 'content'
            content = '&'.join([f"{key}={','.join(value)}" for key, value in query_params.items()])
            
            # Adiciona os dados extraídos para a lista
            data.append([method, path, content, accept, host])

    # Criar o DataFrame
    return pd.DataFrame(data, columns=['Method', 'URL', 'content', 'Accept', 'host'])

# Exibir o DataFrame
X2 = logs_to_df(logs)
X2

Unnamed: 0,Method,URL,content,Accept,host
0,GET,/,,"text/html,application/xhtml+xml,application/xm...",192.168.100.121
1,GET,/icons/openlogo-75.png,,"image/avif,image/webp,image/apng,image/svg+xml...",192.168.100.121
2,GET,/favicon.ico,,"image/avif,image/webp,image/apng,image/svg+xml...",192.168.100.121
3,GET,/,,"text/html,application/xhtml+xml,application/xm...",192.168.100.121
4,GET,/publico/anadir.jsp,id=2&nombre=Jamón Ibérico&precio=85&cantidad='...,*/*,192.168.100.121
5,GET,/publico/autenticar.jsp,modo=entrar&login=<script>alert('XSS')</script...,*/*,192.168.100.121
6,GET,/publico/caracteristicas.jsp,id=../../etc/passwd,*/*,192.168.100.121
7,GET,/publico/registro.jsp,modo=registro&login=user&password=pass&nombre=...,*/*,192.168.100.121


In [48]:
X_transformed2 = preprocess_pipeline.fit_transform(X2)
X_transformed2

Unnamed: 0,sql_injection,xss_attack,path_traversal,hex_encoding,param_count,accept_present,content_length,param_count_content,Method_GET
0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,0,1,0,0,1
2,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,1,0,0,1
4,0,0,0,0,0,1,92,0,1
5,0,1,0,0,0,1,82,0,1
6,0,0,1,0,0,1,19,0,1
7,0,0,0,0,0,1,157,0,1


In [49]:
X_transformed.columns

Index(['sql_injection', 'xss_attack', 'path_traversal', 'hex_encoding',
       'param_count', 'accept_present', 'content_length',
       'param_count_content', 'Method_GET', 'Method_POST', 'Method_PUT'],
      dtype='object')

In [50]:
X_transformed2.columns

Index(['sql_injection', 'xss_attack', 'path_traversal', 'hex_encoding',
       'param_count', 'accept_present', 'content_length',
       'param_count_content', 'Method_GET'],
      dtype='object')

In [51]:
X_transformed2['Method_POST'] = 0
X_transformed2['Method_PUT'] = 0
# X_transformed['port'] = 80

In [52]:
X_transformed2

Unnamed: 0,sql_injection,xss_attack,path_traversal,hex_encoding,param_count,accept_present,content_length,param_count_content,Method_GET,Method_POST,Method_PUT
0,0,0,0,0,0,1,0,0,1,0,0
1,0,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,0,1,0,0,1,0,0
3,0,0,0,0,0,1,0,0,1,0,0
4,0,0,0,0,0,1,92,0,1,0,0
5,0,1,0,0,0,1,82,0,1,0,0
6,0,0,1,0,0,1,19,0,1,0,0
7,0,0,0,0,0,1,157,0,1,0,0


In [53]:
y_pred2 = best_model.predict(X_transformed2)

In [54]:
logs

['192.168.100.107 "GET / HTTP/1.1" "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" "-" "192.168.100.121"',
 '192.168.100.107 "GET /icons/openlogo-75.png HTTP/1.1" "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8" "-" "192.168.100.121"',
 '192.168.100.107 "GET /favicon.ico HTTP/1.1" "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8" "-" "192.168.100.121"',
 '192.168.100.107 "GET / HTTP/1.1" "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" "-" "192.168.100.121"',
 '192.168.100.107 "GET /publico/anadir.jsp?id=2&nombre=Jam%C3%B3n+Ib%C3%A9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+--&B1=A%C3%B1adir+al+carrito HTTP/1.1" "*/*" "-" "192.168.100.121"',
 '192.168.100.107 "GET /publico/autenticar.jsp?modo=entrar&login=%3Cscript%3Ealert%28%27XSS%27%29%3C%2Fscript%3E&pwd=password

In [55]:
pd.DataFrame(y_pred2, columns=['classification'])

Unnamed: 0,classification
0,0
1,0
2,0
3,0
4,1
5,1
6,1
7,1


In [56]:
logs2 = [ 
    '192.168.15.9 "GET /public/anadir.jsp?id=2&name=Manchego+Cheese&price=50&quantity=%27+OR+%271%27%3D%271&B1=Buy HTTP/1.1" "*/*" "-" "192.168.15.12"',
    '192.168.15.9 "GET /public/authenticate.jsp?mode=login&username=%3Csvg+onload%3Dalert%28%27XSS%27%29%3E&password=mypassword&remember=off&B1=Sign+in HTTP/1.1" "*/*" "-" "192.168.15.12"',
    '192.168.15.9 "GET /public/features.jsp?id=..%2F..%2Fconfidential.txt HTTP/1.1" "*/*" "-" "192.168.15.12"',
    '192.168.15.9 "GET /public/register.jsp?mode=register&username=user3&password=pass3&first_name=user3&address=192.168.0.1%3B+rm+-rf+%2F%3B&city=Village&postal_code=67890&state=SampleState&credit_card=4333333333333333&B1=Submit HTTP/1.1" "*/*" "-" "192.168.15.12"'
]

X3 = logs_to_df(logs2)

In [58]:
X_transformed3 = preprocess_pipeline.fit_transform(X3)

In [60]:
X_transformed3['Method_POST'] = 0
X_transformed3['Method_PUT'] = 0

In [61]:
y_pred3 = best_model.predict(X_transformed3)

In [62]:
pd.DataFrame(y_pred3, columns=['classification'])

Unnamed: 0,classification
0,1
1,1
2,1
3,1
