<a href="https://colab.research.google.com/github/Yuumiera/Web-Trafik-Loglar-na-Dayal-Yapay-Zeka-Destekli-Soru-Cevap-Sistemi/blob/main/Q%26A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [226]:
!pip install faiss-cpu
!pip install sentence-transformers
import re
import pandas as pd
from datetime import datetime
from sentence_transformers import SentenceTransformer

# Compiled once at definition time rather than on every call.
# The HTTP version is matched generically (e.g. 1.0, 1.1, 2.0, 2) with the dot
# escaped, so non-1.1 requests are no longer silently discarded and strings
# such as "HTTP/1x1" are no longer accepted by accident.
_LOG_LINE_PATTERN = re.compile(
    r'(?P<ip>\S+) - - \[(?P<time>[^\]]+)\] '
    r'"(?P<method>\S+) (?P<page>\S+) HTTP/\d(?:\.\d)?" '
    r'(?P<status>\d+) (?P<size>\d+)'
)


def parse_log_line(line):
    """Parse one access-log line into its named fields.

    The line must start with ``<ip> - - [<time>] "<method> <page> HTTP/<ver>"
    <status> <size>``; anything after the size field is ignored because the
    pattern is anchored only at the start of the line.

    Args:
        line: A single raw log line (a trailing newline is fine).

    Returns:
        A dict with the string-valued keys ``ip``, ``time``, ``method``,
        ``page``, ``status`` and ``size``, or ``None`` when the line does not
        match the expected layout.
    """
    match = _LOG_LINE_PATTERN.match(line)
    return match.groupdict() if match else None

def load_log_file(log_file_path):
    """Read an access log and return its parseable lines as a DataFrame.

    The file is streamed line by line (it is not loaded into memory as a
    whole) and every line is handed to ``parse_log_line`` exactly once.
    Lines for which the parser returns a falsy value are skipped.

    Args:
        log_file_path: Path of the log file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed line and
        one column per key of the parser's dict. If no line parses, the
        frame is empty and has no columns.
    """
    records = []
    # Explicit UTF-8 keeps the result independent of the platform default;
    # errors='replace' stops a single malformed byte (e.g. in a request path)
    # from aborting the whole load.
    with open(log_file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            parsed = parse_log_line(line)
            if parsed:
                records.append(parsed)
    return pd.DataFrame(records)

# --- Build the retrieval index over the parsed log lines ---------------------
# NOTE(review): the path points under /content/drive, which presumably needs
# Google Drive to be mounted first; no mount call is visible in this notebook
# view -- confirm.
log_file_path = '/content/drive/MyDrive/large_web_traffic.log'
logs_df = load_log_file(log_file_path)

# Convert the raw timestamp strings to datetime objects. The format string
# contains the literal text " +0000", so a line with any other UTC offset
# raises ValueError here.
# NOTE(review): if no line was parsed, logs_df has no 'time' column and this
# lookup fails -- confirm the input is never empty.
logs_df['time'] = logs_df['time'].apply(lambda x: datetime.strptime(x, "%d/%b/%Y:%H:%M:%S +0000"))

# Sentence-embedding model; the same `model` object is reused later to embed
# user questions.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One space-separated text line per log row (ip, method, page, status, size,
# time). This is both what gets embedded and what is later handed to the
# generator as context.
logs_df['text'] = logs_df.apply(lambda x: f"{x['ip']} {x['method']} {x['page']} {x['status']} {x['size']} {x['time']}", axis=1)
log_vectors = model.encode(logs_df['text'].tolist())

import faiss

# Flat L2 index sized from the embedding width (second axis of log_vectors).
# Vectors are added in DataFrame row order, so a FAISS result id can be used
# as a positional index into logs_df.
dimension = log_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(log_vectors)

# Message (Turkish): "Vector database created and log vectors added."
print("Vektör veri tabanı oluşturuldu ve log vektörleri eklendi.")



Vektör veri tabanı oluşturuldu ve log vektörleri eklendi.


In [227]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


# Load the 't5-small' tokenizer and seq2seq model used by generate_response
# to turn "question + retrieved log lines" into an answer string.
# NOTE(review): no fine-tuning step is visible in this notebook -- confirm the
# stock checkpoint is what is intended for this task.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

def generate_response(question, top_k=5):
    """Answer a question from the most similar log lines.

    The question is embedded with the module-level ``model``, the ``top_k``
    nearest log vectors are looked up in the module-level FAISS ``index``,
    their ``text`` values are joined into a context string, and the
    module-level T5 ``tokenizer`` / ``t5_model`` generate the answer from
    ``"question: <question> context: <context>"``.

    Args:
        question: The natural-language question.
        top_k: Number of nearest log lines to retrieve as context.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    question_vector = model.encode([question])[0].reshape(1, -1)

    _, I = index.search(question_vector, top_k)
    # FAISS pads the id array with -1 when the index holds fewer than top_k
    # vectors. Passing -1 to iloc would silently select the LAST log row and
    # pollute the context with an unrelated line, so padding ids are dropped.
    hit_positions = [int(i) for i in I[0] if i >= 0]
    retrieved_logs = logs_df.iloc[hit_positions]
    context = " ".join(retrieved_logs['text'].tolist())

    input_text = f"question: {question} context: {context}"
    # truncation=True cuts the prompt at the tokenizer's maximum length, so
    # trailing context lines may be dropped for long prompts.
    input_ids = tokenizer.encode(input_text, return_tensors='pt', truncation=True)

    outputs = t5_model.generate(input_ids)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [228]:
def ask_question(question):
    """Return the generated answer for ``question``.

    Thin convenience wrapper: forwards the question to ``generate_response``
    using that function's default retrieval settings and hands back its
    result unchanged.
    """
    return generate_response(question)

# Smoke test: run a single question through the pipeline and print the answer.
# NOTE(review): the retriever supplies only the few nearest log lines as
# context, so an aggregate question like this one ("most logins") cannot be
# answered by counting over the whole log -- confirm the output is meaningful.
question = "Which IP address has the most logins?"
response = ask_question(question)
print(response)




192.168.0.100


In [229]:
import numpy as np

# Questions used for the accuracy check below; paired by position with
# expected_answers.
test_questions = [
    "When was the homepage accessed?",
    "How many times did 192.168.1.1 access the site?",
    "Which IP address has the most logins?"
]

# Reference answers, compared to the model output by exact string equality.
# NOTE(review): "503 4887" reads like a status code and response size from a
# log line rather than an access count, which suggests these values were
# copied from model output instead of derived from the log -- confirm the
# ground truth independently, otherwise the accuracy figure is circular.
expected_answers = [
    "2024-07-23",
    "503 4887",
    "192.168.0.100"
]


# One generated answer per test question, in the same order.
generated_responses = [ask_question(question) for question in test_questions]

def evaluate_accuracy(expected, generated):
    """Return the fraction of expected answers that were reproduced exactly.

    Answers are paired by position and compared with ``==`` (no
    normalisation). The denominator is ``len(expected)``, so an expected
    answer without a corresponding generated answer counts as incorrect,
    and surplus generated answers are ignored.

    Args:
        expected: Sequence of reference answers.
        generated: Sequence of model answers, in the same order.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when ``expected`` is empty.
    """
    total = len(expected)
    if total == 0:
        # Nothing to score: report 0.0 rather than raising ZeroDivisionError.
        return 0.0
    correct = sum(e == g for e, g in zip(expected, generated))
    return correct / total

# Overall exact-match accuracy, printed as a percentage with two decimals.
accuracy = evaluate_accuracy(expected_answers, generated_responses)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


# Per-question breakdown so each expected/generated pair can be inspected.
# Note: the loop variables (question, expected, generated) are module-level
# names and keep their last values after the loop; `question` overwrites the
# variable set in the earlier demo cell.
for question, expected, generated in zip(test_questions, expected_answers, generated_responses):
    print(f"Question: {question}")
    print(f"Expected Answer: {expected}")
    print(f"Generated Answer: {generated}")
    print()


Model Accuracy: 100.00%
Question: When was the homepage accessed?
Expected Answer: 2024-07-23
Generated Answer: 2024-07-23

Question: How many times did 192.168.1.1 access the site?
Expected Answer: 503 4887
Generated Answer: 503 4887

Question: Which IP address has the most logins?
Expected Answer: 192.168.0.100
Generated Answer: 192.168.0.100

