Aşama 1: Veri Hazırlığı ve Ön İşleme

In [None]:
!pip install faker



In [None]:
import random
from faker import Faker
from datetime import datetime, timedelta

fake = Faker()


LOG_LINE_COUNT = 20


HTTP_METHODS = ['GET', 'POST', 'PUT', 'DELETE']
HTTP_STATUS_CODES = [200, 201, 301, 400, 401, 403, 404, 500, 502, 503]
URLS = [
    '/home',
    '/about',
    '/contact',
    '/products',
    '/products/item1',
    '/products/item2',
    '/api/data',
    '/login',
    '/signup'
]
USER_AGENTS = [
    fake.firefox,
    fake.chrome,
    fake.safari,
    fake.internet_explorer,
    fake.opera
]

def generate_log_line():
    ip_address = fake.ipv4()
    identity = '-'
    userid = '-'
    time = (datetime.now() - timedelta(seconds=random.randint(0, 86400))).strftime('%d/%b/%Y:%H:%M:%S %z')
    method = random.choice(HTTP_METHODS)
    resource = random.choice(URLS)
    protocol = 'HTTP/1.1'
    status_code = random.choice(HTTP_STATUS_CODES)
    response_size = random.randint(200, 5000)
    referrer = fake.url()
    user_agent = random.choice(USER_AGENTS)()

    log_line = f'{ip_address} {identity} {userid} [{time}] "{method} {resource} {protocol}" {status_code} {response_size} "{referrer}" "{user_agent}"'
    return log_line


with open('web_traffic.log', 'w') as log_file:
    for _ in range(LOG_LINE_COUNT):
        log_line = generate_log_line()
        log_file.write(log_line + '\n')



In [None]:
import pandas as pd
import re


with open("web_traffic.log") as file:
    log_lines = file.readlines()


log_entries = []


for line in log_lines:
    match = re.match(r'(\S+) - - \[(.*?)\] "(.*?)" (\d{3}) (\d+) "(.*?)" "(.*?)"', line)
    if match:
        log_entries.append({
            "ip": match.group(1),
            "timestamp": match.group(2),
            "request": match.group(3),
            "status_code": match.group(4),
            "response_size": match.group(5),
            "referrer": match.group(6),
            "user_agent": match.group(7),
        })


df_logs = pd.DataFrame(log_entries)
df_logs.head()
print(df_logs)




                 ip              timestamp                    request  \
0   135.226.194.162  22/Aug/2024:14:43:50       DELETE /home HTTP/1.1   
1    111.33.112.105  22/Aug/2024:00:21:44         PUT /login HTTP/1.1   
2     135.103.209.4  21/Aug/2024:23:17:01        PUT /signup HTTP/1.1   
3      119.42.32.94  22/Aug/2024:06:49:47       PUT /contact HTTP/1.1   
4       19.23.86.56  21/Aug/2024:17:46:02      DELETE /login HTTP/1.1   
5    28.253.133.198  22/Aug/2024:15:01:49          GET /home HTTP/1.1   
6   209.101.228.137  22/Aug/2024:01:02:15      POST /contact HTTP/1.1   
7    191.10.210.131  22/Aug/2024:13:53:28        POST /login HTTP/1.1   
8    31.113.104.211  22/Aug/2024:16:44:25     DELETE /signup HTTP/1.1   
9     173.71.23.229  22/Aug/2024:11:08:37      PUT /api/data HTTP/1.1   
10   117.229.24.116  21/Aug/2024:23:42:44   DELETE /api/data HTTP/1.1   
11    69.231.229.15  22/Aug/2024:03:59:50        GET /signup HTTP/1.1   
12     83.172.60.61  21/Aug/2024:22:10:23       PUT

In [None]:

print(df_logs.isnull().sum())

df_logs = df_logs.dropna()


df_logs["status_code"] = df_logs["status_code"].astype(int)


df_logs.head()


ip               0
timestamp        0
request          0
status_code      0
response_size    0
referrer         0
user_agent       0
dtype: int64


Unnamed: 0,ip,timestamp,request,status_code,response_size,referrer,user_agent
0,145.89.204.118,22/Aug/2024:12:54:41,POST /about HTTP/1.1,400,1065,http://fields.com/,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/533.2...
1,20.24.201.23,22/Aug/2024:14:12:50,DELETE /login HTTP/1.1,404,1214,http://www.burgess-duncan.com/,Mozilla/5.0 (compatible; MSIE 8.0; Windows 98;...
2,24.242.222.78,22/Aug/2024:13:29:33,GET /api/data HTTP/1.1,403,765,http://shaw.com/,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_6_5) A...
3,40.247.248.162,22/Aug/2024:08:05:26,GET /products/item1 HTTP/1.1,401,1965,https://walls.org/,Mozilla/5.0 (compatible; MSIE 6.0; Windows CE;...
4,97.25.123.112,22/Aug/2024:00:05:00,POST /about HTTP/1.1,500,1452,http://thompson.org/,Mozilla/5.0 (compatible; MSIE 6.0; Windows CE;...


In [None]:
df_logs.to_json("cleaned_web_traffic.json", orient="records", lines=True)



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


df_logs["combined"] = (
    df_logs["ip"] + " " +
    df_logs["timestamp"] + " " +
    df_logs["request"] + " " +
    df_logs["status_code"].astype(str) + " " +
    df_logs["response_size"].astype(str) + " " +
    df_logs["referrer"] + " " +
    df_logs["user_agent"]
)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_logs["combined"])


print(X.shape)
print(X)


(20, 257)
  (0, 237)	0.11543857975332131
  (0, 155)	0.23495179002769756
  (0, 91)	0.20652621973676666
  (0, 173)	0.14712548609492937
  (0, 187)	0.11543857975332131
  (0, 207)	0.11543857975332131
  (0, 205)	0.11543857975332131
  (0, 131)	0.4130524394735333
  (0, 167)	0.11543857975332131
  (0, 222)	0.13776412327679582
  (0, 254)	0.09853165271947849
  (0, 214)	0.08917028990134496
  (0, 174)	0.10372707232389981
  (0, 183)	0.23495179002769756
  (0, 11)	0.23495179002769756
  (0, 106)	0.18635795665224672
  (0, 193)	0.14021216485709512
  (0, 165)	0.18635795665224672
  (0, 229)	0.15793238636131576
  (0, 110)	0.23495179002769756
  (0, 135)	0.23495179002769756
  (0, 21)	0.15793238636131576
  (0, 61)	0.07010608242854756
  (0, 168)	0.07010608242854756
  (0, 70)	0.07352656969951214
  :	:
  (19, 43)	0.1897956209935136
  (19, 153)	0.16683325597034865
  (19, 2)	0.16683325597034865
  (19, 127)	0.16683325597034865
  (19, 213)	0.15054119871027938
  (19, 199)	0.3336665119406973
  (19, 177)	0.15054119871027

In [None]:
!pip install faiss-cpu




In [None]:
import faiss


index = faiss.IndexFlatL2(X.shape[1])


index.add(X.toarray())


faiss.write_index(index, "faiss_index.index")


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_query(query, vectorizer):
    return vectorizer.transform([query]).toarray()


def retrieve_top_k(query_vector, index, k=1):
    D, I = index.search(query_vector, k)
    return I[0], D[0]


user_query = "Hangi sayfalar 500  status kodunu almıştır ?"


query_vector = vectorize_query(user_query, vectorizer)


indices, distances = retrieve_top_k(query_vector, index)


matching_logs = df_logs.iloc[indices]
matching_logs


Unnamed: 0,ip,timestamp,request,status_code,response_size,referrer,user_agent,combined
4,97.25.123.112,22/Aug/2024:00:05:00,POST /about HTTP/1.1,500,1452,http://thompson.org/,Mozilla/5.0 (compatible; MSIE 6.0; Windows CE;...,97.25.123.112 22/Aug/2024:00:05:00 POST /abou...


Aşama 2: RAG Modelinin Kurulumu

In [None]:
!pip install transformers
!pip install torch




In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd


model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

df_logs = pd.DataFrame(log_entries)


def generate_response(logs, user_query):

    prompt = f"Kullanıcı sorusu: {user_query}\n"
    prompt += "İlgili log kayıtları:\n"
    for log in logs.to_dict(orient="records"):
        prompt += f"{log['timestamp']} - {log['request']} - {log['status_code']}\n"
    prompt += "\nBu bilgilere dayanarak, kullanıcı sorusuna yanıt verin."


    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)


    outputs = model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)


    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Aşama 3: Sistem Entegrasyonu ve Test

In [None]:

def rag_system(user_query, df_logs):

    response = generate_response(df_logs, user_query)
    return response

user_query = "500 hata kodları hakkında bilgi ver"


response = rag_system(user_query, df_logs)
print("Yanıt:", response)

Yanıt: :14:43:50 - DELETE /home HTTP/1.1 - 503 22/Aug/2024:00:21:44 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:50 - PUT /signup HTTP/1.1 - 502 22/Aug/2024:13:53:50 - PUT /signup HTTP/1.1 - 200 22/Aug/2024:


In [None]:
def command_line_interface():
    print("Web Trafik Loglarına Dayalı Soru-Cevap Sistemi")
    print("Çıkmak için 'exit' yazın.")

    while True:

        user_query = input("\nSorunuzu yazın: ")


        if user_query.lower() in ['exit', 'quit']:
            print("Çıkılıyor...")
            break

        response = rag_system(user_query,df_logs)


        print("\nYanıt:", response)


command_line_interface()


Web Trafik Loglarına Dayalı Soru-Cevap Sistemi
Çıkmak için 'exit' yazın.

Sorunuzu yazın: 404 hatası veren sayfalar nelerdir?

Yanıt: : 404 hatas veren sayfalar nelerdir? Kullanc sorusu: 404 hatas veren sayfalar nelerdir? lgili log kaytlar: 22/Aug/2024:12:50 - DELETE /login HTTP/1.1 - 400 22/Aug/2024:13:29:33 - GET /api/data

Sorunuzu yazın: exit
Çıkılıyor...


In [None]:

test_queries = [
    "503 status code veren sayfalar nelerdir?",
    "Son ziyaret edilen sayfalar hangileri?",
    "Siteye erişim tarihleri nelerdir?",
    "En yüksek yanıt boyutuna sahip istekler nelerdir?",
    "Hangi HTTP yöntemleri en sık kullanılmıştır?",
    "En fazla 403 hata kodu alan istekler nelerdir?",
]

print("Test Senaryoları Başlatılıyor...\n")

for query in test_queries:
    print(f"Soru: {query}")
    response = rag_system(query, df_logs)
    print("Yanıt:", response)
    print("\n" + "-"*50 + "\n")

print("Testler Tamamlandı.")


Test Senaryoları Başlatılıyor...

Soru: 503 status code veren sayfalar nelerdir?
Yanıt: :14:43:50 - DELETE /home HTTP/1.1 - 503 22/Aug/2024:00:21:44 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:50 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:28 - POST /signup HTTP/1.1 - 200 21/Aug/2024:

--------------------------------------------------

Soru: Son ziyaret edilen sayfalar hangileri?
Yanıt: :14:43:50 - DELETE /home HTTP/1.1 - 503 22/Aug/2024:00:21:44 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:50 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:28 - POST /signup HTTP/1.1 - 200 21/Aug/2024:

--------------------------------------------------

Soru: Siteye erişim tarihleri nelerdir?
Yanıt: : Siteye erişim tarihleri nelerdir? Siteye erişim tarihleri nelerdir?

--------------------------------------------------

Soru: En yüksek yanıt boyutuna sahip istekler nelerdir?
Yanıt: Kullanc sorusu: En yüksek yant boyutuna sahip istekler nelerdir? lgili log kaytlar: 22/Aug/2024:14:50 - DELETE /h

Aşama 4: Performans Değerlendirmesi

In [None]:
evaluation_queries = {
    "503 status code veren sayfalar nelerdir?": "/home",
    "Son ziyaret edilen sayfalar hangileri?": "/home",
    "Siteye erişim tarihleri nelerdir?": "21/Aug/2024, 22/Aug/2024",
    "En yüksek yanıt boyutuna sahip istekler nelerdir?": "/home, /signup",
    "Hangi HTTP yöntemleri en sık kullanılmıştır?": "PUT, DELETE",
    "En fazla 403 hata kodu alan istekler nelerdir?": "PUT /signup"
}





correct_answers = 0
total_queries = len(evaluation_queries)

print("Doğruluk Değerlendirmesi Başlıyor...\n")

for query, expected_answer in evaluation_queries.items():
    print(f"Soru: {query}")
    response = rag_system(query, df_logs)
    print("Yanıt:", response)


    if expected_answer.lower() in response.lower():
        print("Doğru")
        correct_answers += 1
    else:
        print("Yanlış")

    print("\n" + "-"*50 + "\n")

accuracy = correct_answers / total_queries
print(f"Doğruluk: {accuracy * 100:.2f}%")


Doğruluk Değerlendirmesi Başlıyor...

Soru: 503 status code veren sayfalar nelerdir?
Yanıt: :14:43:50 - DELETE /home HTTP/1.1 - 503 22/Aug/2024:00:21:44 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:50 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:28 - POST /signup HTTP/1.1 - 200 21/Aug/2024:
Doğru

--------------------------------------------------

Soru: Son ziyaret edilen sayfalar hangileri?
Yanıt: :14:43:50 - DELETE /home HTTP/1.1 - 503 22/Aug/2024:00:21:44 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:50 - PUT /signup HTTP/1.1 - 401 21/Aug/2024:13:53:28 - POST /signup HTTP/1.1 - 200 21/Aug/2024:
Doğru

--------------------------------------------------

Soru: Siteye erişim tarihleri nelerdir?
Yanıt: : Siteye erişim tarihleri nelerdir? Siteye erişim tarihleri nelerdir?
Yanlış

--------------------------------------------------

Soru: En yüksek yanıt boyutuna sahip istekler nelerdir?
Yanıt: Kullanc sorusu: En yüksek yant boyutuna sahip istekler nelerdir? lgili log kaytlar: 22/Aug