# Анализ логов

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType, DateType, TimestampType

from datetime import datetime
from pyspark.sql.functions import *

!pip install pandas
import pandas as pd

!pip install pyyaml ua-parser user-agents
from user_agents import parse
# библиотека для парсинга user-agent


# импорт модулей и библиотек



In [91]:
spark = SparkSession \
    .builder \
    .appName("Log analysis") \
    .config("spark.jars", "C:/Users/raspa/Desktop/spark-3.3.1-bin-hadoop3/jars/postgresql-42.5.1.jar") \
    .config("spark.driver.bindAddress","localhost") \
    .config("spark.ui.port","4040") \
    .getOrCreate()

# создание spark сессии

In [4]:
client_hostname = spark.read.csv('C:/Users/raspa/Desktop/data_ignore/client_hostname.csv', header=True)
# csv файл с данными клиентов

In [None]:
client_hostname.show(15)

In [6]:
def edit_alias_list(str):
    if str == "[Errno 1] Unknown host":
        return "unknown"
    else:
        return str[2:len(str)-2]
    
    
def edit_address_list(str):
    if str == "null":
        return "unknown"
    elif str is None:
        return "unknown"
    else:
        return str[2:len(str)-2]
    
# функция для замены пропущенных значений и редактировании строк(очищает от скобок и кавычек в строке)

In [7]:
func_alias_list = udf(edit_alias_list, StringType())
func_address_list = udf(edit_address_list, StringType())

In [8]:
client_hostname = client_hostname.withColumn("alias_list_new", func_alias_list("alias_list"))
client_hostname = client_hostname.drop("alias_list")
client_hostname = client_hostname.withColumn("address_list_new", func_address_list("address_list"))
client_hostname = client_hostname.drop("address_list")

In [9]:
client_hostname = client_hostname.withColumnRenamed("alias_list_new", "alias_list")\
       .withColumnRenamed("address_list_new", "address_list")

In [10]:
df = pd.read_table('C:/Users/raspa/Desktop/data_ignore/access.log', engine='python', header=None, 
                   names=["ip", "datetime", "request", "code_req", "port", "user_agent"], 
                   sep=" - - \[{1}(.+)\] \"(.*?)\" (\d+) (\d+) .+ \"(.{2,}?)\".*",  index_col=False)

# загрузка главного файла с данными
# первичная обработка с помощью RexEx

In [26]:
df.code_req = df.code_req.fillna(0)
df.port = df.port.fillna(0)
df = df.astype({"code_req": int, "port": int, "user_agent": str})


# замена пропущенных значений в столбцах на 0
# для конвертации в тип *integer*

In [14]:
format_date = '%d/%b/%Y:%H:%M:%S +0330'

In [15]:
df.datetime = df.datetime.apply(lambda x: datetime.strptime(x, format_date) if x is not(None) else 0)

# перевод даты из строки в формат datetime

In [None]:
df["browser"] = df.user_agent.apply(lambda x: parse(x).browser.family)

# вытащил из user-agent название браузера

In [None]:
df["device"] = df.user_agent.apply(lambda x: parse(x).device.brand)

# здесь вытащил бренд смартфона

In [96]:
df.head(5)

Unnamed: 0,ip,datetime,request,code_req,port,user_agent
0,54.36.149.41,2019-01-22 03:56:14,GET /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%D...,200,30577,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
1,31.56.96.51,2019-01-22 03:56:16,GET /image/60844/productModel/200x200 HTTP/1.1,200,5667,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
2,31.56.96.51,2019-01-22 03:56:16,GET /image/61474/productModel/200x200 HTTP/1.1,200,5379,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
3,40.77.167.129,2019-01-22 03:56:17,GET /image/14925/productModel/100x100 HTTP/1.1,200,1696,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
4,91.99.72.15,2019-01-22 03:56:17,GET /product/31893/62100/%D8%B3%D8%B4%D9%88%D8...,200,41483,Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16...


In [72]:
df2 = df[0:100000]
df2["browser"] = df2.user_agent.apply(lambda x: parse(x).browser.family)
df2["device"] = df2.user_agent.apply(lambda x: parse(x).device.brand if parse(x).device.brand is not(None) else parse(x).os.family)

# взял 100 тысяч строк из 10 миллионов, так как вылезает ошибка при создании датафрейма *java.lang.OutOfMemoryError: Java heap space*
# и добавление новых столбцов занимает очень долгое время

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["browser"] = df2.user_agent.apply(lambda x: parse(x).browser.family)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["device"] = df2.user_agent.apply(lambda x: parse(x).device.brand if parse(x).device.brand is not(None) else parse(x).os.family)


In [77]:
schema = StructType(fields=[
    StructField("ip", StringType()),
    StructField("datetime", TimestampType()),
    StructField("request", StringType()),
    StructField("code_req", IntegerType()),
    StructField("port", IntegerType()),
    StructField("user_agent", StringType()),
    StructField("browser", StringType()),
    StructField("device", StringType())
])

# схема для создания DataFrame pyspark из Pandas DataFrame

In [78]:
logs = spark.createDataFrame(df2, schema=schema)


In [79]:
logs.show(5)

+-------------+-------------------+--------------------+--------+-----+--------------------+-------------+-------+
|           ip|           datetime|             request|code_req| port|          user_agent|      browser| device|
+-------------+-------------------+--------------------+--------+-----+--------------------+-------------+-------+
| 54.36.149.41|2019-01-22 03:56:14|GET /filter/27|13...|     200|30577|Mozilla/5.0 (comp...|    AhrefsBot| Spider|
|  31.56.96.51|2019-01-22 03:56:16|GET /image/60844/...|     200| 5667|Mozilla/5.0 (Linu...|Chrome Mobile| Huawei|
|  31.56.96.51|2019-01-22 03:56:16|GET /image/61474/...|     200| 5379|Mozilla/5.0 (Linu...|Chrome Mobile| Huawei|
|40.77.167.129|2019-01-22 03:56:17|GET /image/14925/...|     200| 1696|Mozilla/5.0 (comp...|      bingbot| Spider|
|  91.99.72.15|2019-01-22 03:56:17|GET /product/3189...|     200|41483|Mozilla/5.0 (Wind...|      Firefox|Windows|
+-------------+-------------------+--------------------+--------+-----+---------

In [None]:
logs.select("ip", "datetime", "request", "code_req", "port", "user_agent", "browser", "device").write.format("jdbc")\
    .option("url", "jdbc:postgresql://localhost:5432/logs") \
    .mode("Overwrite") \
    .option("driver", "org.postgresql.Driver").option("dbtable", "log") \
    .option("user", "postgres").option("password", "0000").save()

# сохранение датафрейма logs в БД postgres

In [None]:
client_hostname.select("client", "hostname", "alias_list", "address_list").write.format("jdbc")\
    .option("url", "jdbc:postgresql://localhost:5432/logs") \
    .mode("Overwrite") \
    .option("driver", "org.postgresql.Driver").option("dbtable", "hostname") \
    .option("user", "postgres").option("password", "0000").save()

# сохранение датафрейма client_hostname в БД postgres