## import libraries

In [6]:
# Libraries and setup

# Auto reload changes
%load_ext autoreload
%autoreload 2

from src.utils.dependencies import *
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
plt.style.use('fivethirtyeight')

# log file schema
from src.log_file_schema import schema

# data handler
from src.data_handler import DataHandler
dh = DataHandler()
n=5

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create or Get Spark session

In [7]:
#create or get spark session
spark = SparkSession.builder \
    .master("local[*]")\
    .appName("PayPayChallenge")\
    .getOrCreate()

In [8]:

def duration(start, end):
    try:
        num_of_seconds = (end - start).total_seconds()
    except:
        num_of_seconds = 0
    return num_of_seconds;

get_duration = udf(duration, FloatType())

def preprocess_data(spark):

    df = spark.read.csv(log_file, schema=schema, sep=" ").repartition(num_partitions).cache()
    split_client = split(df["client:port"], ":")
    split_backend = split(df["backend:port"], ":")
    split_request = split(df["request"], " ")

    df=df.withColumn("ip", split_client.getItem(0)) \
                .withColumn("client_port", split_client.getItem(1)) \
                .withColumn("backend_ip", split_backend.getItem(0)) \
                .withColumn("backend_port", split_backend.getItem(1)) \
                .withColumn("request_action", split_request.getItem(0)) \
                .withColumn("request_url", split_request.getItem(1)) \
                .withColumn("request_protocol", split_request.getItem(2)) \
                .withColumn("current_timestamp", col("timestamp").cast("timestamp")) \
                .drop("client:port","backend:port","request").cache()

    df=df.select(["ip", "request_url"]);
    
    
    df=df.na.drop(subset=["request_url"])
    df=df.na.drop(subset=["ip"])
    
    df = df.groupby("ip").agg(countDistinct("request_url").alias("count_unique_URLs"));
    df=df.na.drop(subset=["count_unique_URLs"])
    
    splitt2=split(df["ip"], "\\.");
    df=df.withColumn("octet0", splitt2.getItem(0));
    df=df.withColumn("octet1", splitt2.getItem(1));
    df=df.withColumn("octet2", splitt2.getItem(2));
    df=df.withColumn("octet3", splitt2.getItem(3));
    df=df.drop("ip");
    df=df.na.drop(subset=["octet0"])
    df=df.na.drop(subset=["octet1"])
    df=df.na.drop(subset=["octet2"])
    df=df.na.drop(subset=["octet3"])
    #print(df.dtypes);
    
    return df


def data_loader(spark):
    dataset3 = preprocess_data(spark).cache()
    dataset3.show();
    return dataset3.select("*").toPandas();

df=data_loader(spark)

+-----------------+------+------+------+------+
|count_unique_URLs|octet0|octet1|octet2|octet3|
+-----------------+------+------+------+------+
|               84|   113|   193|   114|    25|
|               85|   115|   112|   250|   108|
|                2|   117|   203|   181|   144|
|                7|   120|    61|    47|    36|
|               88|   124|   125|    22|   218|
|              108|    14|   139|    82|   134|
|                9|   117|   247|   188|    13|
|              112|    27|    34|   244|   251|
|                3|   117|   207|    97|   173|
|               16|    61|    16|   142|   162|
|               34|   117|   241|   152|    20|
|               10|   123|   136|   182|   137|
|              110|   202|    53|    89|   132|
|               16|   202|   174|    92|    10|
|               14|    59|   160|   110|   163|
|               16|   117|   205|    39|   248|
|                6|    27|    63|   186|    72|
|               94|   103|    16|    71|

In [9]:
df=df[['octet0', 'octet1', 'octet2', 'octet3', 'count_unique_URLs']]

print(df.head());
print(df.shape)

  octet0 octet1 octet2 octet3  count_unique_URLs
0    113    193    114     25                 84
1    115    112    250    108                 85
2    117    203    181    144                  2
3    120     61     47     36                  7
4    124    125     22    218                 88
(90544, 5)


In [10]:
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

df=df.apply(pd.to_numeric) ;

X = df[['octet0', 'octet1', 'octet2', 'octet3']]
Y =df[['count_unique_URLs']]
model = xgboost.XGBRegressor(objective='reg:squarederror')
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
#print(results);
print("RMSE:", np.mean(np.sqrt(np.abs(results))))



RMSE: 94.75591607234915


In [11]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor()
model_rf.fit(X, Y)
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model_rf, X, Y, cv=kfold, scoring='neg_mean_squared_error')
#print(results);
print("RMSE Randomforest:", np.mean(np.sqrt(np.abs(results))))





  model_rf.fit(X, Y)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


RMSE Randomforest: 81.32694452233947
