# Log Anomaly Detection Using LogAI for a Video Editing App with Online AI Capabilities


## Load Data

In [1]:
import os

from logai.dataloader.openset_data_loader import OpenSetDataLoader, OpenSetDataLoaderConfig

# File configuration: name of the dataset and location of the video editing
# app log. Update the path if the log file lives somewhere else.
dataset_name = "video_editing"
filepath = os.path.join("..", "datasets", "video_editing_app_log_anomalous.log")

# Build the loader configuration first, then hand it to the loader.
loader_config = OpenSetDataLoaderConfig(dataset_name=dataset_name, filepath=filepath)
data_loader = OpenSetDataLoader(loader_config)

logrecord = data_loader.load_data()

# Preview up to the first 500 loaded records as a DataFrame.
logrecord.to_dataframe().head(500)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected[constants.LOG_TIMESTAMPS] = pd.to_datetime(


Unnamed: 0,logline,timestamp,User,Service
0,Generic Details,2023-12-20 23:33:25.437531,user101,Scene Detection
1,Generic Details,2023-12-20 23:33:30.437531,user101,Video Enhancement
2,Generic Details,2023-12-20 23:33:32.437531,user102,Scene Detection
3,Generic Details,2023-12-20 23:33:33.437531,user105,Scene Detection
4,Generic Details,2023-12-20 23:33:37.437531,user103,Color Correction
...,...,...,...,...
495,Generic Details,2023-12-20 23:57:35.437531,user103,Subtitle Generation
496,Generic Details,2023-12-20 23:57:38.437531,user102,Subtitle Generation
497,Generic Details,2023-12-20 23:57:42.437531,user101,Subtitle Generation
498,Generic Details,2023-12-20 23:57:44.437531,user101,Subtitle Generation


## Preprocess

In the preprocessing step, the user can find and replace arbitrary regex patterns to clean the raw
loglines. This can be very useful for improving information extraction from the unstructured part of
the logs, as well as for generating more structured attributes using domain knowledge.

Unnecessary for our specific use case.

In [2]:
from logai.preprocess.preprocessor import PreprocessorConfig, Preprocessor
from logai.utils import constants

# Pull the raw log lines and the structured attributes out of the loaded record.
attributes = logrecord.attributes
loglines = logrecord.body[constants.LOGLINE_NAME]

# No custom regex replacements are configured, so the replace list is empty.
preprocessor_config = PreprocessorConfig(custom_replace_list=[])
preprocessor = Preprocessor(preprocessor_config)

clean_logs, custom_patterns = preprocessor.clean_log(loglines)



## Parsing

After preprocessing, we call auto-parsing algorithms to automatically parse the cleaned logs.


In [3]:
from logai.information_extraction.log_parser import LogParser, LogParserConfig
from logai.algorithms.parsing_algo.drain import DrainParams

# Configure the Drain parsing algorithm and wrap it in a parser config.
parsing_algo_params = DrainParams(sim_th=0.5, depth=5)
log_parser_config = LogParserConfig(
    parsing_algo_params=parsing_algo_params,
    parsing_algorithm="drain",
)

# Parse the cleaned log lines and keep the parsed-logline column for later cells.
parser = LogParser(log_parser_config)
parsed_result = parser.parse(clean_logs)
parsed_loglines = parsed_result["parsed_logline"]

## Time-series Anomaly Detection

Here we show an example to conduct time-series anomaly detection with parsed logs.

### Feature Extraction

After parsing the logs and obtaining the log templates, we can extract time-series features by
converting the parsed loglines into counter vectors.

In [4]:
from logai.information_extraction.feature_extractor import FeatureExtractorConfig, FeatureExtractor

# Group by 15-minute time windows and by parsed logline, Service and User.
config = FeatureExtractorConfig(
    group_by_category=["parsed_logline", "Service", "User"],
    group_by_time="15 min",
)
feature_extractor = FeatureExtractor(config)

# Inputs for the counter vector: parsed log patterns and their timestamps.
parsed_loglines = parsed_result["parsed_logline"]
timestamps = logrecord.timestamp["timestamp"]

counter_vector = feature_extractor.convert_to_counter_vector(
    timestamps=timestamps,
    attributes=attributes,
    log_pattern=parsed_loglines,
)

# Report how many rows the counter vector contains.
print(len(counter_vector))



416574


### Anomaly Detection

With the generated `counter_vector`, you can use `AnomalyDetector` to detect time-series anomalies.
Here we use `DynamicBaseline`, an algorithm from the Merlion library (selected in LogAI as `dbl`).

In [5]:
from logai.algorithms.anomaly_detection_algo.dbl import DBLDetectorParams
from logai.analysis.anomaly_detector import AnomalyDetector, AnomalyDetectionConfig
from sklearn.model_selection import train_test_split
import pandas as pd

# Columns that are not grouping keys. "attribute" is listed as well so that
# re-running this cell does not fold a previously computed key back into the
# newly built one (the cell stays idempotent).
non_key_columns = (
    constants.LOG_COUNTS,
    constants.LOG_TIMESTAMPS,
    constants.EVENT_INDEX,
    "attribute",
)
key_columns = [col for col in counter_vector.columns if col not in non_key_columns]

# One "-"-joined key per row, built from every remaining column.
counter_vector["attribute"] = counter_vector[key_columns].apply(
    lambda x: "-".join(x.astype(str)), axis=1
)

attr_list = counter_vector["attribute"].unique()

# Default detector parameters.
params = DBLDetectorParams()

anomaly_detection_config = AnomalyDetectionConfig(
    algo_name='dbl',
    algo_params=params
)

# Fit and score one detector per attribute key. Score frames are collected in
# a list and concatenated once, instead of growing a DataFrame inside the loop
# through the private `DataFrame._append`.
# `groupby(..., sort=False)` yields the groups in order of first appearance,
# i.e. the same order as `attr_list`, without rescanning the frame per key.
score_frames = []
for attr, temp_df in counter_vector.groupby("attribute", sort=False):
    # Skip series that are too short to be used.
    if temp_df.shape[0] < constants.MIN_TS_LENGTH:
        continue
    # Chronological split: first half for fitting, second half for scoring.
    train, test = train_test_split(
        temp_df[[constants.LOG_TIMESTAMPS, constants.LOG_COUNTS]],
        shuffle=False,
        train_size=0.5,
    )
    anomaly_detector = AnomalyDetector(anomaly_detection_config)
    anomaly_detector.fit(train)
    anom_score = anomaly_detector.predict(test)
    score_frames.append(anom_score)

# Keep an (empty) `anom_score` column when no group was long enough, so the
# threshold filter below yields an empty frame instead of raising KeyError.
if score_frames:
    res = pd.concat(score_frames)
else:
    res = pd.DataFrame({"anom_score": pd.Series(dtype="float64")})

# Keep only the rows whose anomaly score reaches the threshold.
anomaly_threshold = 5.0
filtered_res = res[res['anom_score'] >= anomaly_threshold]

In [6]:
# Get anomalous datapoints.
# `filtered_res` is already restricted to rows at or above the threshold, so
# its index is used directly. The former `filtered_res[filtered_res > 0]` mask
# was a DataFrame-wide mask that never dropped rows and only added a
# comparison that can fail on non-numeric columns.
# The index values are row labels, so label-based `.loc` is used rather than
# positional `.iloc`.
# NOTE(review): assumes the index of `filtered_res` carries the row labels of
# `counter_vector` - confirm against `AnomalyDetector.predict`.
anomalies = counter_vector.loc[filtered_res.index]
print(len(anomalies))
anomalies.head(50000)

1


Unnamed: 0,parsed_logline,Service,User,timestamp,event_index,counts,attribute
14163,Generic Details,Audio Sync,user101,2024-05-16 12:15:00,"[4249905, 4249909, 4249953, 4249965, 4249977, ...",5010,Generic Details-Audio Sync-user101
