## Initial setup

In [1]:
# imports and constants
import pandas as pd
import pickle
from concurrent.futures import ThreadPoolExecutor
from csv import DictReader
from os import system
from time import time

pd.set_option('max_colwidth', 150)
source_csv_file_path = "../output/logger_calls5.csv"
source_csv_columns = ("repo", "path", "line", "logger", "verbosity", "level", "full_content")
verbosity_levels = ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", "OTHER")

In [2]:
# contents fresh start
system("rm -v contents/*.txt")

0

## Loading data into pandas.Dataframe

In [3]:
# using Pandas Dataframes
df = pd.read_csv(source_csv_file_path)

In [4]:
# pickling logger contents for later use
start = time()

# removing statements which do not have a string argument
df_str = df[df.full_content.str.contains(r".*[\'\"]{1,3}.+[\'\"]{1,3}.*")]

# # filtering per verbosity level
# debug_df = df_str[df_str.verbosity == "DEBUG"]
# info_df = df_str[df_str.verbosity == "INFO"]
# warning_df = df_str[df_str.verbosity == "WARNING"]
# error_df = df_str[df_str.verbosity == "ERROR"]
# critical_df = df_str[df_str.verbosity == "CRITICAL"]
# # other_df = df_str[df_str.verbosity == "OTHER"]  # discarded

print(f"Time spent: {time() - start:.3f} seconds")

Time spent: 0.963 seconds


In [5]:
# pickling logger contents for later use
start = time()

# filtering per verbosity level
for level in verbosity_levels:
    level_df = df_str[df_str.verbosity == level]
    with open(f"contents/{level}.txt", "wb") as f:
        pickle.dump(" ".join(level_df.full_content), f)

print(f"Time spent: {time() - start:.3f} seconds")

Time spent: 0.302 seconds


In [6]:
# loading pickled files
data = {}
for v in verbosity_levels:
    if v == "OTHER":
        continue
    with open(f"contents/{v}.txt", "rb") as f:
        data[v] = pickle.load(f)

## Cleaning data

In [7]:
# first round of text cleaning techniques
from re import escape, sub
from string import punctuation


def clean_text_round1(text: str) -> str:
    text = text.lower()
#     text = sub(r'.*[\'\"]{1,3}(.+)[\'\"]{1,3}.*', r'\1', text)
    text = sub(r'\{[\w\(\)\[\]\'\"]*\}', '', text)
    text = sub(r"%s", "", text)
    text = sub(r'[' + escape(punctuation) + escape("\"'‘’“”…") + r']', '', text)
    text = sub(r'\w*\d\w*', '', text)
    return text


round1 = lambda x: clean_text_round1(x)

In [9]:
start = time()

data_combined = {key: [value] for (key, value) in data.items()}
data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['content']
data_df = data_df.sort_index()
data_df

print(f"Time spent: {time() - start:.3f} seconds")

Time spent: 0.007 seconds


In [10]:
# Double check to make sure data has been loaded properly
data.keys()



In [11]:
# Let's take a look at the updated text
start = time()

data_clean = pd.DataFrame(data_df.content.apply(round1))
data_clean

print(f"Time spent: {time() - start:.3f} seconds")

Time spent: 3.186 seconds


In [12]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

In [14]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.content)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aa,aaaa,aaaaa,aac,aaccount,aaccountid,aaccountusername,aad,aadclientappid,aadclientid,...,非交易日,非交易日formatdaystrftimeymd,非插件,非法url,非盘中,非高价值需求,页码,预计至少有d条路线,验证用户密码时报错,默认采用等权重方式
CRITICAL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DEBUG,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
ERROR,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,2,0,1,0
INFO,4,1,0,0,1,0,2,9,2,0,...,0,0,0,1,1,0,0,2,0,0
WARNING,2,0,3,1,0,1,0,0,0,1,...,18,1,0,0,0,1,0,0,0,1


In [15]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [16]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))