In [2]:
import re
import pandas as pd
import io

In [3]:
# Read the entire raw log file into memory as one string.
with open("support_logs_2025-07-01.log", encoding="utf-8") as log_file:
    content = log_file.read()

# Number of characters read, as a quick check that the file was loaded.
len(content)
    
    

32938

In [4]:
# Log entries are separated by "---": trim each piece and discard the ones
# that are empty after trimming.
entries = [chunk for chunk in map(str.strip, content.split("---")) if chunk]
entries[0]

'2025-07-01 00:21:00 [INF0] careplus.support.GenericService - TicketID=TCK0701000 SessionID=sess_TCK0701000\nIP=60.130.155.7 | ResponseTime=1269ms | CPU=27.64% | EventType=generic_event | Error=false\nUserAgent="PostmanRuntime/7.32.2"\nMessage=" event for TCK0701000"\nDebug="ℹ️ Logged for monitoring"\nTraceID=None'

In [5]:
# Regex pattern to extract data.
# Every named group captures one field of a log entry. The expected layout is:
#   <timestamp> [<log_level>] <component> - TicketID=<id> SessionID=<id>
#   IP=<ip> | ResponseTime=<int>ms | CPU=<float>% | EventType=<type> | Error=<word>
#   UserAgent="..."
#   Message="..."
#   Debug="..."
#   TraceID=<rest of line>
# The `\s*` at the end of each section absorbs the newline before the next one.
log_pattern = re.compile(
    # Header: timestamp, bracketed level (letters/digits/underscore, so typos
    # such as "INF0" still match), component, ticket and session ids.
    r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?P<log_level>[A-Za-z0-9_]+)\] '
    r'(?P<component>[^\s]+) - TicketID=(?P<ticket_id>[^\s]+) SessionID=(?P<session_id>[^\s]+)\s*'
    # Pipe-separated metrics; ResponseTime may carry a leading minus sign.
    r'IP=(?P<ip>.*?) \| ResponseTime=(?P<response_time>-?\d+)ms \| CPU=(?P<cpu>[\d.]+)% \| EventType=(?P<event_type>.*?) \| Error=(?P<error>\w+)\s*'
    # Double-quoted free-text fields.
    r'UserAgent="(?P<user_agent>.*?)"\s*'
    r'Message="(?P<message>.*?)"\s*'
    r'Debug="(?P<debug>.*?)"\s*'
    r'TraceID=(?P<trace_id>.*)'
)

# Extract structured data.
# Entries that do not match the pattern are kept in `unparsed_entries` instead
# of being dropped silently, so malformed records can be inspected.
parsed_entries = []
unparsed_entries = []
for entry in entries:
    match = log_pattern.search(entry)
    if match:
        parsed_entries.append(match.groupdict())
    else:
        unparsed_entries.append(entry)

# Only speak up when something was actually lost.
if unparsed_entries:
    print(f"Warning: {len(unparsed_entries)} of {len(entries)} entries did not match log_pattern")

parsed_entries[0]

{'timestamp': '2025-07-01 00:21:00',
 'log_level': 'INF0',
 'component': 'careplus.support.GenericService',
 'ticket_id': 'TCK0701000',
 'session_id': 'sess_TCK0701000',
 'ip': '60.130.155.7',
 'response_time': '1269',
 'cpu': '27.64',
 'event_type': 'generic_event',
 'error': 'false',
 'user_agent': 'PostmanRuntime/7.32.2',
 'message': ' event for TCK0701000',
 'debug': 'ℹ️ Logged for monitoring',
 'trace_id': 'None'}

In [6]:
# Build a table with one row per parsed entry and one column per captured field.
df = pd.DataFrame(data=parsed_entries)
df.head(n=5)

Unnamed: 0,timestamp,log_level,component,ticket_id,session_id,ip,response_time,cpu,event_type,error,user_agent,message,debug,trace_id
0,2025-07-01 00:21:00,INF0,careplus.support.GenericService,TCK0701000,sess_TCK0701000,60.130.155.7,1269,27.64,generic_event,False,PostmanRuntime/7.32.2,event for TCK0701000,ℹ️ Logged for monitoring,
1,2025-07-01 00:41:00,INFO,careplus.support.GenericService,TCK0701000,sess_TCK0701000,58.36.189.27,1505,57.24,generic_event,False,Mobile-Safari/537.36,event for TCK0701000,ℹ️ Logged for monitoring,
2,2025-07-01 01:44:00,DEBUG,careplus.support.GenericService,TCK0701001,sess_TCK0701001,181.18.12.170,586,78.43,generic_event,False,curl/7.68.0,event for TCK0701001,ℹ️ Logged for monitoring,
3,2025-07-01 01:49:00,DEBUG,careplus.support.GenericService,TCK0701001,sess_TCK0701001,163.214.94.42,878,63.61,generic_event,False,curl/7.68.0,event for TCK0701001,ℹ️ Logged for monitoring,
4,2025-07-01 01:50:00,INFO,careplus.support.GenericService,TCK0701000,sess_TCK0701000,155.68.207.12,1614,87.85,generic_event,False,Python-urllib/3.9,event for TCK0701000,ℹ️ Logged for monitoring,


In [7]:
# Remove the trace_id column and preview the first two rows.
df = df.drop(columns=["trace_id"])
df.head(n=2)

Unnamed: 0,timestamp,log_level,component,ticket_id,session_id,ip,response_time,cpu,event_type,error,user_agent,message,debug
0,2025-07-01 00:21:00,INF0,careplus.support.GenericService,TCK0701000,sess_TCK0701000,60.130.155.7,1269,27.64,generic_event,False,PostmanRuntime/7.32.2,event for TCK0701000,ℹ️ Logged for monitoring
1,2025-07-01 00:41:00,INFO,careplus.support.GenericService,TCK0701000,sess_TCK0701000,58.36.189.27,1505,57.24,generic_event,False,Mobile-Safari/537.36,event for TCK0701000,ℹ️ Logged for monitoring


In [8]:
# Summarize each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   timestamp      105 non-null    object
 1   log_level      105 non-null    object
 2   component      105 non-null    object
 3   ticket_id      105 non-null    object
 4   session_id     105 non-null    object
 5   ip             105 non-null    object
 6   response_time  105 non-null    object
 7   cpu            105 non-null    object
 8   event_type     105 non-null    object
 9   error          105 non-null    object
 10  user_agent     105 non-null    object
 11  message        105 non-null    object
 12  debug          105 non-null    object
dtypes: object(13)
memory usage: 10.8+ KB


In [9]:
# Cast the two numeric fields to numeric dtypes, then re-check the column summary.
df = df.astype(dict(response_time="int", cpu="float"))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   timestamp      105 non-null    object 
 1   log_level      105 non-null    object 
 2   component      105 non-null    object 
 3   ticket_id      105 non-null    object 
 4   session_id     105 non-null    object 
 5   ip             105 non-null    object 
 6   response_time  105 non-null    int32  
 7   cpu            105 non-null    float64
 8   event_type     105 non-null    object 
 9   error          105 non-null    object 
 10  user_agent     105 non-null    object 
 11  message        105 non-null    object 
 12  debug          105 non-null    object 
dtypes: float64(1), int32(1), object(11)
memory usage: 10.4+ KB


In [10]:
# Convert the textual error flag into booleans (case-insensitive "true"/"false").
# `astype(str)` keeps this cell re-runnable: once the column is already bool,
# the `.str` accessor alone would raise AttributeError. Any other value maps to NaN.
df['error'] = df['error'].astype(str).str.lower().map({'true': True, 'false': False})
# Parse timestamps using the exact log format; values that fail to parse become
# NaT because of errors='coerce'. The result is stored at millisecond resolution.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S', errors='coerce').astype('datetime64[ms]')

In [11]:
# Summarize each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   timestamp      105 non-null    datetime64[ms]
 1   log_level      105 non-null    object        
 2   component      105 non-null    object        
 3   ticket_id      105 non-null    object        
 4   session_id     105 non-null    object        
 5   ip             105 non-null    object        
 6   response_time  105 non-null    int32         
 7   cpu            105 non-null    float64       
 8   event_type     105 non-null    object        
 9   error          105 non-null    bool          
 10  user_agent     105 non-null    object        
 11  message        105 non-null    object        
 12  debug          105 non-null    object        
dtypes: bool(1), datetime64[ms](1), float64(1), int32(1), object(9)
memory usage: 9.7+ KB


In [12]:
# Summary statistics (count, mean, min, quartiles, max, std) for the frame.
df.describe()

Unnamed: 0,timestamp,response_time,cpu
count,105,105.0,105.0
mean,2025-07-01 08:21:31.428000,885.52381,54.91819
min,2025-07-01 00:21:00,-1566.0,10.14
25%,2025-07-01 05:38:00,586.0,34.42
50%,2025-07-01 09:13:00,984.0,60.14
75%,2025-07-01 11:12:00,1323.0,73.7
max,2025-07-01 14:10:00,1792.0,89.97
std,,640.849858,22.513155


In [13]:
# Keep only rows whose response_time is zero or positive, then recompute the
# summary statistics on the remaining rows.
df = df.loc[df["response_time"].ge(0)]
df.describe()

Unnamed: 0,timestamp,response_time,cpu
count,98,98.0,98.0
mean,2025-07-01 08:24:49.591000,1006.5,54.821633
min,2025-07-01 00:21:00,126.0,13.35
25%,2025-07-01 05:39:15,653.25,34.71
50%,2025-07-01 09:15:00,1089.0,59.455
75%,2025-07-01 11:33:00,1327.75,73.2425
max,2025-07-01 14:10:00,1792.0,89.97
std,,447.808944,22.185337


In [14]:
# Count how many rows carry each distinct log_level label.
df["log_level"].value_counts()

log_level
INFO       37
DEBUG      32
INF0       13
DEBG       10
Name: count, dtype: int64

In [15]:
# Map misspelled log-level labels to their intended spelling; labels not
# listed here are left as they are.
fix_log_level = {
    'INF0': 'INFO',
    'DEBG': 'DEBUG',
    'warnING': 'WARNING',
    'EROR': 'ERROR',
}
df['log_level'] = df.log_level.replace(to_replace=fix_log_level)

# Recount the labels after the replacement.
df['log_level'].value_counts()

log_level
INFO       50
DEBUG      42
Name: count, dtype: int64

In [19]:
# Show rows that repeat an earlier row in every column.
df[df.duplicated()]

Unnamed: 0,timestamp,log_level,component,ticket_id,session_id,ip,response_time,cpu,event_type,error,user_agent,message,debug
10,2025-07-01 03:30:00,DEBUG,careplus.support.GenericService,TCK0701002,sess_TCK0701002,36.64.191.144,1223,81.83,generic_event,False,curl/7.68.0,event for TCK0701002,ℹ️ Logged for monitoring
33,2025-07-01 06:26:00,INFO,careplus.support.GenericService,TCK0701009,sess_TCK0701009,30.228.28.191,1253,32.54,generic_event,False,PostmanRuntime/7.32.2,event for TCK0701009,ℹ️ Logged for monitoring
38,2025-07-01 06:43:00,DEBUG,careplus.support.GenericService,TCK0701008,sess_TCK0701008,214.140.181.78,1372,63.43,generic_event,False,Mobile-Safari/537.36,event for TCK0701008,ℹ️ Logged for monitoring
44,2025-07-01 07:57:00,INFO,careplus.support.GenericService,TCK0701018,sess_TCK0701018,167.18.200.246,1454,85.97,generic_event,False,Mozilla/5.0 (Windows NT 10.0),event for TCK0701018,ℹ️ Logged for monitoring
57,2025-07-01 09:32:00,DEBUG,careplus.support.GenericService,TCK0701013,sess_TCK0701013,114.173.55.131,1089,32.7,generic_event,False,PostmanRuntime/7.32.2,event for TCK0701013,ℹ️ Logged for monitoring
59,2025-07-01 09:36:00,DEBUG,careplus.support.GenericService,TCK0701020,sess_TCK0701020,146.157.172.98,808,78.08,generic_event,False,Mozilla/5.0 (Windows NT 10.0),event for TCK0701020,ℹ️ Logged for monitoring
69,2025-07-01 10:17:00,INFO,careplus.support.GenericService,TCK0701019,sess_TCK0701019,163.229.213.118,1568,24.09,generic_event,False,PostmanRuntime/7.32.2,event for TCK0701019,ℹ️ Logged for monitoring
77,2025-07-01 10:45:00,INFO,careplus.support.GenericService,TCK0701023,sess_TCK0701023,30.211.17.103,1127,22.14,generic_event,False,Mozilla/5.0 (Windows NT 10.0),event for TCK0701023,ℹ️ Logged for monitoring
92,2025-07-01 12:25:00,DEBUG,careplus.support.GenericService,TCK0701028,sess_TCK0701028,138.124.89.86,1250,46.43,generic_event,False,Mobile-Safari/537.36,event for TCK0701028,ℹ️ Logged for monitoring


In [20]:
# Remove repeated rows, keeping the first occurrence of each, then list any
# duplicates that are still present.
df = df.drop_duplicates(keep="first")
df.loc[df.duplicated()]

Unnamed: 0,timestamp,log_level,component,ticket_id,session_id,ip,response_time,cpu,event_type,error,user_agent,message,debug


In [21]:
# (rows, columns) of the frame.
df.shape

(89, 13)