In [1]:
import ast
import re
import io
import json
import gzip
import numpy as np
import pandas as pd
from google.cloud import storage

In [2]:
# Notebook-wide pandas settings.
# Effectively disable column-width truncation so long log messages render in full.
pd.set_option("display.max_colwidth", 999999)
# Opt in to copy-on-write semantics for derived frames.
pd.set_option("mode.copy_on_write", True)

In [3]:
bucket_name = "chronos-cron-prod"
source = "chronos-2024-02-18.error.log.gz"  # object (blob) name inside the bucket, not a full gs:// URI

# Authenticate explicitly with a service-account private key file.
# NOTE(review): the key path is a hard-coded absolute path; consider supplying it
# through configuration or the environment rather than the notebook source.
storage_client = storage.Client.from_service_account_json('/content/perqara-data-532572ce4996.json')

# Resolve the bucket and create a handle to the log object
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(source)

# Download the object and wrap its raw bytes in an in-memory binary buffer.
# download_as_bytes() replaces the deprecated download_as_string() alias; both
# return bytes, so despite its name `str_data` holds binary (still gzipped) data.
str_data = io.BytesIO(blob.download_as_bytes())

In [4]:
# Decompress the downloaded gzip payload and expose it as an in-memory text buffer
with gzip.GzipFile(fileobj=str_data, mode='rb') as gz:
  # Pull the whole decompressed payload into memory as bytes
  file = gz.read()
# Turn the bytes into text, treating the payload as UTF-8
blob_decompress = file.decode('utf-8')
# File-like text object handed to the JSON reader
s = io.StringIO(blob_decompress)

In [5]:
# Parse the newline-delimited JSON log (one record per line) into a DataFrame.
# - nrows is left at its default, which already reads every line; the former
#   999999999 value was only a "read everything" sentinel.
# - 'timestamp' is parsed via convert_dates. pd.Timestamp is not a valid dtype, so
#   the former dtype entry was ignored and the column was only parsed as dates
#   because its name matches pandas' default date-column heuristic.
# - 'message' is cast to str: a nested JSON object is therefore stored as its
#   Python repr (single-quoted dict text), which the ast.literal_eval step further
#   down parses back into a dict.
df = pd.read_json(
    s,
    lines=True,
    dtype={'level': str, 'message': str},
    convert_dates=['timestamp'],
)

In [6]:
# Display the parsed log records; the columns are level, message and timestamp
df

Unnamed: 0,level,message,timestamp
0,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 00:04:22
1,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 09:54:20
2,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 10:16:55
3,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 10:42:24
4,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 11:27:12
5,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 11:30:42
6,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 11:36:00
7,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 11:54:14
8,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 11:59:43
9,error,"{'data': {}, 'event': 'LAWYER:JOIN_ROOM-FAILED'}",2024-02-18 12:56:34


In [7]:
def _parse_message(raw):
    """Turn one 'message' cell into a dict suitable for pd.json_normalize.

    The 'message' column holds the text form of a Python dict literal, e.g.
    "{'data': {}, 'event': '...'}". Values that are already dicts are returned
    unchanged. Anything that is not a dict literal (plain-text message, 'nan',
    a list, a number) is kept under the 'raw_message' key, so one odd record
    does not abort the whole cell and row order stays aligned with df.
    """
    if isinstance(raw, dict):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal at all (e.g. free text or 'nan')
        return {'raw_message': raw}
    # A literal that is not a mapping (list, number, ...) cannot be flattened
    return parsed if isinstance(parsed, dict) else {'raw_message': raw}


# Flatten the per-record message payloads into columns, expanding one level of nesting
normalized_df = pd.json_normalize(df['message'].apply(_parse_message), max_level=1)

In [8]:
# Display the flattened message payloads
normalized_df

Unnamed: 0,event
0,LAWYER:JOIN_ROOM-FAILED
1,LAWYER:JOIN_ROOM-FAILED
2,LAWYER:JOIN_ROOM-FAILED
3,LAWYER:JOIN_ROOM-FAILED
4,LAWYER:JOIN_ROOM-FAILED
5,LAWYER:JOIN_ROOM-FAILED
6,LAWYER:JOIN_ROOM-FAILED
7,LAWYER:JOIN_ROOM-FAILED
8,LAWYER:JOIN_ROOM-FAILED
9,LAWYER:JOIN_ROOM-FAILED


In [9]:
# List the columns produced by flattening the message payloads
normalized_df.columns

Index(['event'], dtype='object')