In [None]:
import asyncio
import json
import os

import numpy as np
import pandas as pd
import websockets

In [None]:
# Directory where the collected messages parquet file is written and read back.
# NOTE(review): absolute path; adjust for the environment the notebook runs in.
project_directory = '/data'

In [None]:
# Websocket endpoint that `take_messages` passes to `listen`.
URL = "ws://143.110.238.245:8000/stream"

In [None]:
async def listen(url):
  """Async generator yielding every message received from the websocket at `url`.

  Each raw frame is decoded with `json.loads` before being yielded. The loop
  has no exit condition of its own: it ends only when receiving or decoding
  raises, or when the consumer closes the generator.
  """
  async with websockets.connect(url) as connection:
    while True:
      raw_payload = await connection.recv()
      decoded = json.loads(raw_payload)
      yield decoded

In [None]:
async def take_messages(n=10):
  """Collect `n` decoded messages from the stream at `URL`.

  A fresh `listen` generator is opened for every call and closed explicitly
  before returning.

  Args:
    n: number of messages to collect. Values <= 0 return an empty list
      without opening a connection.

  Returns:
    A list with the collected messages in arrival order.
  """
  if n <= 0:
    return []
  generator = listen(url=URL)
  messages = []
  try:
    async for message in generator:
      messages.append(message)
      if len(messages) >= n:
        break
  finally:
    # Breaking out of `async for` does not close an async generator; without
    # this the generator (and whatever it holds open) lingers until it is
    # garbage-collected and finalized by the event loop.
    await generator.aclose()
  return messages

In [None]:
messages = []
while len(messages) < 10_000:
  try:
    print(len(messages), end=", ")
    messages.extend(await take_messages(n=1000))
  except Exception as ex:
    print(ex)

len(messages)

0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 

10000

In [None]:
# Build a DataFrame from the collected messages.
# Note: `messages` is rebound here from the collected list to the DataFrame.
messages = pd.DataFrame(messages)
messages.shape

(10000, 4)

In [None]:
# Persist the collected messages so later cells can reload them from disk.
messages_path = f"{project_directory}/messages_v2.parquet"
messages.to_parquet(messages_path, compression='gzip')

In [None]:
# Reload the messages from the parquet file written above; the rest of the
# notebook works on this frame.
messages = pd.read_parquet(messages_path)
messages.shape

(10000, 4)

In [None]:
messages.sample(5)

Unnamed: 0,user,message,ts,seqid
8088,ubotu,"Para Espaol por favor usen #ubuntu-es, #kubunt...",1704173000.0,448831
3213,dude_,jucato: 4getit,1704172000.0,443951
6105,vge,ic what i can come up to,1704173000.0,446846
1672,dfaure,PA1: the manual solution is to edit /etc/kde4/...,1704172000.0,442407
4403,duane,ok i think it's installing now,1704172000.0,445142


## Quick EDA

In [None]:
# Interpret `ts` as seconds since the Unix epoch and store it as a datetime column.
messages["datetime"] = pd.to_datetime(messages["ts"], unit="s")

In [None]:
messages["datetime"].sample(5)

Unnamed: 0,datetime
9268,2024-01-02 05:22:27.499151230
7687,2024-01-02 05:19:49.099302292
4782,2024-01-02 05:14:58.299579620
6644,2024-01-02 05:18:04.699401855
9010,2024-01-02 05:22:01.699175835


In [None]:
# Difference in `ts` between each message and the previous one.
messages["delay"] = messages["ts"].diff()
messages["delay"].describe() # --> "ts" is uninformative: in this run the gap is a near-constant 0.1 (max 0.2)

Unnamed: 0,delay
count,9999.0
mean,0.10011
std,0.003315
min,0.1
25%,0.1
50%,0.1
75%,0.1
max,0.2


In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
# Peek at a run of 10 consecutive messages starting at a random position.
start_index = np.random.randint(0, len(messages))
messages.iloc[start_index : start_index + 10][["user", "message"]]

Unnamed: 0,user,message
8699,cpk1,what kernel does edgy use?
8700,h3sp4wn,gameplayer: search for vim.basic - with apt-file or whatever (which is what you want)
8701,_robert,and than it starts from alone
8702,dom,"cpk1, 2.6.17-10-generic"
8703,h3sp4wn,gameplayer: then sudo update-alternatives --config vim
8704,cpk1,blarg
8705,VitoGirl,i dont have internet connection on that pc
8706,h3sp4wn,gameplayer: if you want it to act like vim then run vim - or vi run vi)
8707,cpk1,first amarok and now I need a newer kernel
8708,_robert,VitoGirl: no problem ssh should be also on the cd


In [None]:
# Message length in characters, with summary statistics.
messages["message_n_chars"] = messages["message"].str.len()
messages["message_n_chars"].describe()

Unnamed: 0,message_n_chars
count,10000.0
mean,45.4452
std,44.099194
min,1.0
25%,16.0
50%,34.0
75%,60.0
max,439.0


In [None]:
# Show one randomly chosen message longer than 200 characters.
messages.loc[messages["message_n_chars"] > 200, "message"].sample()

Unnamed: 0,message
2818,"grr this is annoying whenever i extract, for example: test.tar.gz and i choose extract to /test it will put the files in: /test/test/ but i choose extract here it just puts them all right there without the extra /test"


## Prepare the data for labeling

In [None]:
def df2text_v2(df, start_index, n_rows):
  """Render a window of chat rows as numbered ``ID - [user] - message`` lines.

  Args:
    df: DataFrame with at least the columns "user" and "message".
    start_index: positional index of the first row of the window. It follows
      Python slice semantics, so a negative value counts from the end of the
      frame; callers that want a window at the start must clamp it to 0.
    n_rows: maximum number of rows in the window; the window is shorter when
      it runs past the end of the frame.

  Returns:
    One line per row joined with newlines. IDs are window-local and start at
    1. An empty window yields an empty string.
  """
  window = df.iloc[start_index : start_index + n_rows]
  # Number the rows with enumerate rather than a counter mutated inside
  # DataFrame.apply: the numbering then cannot depend on how often pandas
  # invokes the callback, and an empty window reliably renders as "".
  return "\n".join(
      f"{idx} - [{user}] - {message}"
      for idx, (user, message) in enumerate(zip(window["user"], window["message"]), start=1)
  )

In [None]:
start_index = np.random.randint(0, len(messages))
print(df2text_v2(messages, start_index, n_rows=5))

1 - [me2win] - Chris06: you dont HAVE to format all your drives. Linux can read NTFS, and has write support for it though some programs
2 - [hugelmopf] - ok... on the commandline issue "sudo apt-get dist-upgrade" and check, whether it upgrades your kde.
3 - [me2win] - through the use of some programs that is
4 - [basko] - ok kool thanx man
5 - [Chris06] - ok.. but i really hate windows right now so dont want to use ntfs anymore.. i'm making a backup of all my files on an external drive in fat32 right now


## Data labeling

In [None]:
from openai import OpenAI

# `gpt` reads OPENAI_API_KEY, so it must be defined for the notebook to run on
# a fresh kernel. Take it from the environment instead of pasting the secret
# into a cell, so it never ends up saved in the notebook.
# NOTE(review): when this is None the OpenAI client is expected to fall back
# to the same environment variable and fail with its own error - confirm.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
# JSON schema of the answer requested from the model: an object with a single
# required key, "event_related_ids", holding a list of integers.
_event_ids_schema = {
  "type": "object",
  "properties": {
    "event_related_ids": {"type": "array", "items": {"type": "integer"}},
  },
  "required": ["event_related_ids"],
}

# Passed as `response_format` to the chat completion call in `gpt`.
response_format = {
  "type": "json_schema",
  "json_schema": {
    "name": "event_related_ids_schema",
    "schema": _event_ids_schema,
  },
}

In [None]:
def gpt(query: str, temperature: float = 0.0):
  """Send `query` as a single user message to gpt-4o and return the reply message.

  Args:
    query: full prompt text, sent as the content of one "user" message.
    temperature: sampling temperature forwarded to the API.

  Returns:
    The `message` object of the first choice in the completion. The request
    uses the module-level `response_format`.
  """
  chat_messages = [{"role": "user", "content": query}]
  client = OpenAI(api_key=OPENAI_API_KEY)
  completion = client.chat.completions.create(
      model="gpt-4o",
      messages=chat_messages,
      temperature=temperature,
      response_format=response_format,
  )
  first_choice = completion.choices[0]
  return first_choice.message

In [None]:
def get_data_labeling_prompt_v3(messages_text):
  """Build the labeling prompt that asks the model for event-related message IDs.

  Args:
    messages_text: the chat window to label, one message per line in
      ``ID - [user] - message`` format. It is embedded verbatim in a fenced
      block.

  Returns:
    The full prompt string: task description, the rules for what counts as an
    event-related conversation, the required JSON answer shape
    ({"event_related_ids": [...]}) and one worked example.
  """
  # The example answers are written with double quotes so they are valid JSON,
  # matching the format the prompt asks the model to return.
  data_labeling_prompt = f"""Below is a list of messages from a multi-person conversation in a chat application like Slack.
Every line is a message, in ```ID - [user] - message``` format.

```
{messages_text}
```

In the list, there are messages or conversations that can be converted into a calendar event.
For example, a conversation participant may propose a meeting for a specific date, possibly a time, and that involves one or more participants.
Note that these calendar events may span one or more messages; it should not be expected that they will be self contained in one specific message.

Event-related conversations usually appear among a small group of particular users.
Pay attention to participated users.
Pay attention to a mentioned user that an answer is addressed to - a message can start with ```<user-name>:```
Please note that the conversation may not be in English.
Assume that all event-related conversations are dense - all in-the-middle messages belong to this conversation (even if they seem semantically as non-event-related).
It means that every event-related group/conversation must contain all adjacent messages.

Your task is to detect all event-related messages.
Return IDs of only event-related messages, return them in JSON format.

If there are no event-related messages in the list, your answer must be exactly {{"event_related_ids": []}}.

Example of an event-related conversation (format: ```ID - [user] - message```):
```
5 - [hstefan] - hey e_t_, been wrestling with some virtualbox resolution issues
6 - [e_t_] - hstefan: yeah? want to jump on a call and troubleshoot together?
7 - [flavio] - I could join if you’re doing a screen share debugging session
8 - [hstefan] - that sounds great - google meet or discord?
9 - [e_t_] - discord works for me. how about this evening around 8?
10 - [flavio] - +1 for discord, 8pm UTC?
11 - [hstefan] - works for me. I’ll send the invite link
12 - [e_t_] - cool, see you all then
```
And corresponding example of your answer:
{{"event_related_ids": [5, 6, 7, 8, 9, 10, 11, 12]}}
"""
  return data_labeling_prompt

In [None]:
start_index = np.random.randint(0, len(messages))
messages_text = df2text_v2(messages, start_index, n_rows=30)
data_labeling_prompt = get_data_labeling_prompt_v3(messages_text)
print(data_labeling_prompt)

Below is a list of messages from a multi-person conversation in a chat application like Slack.
Every line is a message, in ```ID - [user] - message``` format.

```
1 - [pradeepto] - arn_ just got network on dapper to work with some nice help from this place.
2 - [pradeepto] - arn_: I guess you have ask around
3 - [pradeepto] - arn_: good to know that.
4 - [pradeepto] - so what was the problem? and how did you fix it?
5 - [arn_] - dcopserver wasn't running correctly
6 - [arn_] - couldn't start kde
7 - [arn_] - I just ended up backing up my sources.list file - reinstalling and upgrading
8 - [pradeepto] - arn_ breezy with kde version == ?
9 - [arn_] - dapper with 3.5.2
10 - [pradeepto] - dapper ? w0ah when was that?
11 - [pradeepto] - I though you were on Breezy
12 - [arn_] - nope
13 - [pradeepto] - *though
14 - [pradeepto] - t
15 - [arn_] - never was on breezy
16 - [pradeepto] - hmm interesting
17 - [_patrick_] - du
18 - [speedy4] - ai
19 - [ricardo] - hey guys, im having some problems w

In [None]:
response = gpt(data_labeling_prompt, temperature=0.2)

In [None]:
print(messages_text)
print(f"Event-related message IDs: {response.content}")

1 - [pradeepto] - arn_ just got network on dapper to work with some nice help from this place.
2 - [pradeepto] - arn_: I guess you have ask around
3 - [pradeepto] - arn_: good to know that.
4 - [pradeepto] - so what was the problem? and how did you fix it?
5 - [arn_] - dcopserver wasn't running correctly
6 - [arn_] - couldn't start kde
7 - [arn_] - I just ended up backing up my sources.list file - reinstalling and upgrading
8 - [pradeepto] - arn_ breezy with kde version == ?
9 - [arn_] - dapper with 3.5.2
10 - [pradeepto] - dapper ? w0ah when was that?
11 - [pradeepto] - I though you were on Breezy
12 - [arn_] - nope
13 - [pradeepto] - *though
14 - [pradeepto] - t
15 - [arn_] - never was on breezy
16 - [pradeepto] - hmm interesting
17 - [_patrick_] - du
18 - [speedy4] - ai
19 - [ricardo] - hey guys, im having some problems with my printer. can u help me?
20 - [speedy4] - yes
21 - [kampfschwein] - hello
22 - [ricardo] - I'VE just intalled my printer an epson stylus color 670, using the 

In [None]:
# Label one random 30-message window and translate the model's window-local
# IDs (1-based) into `seqid` values.
n_rows = 30
start_index = np.random.randint(0, len(messages))
messages_text = df2text_v2(messages, start_index=start_index, n_rows=n_rows)
data_labeling_prompt = get_data_labeling_prompt_v3(messages_text)
response = gpt(data_labeling_prompt, temperature=0.0)
print(messages_text)
print(f"\nEvent-related message IDs: {response.content}")
event_message_ids = json.loads(response.content)["event_related_ids"]
if event_message_ids:
  print(event_message_ids)
  # The window is shorter than n_rows near the end of the frame, and the model
  # can return IDs that were never shown. Keep only IDs inside the window so a
  # bad ID cannot raise IndexError or select an unrelated row.
  window_len = min(n_rows, len(messages) - start_index)
  positions = [start_index + idx - 1 for idx in event_message_ids if 1 <= idx <= window_len]
  event_message_ids = messages["seqid"].iloc[positions].tolist()
  print(event_message_ids)

1 - [momal] - AHAH
2 - [Dr_willis] - or just yack with his wife for a few hrs...
3 - [Dr_willis] - :)
4 - [Dr_willis] - hey, that KDE issue was wild earlier. wanna do a screenshare and debug sometime?
5 - [xsacha] - sure, i've got some similar graphics config problems
6 - [soundmaster80] - count me in - i can bring some coffee and troubleshooting skills
7 - [momal] - group video call might be easier than individual screenshares
8 - [Dr_willis] - +1 on that. when works for everyone?
9 - [xsacha] - thursday evening UTC? around 8pm?
10 - [soundmaster80] - works for me! i'll setup a discord or jitsi link
11 - [pa] - hi
12 - [kolin] - what is some good dvd burning software for linux
13 - [pa] - just a quick question
14 - [soundmaster80] - OH DON"T WORRY....that thought has crossed my mind several times
15 - [kolin] - having some problems with k3b
16 - [fingster] - k3b
17 - [Dr_willis] - kolin,  i burn data dvd's with k3b
18 - [kolin] - downloaded a dvd in iso from
19 - [kolin] - saying it i

In [None]:
# Label a fixed 30-message window (reproducible variant of the random-window
# cell) and translate the model's window-local IDs (1-based) into `seqid` values.
n_rows = 30
start_index = 3455  # 7201
messages_text = df2text_v2(messages, start_index=start_index, n_rows=n_rows)
data_labeling_prompt = get_data_labeling_prompt_v3(messages_text)
response = gpt(data_labeling_prompt, temperature=0.0)
print(messages_text)
print(f"\nEvent-related message IDs: {response.content}")
event_message_ids = json.loads(response.content)["event_related_ids"]
if event_message_ids:
  print(event_message_ids)
  # Keep only IDs that fall inside the window actually shown to the model, so
  # a bad ID cannot raise IndexError or select an unrelated row.
  window_len = min(n_rows, len(messages) - start_index)
  positions = [start_index + idx - 1 for idx in event_message_ids if 1 <= idx <= window_len]
  event_message_ids = messages["seqid"].iloc[positions].tolist()
  print(event_message_ids)

1 - [momal] - AHAH
2 - [Dr_willis] - or just yack with his wife for a few hrs...
3 - [Dr_willis] - :)
4 - [Dr_willis] - hey, that KDE issue was wild earlier. wanna do a screenshare and debug sometime?
5 - [xsacha] - sure, i've got some similar graphics config problems
6 - [soundmaster80] - count me in - i can bring some coffee and troubleshooting skills
7 - [momal] - group video call might be easier than individual screenshares
8 - [Dr_willis] - +1 on that. when works for everyone?
9 - [xsacha] - thursday evening UTC? around 8pm?
10 - [soundmaster80] - works for me! i'll setup a discord or jitsi link
11 - [pa] - hi
12 - [kolin] - what is some good dvd burning software for linux
13 - [pa] - just a quick question
14 - [soundmaster80] - OH DON"T WORRY....that thought has crossed my mind several times
15 - [kolin] - having some problems with k3b
16 - [fingster] - k3b
17 - [Dr_willis] - kolin,  i burn data dvd's with k3b
18 - [kolin] - downloaded a dvd in iso from
19 - [kolin] - saying it i

In [None]:
# Flag (1/0) the messages whose seqid was labeled event-related. `isin` is a
# single vectorized membership test instead of a per-row scan of the list.
messages["event"] = messages["seqid"].isin(event_message_ids).astype("int64")

In [None]:
messages[messages["event"] > 0]

Unnamed: 0,user,message,ts,seqid,datetime,delay,message_n_chars,event
3458,Dr_willis,"hey, that KDE issue was wild earlier. wanna do a screenshare and debug sometime?",1704172000.0,444196,2024-01-02 05:12:45.799705982,0.1,80,1
3459,xsacha,"sure, i've got some similar graphics config problems",1704172000.0,444197,2024-01-02 05:12:45.899705887,0.1,52,1
3460,soundmaster80,count me in - i can bring some coffee and troubleshooting skills,1704172000.0,444198,2024-01-02 05:12:45.999705791,0.1,64,1
3461,momal,group video call might be easier than individual screenshares,1704172000.0,444199,2024-01-02 05:12:46.099705696,0.1,61,1
3462,Dr_willis,+1 on that. when works for everyone?,1704172000.0,444200,2024-01-02 05:12:46.199705601,0.1,36,1
3463,xsacha,thursday evening UTC? around 8pm?,1704172000.0,444201,2024-01-02 05:12:46.299705505,0.1,33,1
3464,soundmaster80,works for me! i'll setup a discord or jitsi link,1704172000.0,444202,2024-01-02 05:12:46.399705410,0.1,48,1


In [None]:
# Label the whole frame window by window. Each window is extended by `overlap`
# rows on both sides so conversations that straddle a boundary are seen whole.
window_size = 30
overlap = 5
start = 0
end = len(messages)
event_message_ids = []

for start_index in range(start, end, window_size):
  print(start_index, end=", ")
  # Clamp the extended window to the frame. Without this the first window
  # starts at a negative position, which slices from the END of the frame
  # (an empty prompt) and maps returned IDs onto the wrong rows.
  window_start = max(start_index - overlap, 0)
  window_stop = min(start_index + window_size + overlap, len(messages))
  window_len = window_stop - window_start
  messages_text = df2text_v2(messages, start_index=window_start, n_rows=window_len)
  data_labeling_prompt = get_data_labeling_prompt_v3(messages_text)
  response = gpt(data_labeling_prompt, temperature=0.0)
  if response.content:
    idxs = json.loads(response.content)["event_related_ids"]
    # IDs are 1-based and local to the window; ignore any the model returns
    # that were not shown, so they cannot raise or select unrelated rows.
    positions = [window_start + idx - 1 for idx in idxs if 1 <= idx <= window_len]
    event_message_ids.extend(messages["seqid"].iloc[positions].tolist())

# Overlapping windows report the same message more than once; keep the first
# occurrence of each seqid, preserving order.
event_message_ids = list(dict.fromkeys(event_message_ids))
event_message_ids

0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 630, 660, 690, 720, 750, 780, 810, 840, 870, 900, 930, 960, 990, 1020, 1050, 1080, 1110, 1140, 1170, 1200, 1230, 1260, 1290, 1320, 1350, 1380, 1410, 1440, 1470, 1500, 1530, 1560, 1590, 1620, 1650, 1680, 1710, 1740, 1770, 1800, 1830, 1860, 1890, 1920, 1950, 1980, 2010, 2040, 2070, 2100, 2130, 2160, 2190, 2220, 2250, 2280, 2310, 2340, 2370, 2400, 2430, 2460, 2490, 2520, 2550, 2580, 2610, 2640, 2670, 2700, 2730, 2760, 2790, 2820, 2850, 2880, 2910, 2940, 2970, 3000, 3030, 3060, 3090, 3120, 3150, 3180, 3210, 3240, 3270, 3300, 3330, 3360, 3390, 3420, 3450, 3480, 3510, 3540, 3570, 3600, 3630, 3660, 3690, 3720, 3750, 3780, 3810, 3840, 3870, 3900, 3930, 3960, 3990, 4020, 4050, 4080, 4110, 4140, 4170, 4200, 4230, 4260, 4290, 4320, 4350, 4380, 4410, 4440, 4470, 4500, 4530, 4560, 4590, 4620, 4650, 4680, 4710, 4740, 4770, 4800, 4830, 4860, 4890, 4920, 4950, 4980, 5010, 5040, 5070, 5100, 5130, 5160, 5

[441162,
 441163,
 441164,
 441165,
 441166,
 441167,
 441168,
 441897,
 441898,
 441899,
 441900,
 441901,
 441902,
 441903,
 441904,
 441900,
 441901,
 441902,
 441903,
 441904,
 442947,
 442948,
 442949,
 442950,
 442951,
 442952,
 442953,
 442954,
 442951,
 442952,
 442953,
 442954,
 443732,
 443733,
 443734,
 443735,
 443737,
 443739,
 443734,
 443735,
 443737,
 443739,
 443943,
 443946,
 443951,
 443955,
 443958,
 443964,
 443967,
 443970,
 443972,
 444196,
 444197,
 444198,
 444199,
 444200,
 444201,
 444202,
 444822,
 444823,
 444823,
 444824,
 444825,
 444826,
 444827,
 444828,
 444829,
 445835,
 445836,
 445837,
 445838,
 445839,
 445840,
 445841,
 445835,
 445836,
 445837,
 445838,
 445839,
 445840,
 445841,
 445967,
 445968,
 445969,
 445970,
 445971,
 445972,
 445973,
 446887,
 446888,
 446889,
 446890,
 446891,
 446892,
 446893,
 446894,
 446887,
 446888,
 446889,
 446890,
 446891,
 446892,
 446893,
 446894,
 447636,
 447637,
 447638,
 447639,
 447636,
 447637,
 447638,
 

In [None]:
len(set(event_message_ids))

108

In [None]:
# Flag (1/0) the messages whose seqid was labeled event-related. `isin` is a
# single vectorized membership test instead of a per-row scan of the list.
messages["event"] = messages["seqid"].isin(event_message_ids).astype("int64")

In [None]:
messages[messages["event"] > 0].shape

(108, 8)

In [None]:
messages["event"].mean()

0.0108

In [None]:
# Remove the helper columns added during EDA. errors="ignore" keeps the cell
# re-runnable: a second run no longer raises KeyError for columns already dropped.
messages = messages.drop(columns=["datetime", "delay", "message_n_chars"], errors="ignore")

In [None]:
# Print the text of one randomly chosen message labeled as event-related.
event_texts = messages.loc[messages["event"] > 0, "message"]
print(event_texts.sample().iloc[0])

i could join too if you want another perspective


In [None]:
# messages.to_parquet(messages_path, compression='gzip')