# **IMPORTS & PATHS** 

In [43]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil


# Paths and model settings\ nZIP_PATH = 'I-Soon-data.zip'      # Path to your downloaded zip file
ZIP_PATH = '0.zip'    # Directory to extract contents
EXTRACT_DIR = 'I-Soon-data'        # Directory to extract contents
ORGANIZED_DIR = 'organized_data'  # Directory to group files by extension
MODEL_NAME = 'llama3.2'             # Local Ollama model identifier
OUTPUT_JSON = 'parsed_md.json'    # Aggregated JSON output
OUTPUT_CSV = 'parsed_md.csv'      # CSV output for DataFrame

# Initialize Ollama LLM via LangChain
model = OllamaLLM(model=MODEL_NAME)

# **ZIP FILE EXTRACTION**

In [44]:
if not os.path.isdir(EXTRACT_DIR):
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")
else:
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")

Extraction directory 'I-Soon-data' already exists


# **DATA TYPE CATEGORIZATION**

In [45]:
import os
import shutil
import re

# regex to detect any HTML tag
html_re = re.compile(r'<[A-Za-z/][^>]*>')

for root, _, files in os.walk(EXTRACT_DIR):
    for fname in files:
        ext = os.path.splitext(fname)[1].lower().lstrip('.') or 'no_extension'
        src_path = os.path.join(root, fname)

        if ext == 'md':
            # classify Markdown by content into md/html or md/non-html
            with open(src_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            subfolder = 'html' if html_re.search(text) else 'non-html'
            dest_folder = os.path.join(ORGANIZED_DIR, 'md', subfolder)
        else:
            # preserve original extension grouping for non-MD
            dest_folder = os.path.join(ORGANIZED_DIR, ext)

        os.makedirs(dest_folder, exist_ok=True)
        dst_path = os.path.join(dest_folder, fname)
        shutil.copy2(src_path, dst_path)

print(f"Organized files under '{ORGANIZED_DIR}':\n"
      " • Markdown with HTML → md/html/\n"
      " • Markdown without HTML → md/non-html/\n"
      " • Other extensions → <extension>/")

Organized files under 'organized_data':
 • Markdown with HTML → md/html/
 • Markdown without HTML → md/non-html/
 • Other extensions → <extension>/


# **MD DATA TRANSFORMATIO TO JSON**

In [46]:
import os
import json
import shutil
from docling.document_converter import DocumentConverter

# Initialize converter
converter = DocumentConverter()

# Define input/output directories
input_dir = "organized_data/md/html"
output_dir = "html_markdown_to_json"
failed_dir = "failed_parsing"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(failed_dir, exist_ok=True)

# Process each Markdown file
for fname in os.listdir(input_dir):
    if fname.endswith(".md"):
        input_path = os.path.join(input_dir, fname)
        
        try:
            result = converter.convert(input_path)
            doc_dict = result.document.model_dump()
            
            # Output filename
            base = os.path.splitext(fname)[0]
            output_path = os.path.join(output_dir, f"{base}_docling.json")
            
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(doc_dict, f, indent=2)
            
            print(f"✅ Converted: {fname} → {output_path}")
        
        except Exception as e:
            # Move to failed folder
            shutil.move(input_path, os.path.join(failed_dir, fname))
            print(f"❌ Failed to convert {fname}: {e} (moved to {failed_dir})")

✅ Converted: dbc9c90e-a3e6-4d71-bb93-5fb8394095ac.md → html_markdown_to_json/dbc9c90e-a3e6-4d71-bb93-5fb8394095ac_docling.json
✅ Converted: 28.md → html_markdown_to_json/28_docling.json
✅ Converted: 5.md → html_markdown_to_json/5_docling.json
✅ Converted: 38.md → html_markdown_to_json/38_docling.json
✅ Converted: 9d7bc879-3250-4013-ac04-5ff9bd6dff40.md → html_markdown_to_json/9d7bc879-3250-4013-ac04-5ff9bd6dff40_docling.json
✅ Converted: 18.md → html_markdown_to_json/18_docling.json
❌ Failed to convert ._3.md: 'utf-8' codec can't decode byte 0xfb in position 37: invalid start byte (moved to failed_parsing)
✅ Converted: 9fd06037-11f1-4ad5-9a7d-cbfb3fa4193b.md → html_markdown_to_json/9fd06037-11f1-4ad5-9a7d-cbfb3fa4193b_docling.json
✅ Converted: 3348953d-66e9-4cac-8675-65bb5f2ef929.md → html_markdown_to_json/3348953d-66e9-4cac-8675-65bb5f2ef929_docling.json
✅ Converted: 1.md → html_markdown_to_json/1_docling.json
✅ Converted: 19.md → html_markdown_to_json/19_docling.json
✅ Converted: 07f

# **JSON TO DATAFRAME WITH AI**
# *Here we use AI to analyze all of the JSON files and identify which headers are usefull to keep. We store all the identified headers in a dictionary with the name of the json file as key and value the list contaning the headers*

In [50]:
import os
import json
import ast  # To safely evaluate header string to list

# Folder path
folder_path = "html_markdown_to_json"

# Store headers proposed for each file
file_headers = {}

# Loop through each file in the directory
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                doc_json = json.load(f)

            # Safely navigate the structure
            tables = doc_json.get("tables", [])
            if not tables:
                print(f"Skipped {filename}: no tables found.")
                continue

            cells = tables[0].get("data", {}).get("table_cells", [])
            sample_cells = cells[:20]  # Use a subset to limit input size

            # Format as JSON string
            cell_str = json.dumps(sample_cells, indent=2, ensure_ascii=False)

            # Prepare the prompt
            prompt = f"""
You are a law enforcement officer that specializes in investigating cybersecurity data leaks. Analyze the following JSON table cell data and infer what column headers would be appropriate
if this were converted into a pandas DataFrame. 

Only return a list of column names that are relevant to the data. Do not include any other text or explanation.

JSON data:
{cell_str}
"""

            # Get headers from the model
            headers_response = model(prompt)

            # Convert stringified list to actual Python list safely
            try:
                headers_list = ast.literal_eval(headers_response.strip())
                if isinstance(headers_list, list):
                    file_headers[filename] = headers_list
                else:
                    print(f"Warning: Invalid header format in {filename}")
            except Exception as eval_err:
                print(f"Header parsing error in {filename}: {eval_err}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Print the headers dictionary
for fname, headers in file_headers.items():
    print(f"{fname}: {headers}")

# Print the headers dictionary
print("\\n --- Headers Dicitonary --- \\n")
print(file_headers)

Skipped 9fe6b262-9944-417d-a0c4-9f2de1de2994_docling.json: no tables found.
Skipped 6d7fc7b3-c892-4cb5-bd4b-a5713c089d88_docling.json: no tables found.
Skipped 912204cb-8ab7-48b8-9abf-d803f3804d08_docling.json: no tables found.
Skipped b3031e66-40b6-45e8-9bcd-891dc1a280da_docling.json: no tables found.
Skipped dbc9c90e-a3e6-4d71-bb93-5fb8394095ac_docling.json: no tables found.
Skipped eda5b003-9250-4913-b724-74cca86240af_docling.json: no tables found.
Skipped 12756724-394c-4576-b373-7c53f1abbd94_docling.json: no tables found.
Skipped 5a6b122c-39c1-4581-8c1f-2d6f36a9f8a0_docling.json: no tables found.
Skipped 547aba02-6757-49c1-acb5-6df217cebfc7_docling.json: no tables found.
Skipped 54990932-71af-48dd-9a7a-2617b1407c54_docling.json: no tables found.
Skipped 585875ff-f8c5-4a02-acd7-fef37dc9ff11_docling.json: no tables found.
Skipped fe245192-1f9c-4f28-9b32-046fb7ce7e1e_docling.json: no tables found.
Skipped 3f451a52-d210-48d9-b56e-d28b9570bdc4_docling.json: no tables found.
Skipped 2db2

# **LLM ANALYSIS OF THE DICTIONARY OF HEADERS FROM JSON**
# *Here we use the LLM model's capability to analyze the dicitonary of headers and then use the most relevant headers*

In [None]:
import json

# Convert the dictionary to a readable string
headers_str = json.dumps(file_headers, indent=2, ensure_ascii=False)

# Prompt the LLM to synthesize the best header list
prompt = f"""
You are a forensic data analyst reviewing metadata extracted from structured JSON communication logs.

The dictionary below maps filenames to lists of inferred DataFrame column headers. Your task is to review all these header lists and decide on a **single, unified list** of column names that would best represent the core information relevant to a cybersecurity investigation.

The list should be concise, semantically meaningful, and avoid redundant or purely structural fields unless they are essential for analysis.

Return only the final list in valid Python list syntax (e.g., ["Time", "From", "To", "Message"]) with no explanation.

Header dictionary:
{headers_str}
"""

# Send prompt to model
final_headers = model(prompt)

# Print the result
print("Final consolidated DataFrame headers:")
print(final_headers.strip())

Final consolidated DataFrame headers:
["Time", "From", "To", "Message"]


# **HERE WE CREATE A DATAFRAME FOR EVERY JSON FILE IN THE FOLDER HTML_MARKDOWN_TO_JSON**

In [53]:
import os
import json
import pandas as pd

# Folder with JSON files
folder_path = "html_markdown_to_json"

# Final headers from LLM
headers_response = ["Time", "From", "To", "Message"]

# Track DataFrame names
loaded_dataframe_names = []

# Loop through all JSON files
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        json_path = os.path.join(folder_path, filename)
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                doc = json.load(f)

            tables = doc.get("tables", [])
            if not tables:
                continue

            for i, table in enumerate(tables):
                cells = table.get("data", {}).get("table_cells", [])

                # Organize by row based on row indices
                rows_dict = {}
                for cell in cells:
                    row_idx = cell.get("start_row_offset_idx")
                    col_idx = cell.get("start_col_offset_idx")
                    text = cell.get("text", "")
                    if row_idx is not None and col_idx is not None:
                        if row_idx not in rows_dict:
                            rows_dict[row_idx] = {}
                        rows_dict[row_idx][col_idx] = text

                # Convert row dict to sorted list of rows
                sorted_rows = []
                for row_idx in sorted(rows_dict.keys()):
                    row = rows_dict[row_idx]
                    max_col = max(row.keys()) if row else -1
                    row_list = [row.get(col, "") for col in range(max_col + 1)]
                    sorted_rows.append(row_list)

                if not sorted_rows:
                    continue

                # Truncate headers to match the number of columns in the data
                header_cols = headers_response[:len(sorted_rows[0])]
                df = pd.DataFrame(sorted_rows, columns=header_cols)

                df_name = f"{filename} - table {i}"
                loaded_dataframe_names.append(df_name)

                print(f"\n=== {df_name} (shape: {df.shape}) ===")
                display(df)

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Print names of all successfully loaded DataFrames
print("\n=== Loaded DataFrame Names ===")
for name in loaded_dataframe_names:
    print(name)


=== 28_docling.json - table 0 (shape: (47, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-08-11 05:51:47,wxid_xusilpfkh31g21,Shutd0wn,吴总，现在方便吗，想和你通个电话，有个事情汇报一下
2,2022-08-11 05:51:59,Shutd0wn,wxid_xusilpfkh31g21,嗯
3,2022-08-11 05:52:40,Shutd0wn,wxid_xusilpfkh31g21,？
4,2022-08-11 06:29:08,wxid_xusilpfkh31g21,Shutd0wn,花雨夜和! 4p47hy的聊天记录
5,2022-08-11 06:33:23,Shutd0wn,wxid_xusilpfkh31g21,嗯，那他也不知道
6,2022-08-11 06:38:04,wxid_xusilpfkh31g21,Shutd0wn,是的
7,2022-08-11 07:46:23,Shutd0wn,wxid_xusilpfkh31g21,她说了哪些信息点让你觉得知道的太多了，我评估评估
8,2022-08-11 07:51:14,wxid_xusilpfkh31g21,Shutd0wn,1、规定这样的报销制度的原因到底是如何：如是公司资金紧张而不发，那为什么还要组织团建，而且团...
9,2022-08-11 07:52:08,wxid_xusilpfkh31g21,Shutd0wn,2、奇安信之前说要入股，因为20年没法年终奖导致人心涣散，最终很多核心人员被奇安信挖走，然后...



=== 19_docling.json - table 0 (shape: (22, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-11-30 02:58:46,ibabaimama,wxid_12n748um1thl21,客户看了，对菲关注度不大，能搞越南或缅甸不？
2,2021-11-30 02:59:51,wxid_12n748um1thl21,ibabaimama,好的我问问
3,2021-11-30 06:02:07,wxid_12n748um1thl21,ibabaimama,我目前只掌握啦越南社科院的
4,2021-11-30 06:02:23,wxid_12n748um1thl21,ibabaimama,[破涕为笑]鸡肋部门
5,2021-11-30 06:21:07,ibabaimama,wxid_12n748um1thl21,[捂脸]看看电信的能搞不，vitta
6,2021-11-30 06:22:03,wxid_12n748um1thl21,ibabaimama,嗯，之前进去过
7,2021-11-30 06:22:12,wxid_12n748um1thl21,ibabaimama,第二天就被扫出来了
8,2021-11-30 06:22:21,wxid_12n748um1thl21,ibabaimama,我好熟悉这个名字
9,2021-11-30 06:22:25,wxid_12n748um1thl21,ibabaimama,[呲牙]



=== 29_docling.json - table 0 (shape: (81, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2019-07-23 05:54:00,wxid_xusilpfkh31g21,hack05112,你的报销怎么直接发给鸭总审批了
2,2019-07-23 05:56:58,hack05112,wxid_xusilpfkh31g21,我一直都是他批啊，费用的都是他批，出差的都是C总批
3,2019-07-23 05:57:10,hack05112,wxid_xusilpfkh31g21,你才注意啊
4,2019-07-23 05:59:10,wxid_xusilpfkh31g21,hack05112,为啥搞这么麻烦
...,...,...,...,...
76,2019-07-23 06:31:48,hack05112,wxid_xusilpfkh31g21,就是说我们今年的这个
77,2019-07-23 06:31:50,hack05112,wxid_xusilpfkh31g21,8月份开
78,2019-07-23 06:32:02,wxid_xusilpfkh31g21,hack05112,好吧，今年晚了
79,2019-07-23 06:32:06,wxid_xusilpfkh31g21,hack05112,往年都是5月份



=== 18_docling.json - table 0 (shape: (85, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-06-12 13:35:10,wxid_zb45i0rc71yk21,wxid_12n748um1thl21,看来灰灰去销售那边儿的油水还是比较好
2,2021-06-12 13:44:51,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,？？？
3,2021-06-12 13:45:00,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,钱还你
4,2021-06-12 13:45:13,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,了
...,...,...,...,...
80,2021-06-12 13:59:11,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,东西稳定
81,2021-06-12 13:59:16,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,交了他们自己研究
82,2021-06-12 13:59:27,wxid_zb45i0rc71yk21,wxid_12n748um1thl21,Jd的都这样
83,2021-06-12 13:59:31,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,嗯



=== 15_docling.json - table 0 (shape: (200, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-11-30 04:18:43,wxid_zb45i0rc71yk21,wxid_12n748um1thl21,
2,2021-11-30 05:59:08,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,有没有越南缅甸的qb
3,2021-11-30 05:59:18,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,好好找找
4,2021-11-30 05:59:29,wxid_zb45i0rc71yk21,wxid_12n748um1thl21,没有
...,...,...,...,...
195,2021-12-06 02:27:38,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,急的跳脚
196,2021-12-06 02:27:41,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,东西呢
197,2021-12-06 10:46:22,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,人呢
198,2021-12-06 10:46:27,wxid_12n748um1thl21,wxid_zb45i0rc71yk21,你大爷的



=== 40_docling.json - table 0 (shape: (130, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-03-15 15:24:52,wxid_icges6alg8cl21,wxid_kbys0kvzj4ta12,谢总
2,2022-03-15 15:25:34,wxid_kbys0kvzj4ta12,wxid_icges6alg8cl21,？
3,2022-03-15 15:26:00,wxid_icges6alg8cl21,wxid_kbys0kvzj4ta12,最近公司咋样啊
4,2022-03-15 15:26:03,wxid_icges6alg8cl21,wxid_kbys0kvzj4ta12,忙啥子哦
...,...,...,...,...
125,2022-03-15 16:37:14,wxid_icges6alg8cl21,wxid_kbys0kvzj4ta12,现在有啥拿的出手的
126,2022-03-15 16:41:01,wxid_kbys0kvzj4ta12,wxid_icges6alg8cl21,公司产品就是废品 但是又不能没有 总得搞些产品去假装签合同或者去忽悠 现在三部分都是卖服务 ...
127,2022-03-15 18:37:57,wxid_icges6alg8cl21,wxid_kbys0kvzj4ta12,https://zhuanlan.zhihu.com/p/435968069
128,2022-03-15 18:38:00,wxid_icges6alg8cl21,wxid_kbys0kvzj4ta12,公司搞这个吧



=== 12_docling.json - table 0 (shape: (393, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-03-02 06:54:15,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,你辞职了吗？
2,2021-03-02 06:54:45,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,还是继续挂个身份 跑湖南
3,2021-03-02 07:09:53,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,湖南估计也保不住了
4,2021-03-02 07:10:07,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,保不住是啥意思
...,...,...,...,...
388,2021-03-10 12:29:51,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,毛利？
389,2021-03-10 12:30:02,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,对
390,2021-03-10 12:30:06,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,跟你们一样
391,2021-03-10 12:30:14,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,只是点少点儿



=== 24_docling.json - table 0 (shape: (56, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-04-25 09:58:01,wxid_7p054rmzkhqf21,wxid_5390224027312,
2,2022-04-28 04:24:45,wxid_5390224027312,wxid_7p054rmzkhqf21,陆总，这个我问了团队那边，没有现成的，可以做
3,2022-04-28 04:25:23,wxid_7p054rmzkhqf21,wxid_5390224027312,可以做是价格有没有大概的预算
4,2022-04-28 04:26:29,wxid_5390224027312,wxid_7p054rmzkhqf21,没有，他是新团队，刚找招的人
5,2022-04-28 04:26:38,wxid_5390224027312,wxid_7p054rmzkhqf21,
6,2022-04-28 04:26:46,wxid_5390224027312,wxid_7p054rmzkhqf21,看你们这边现在的一个情况
7,2022-04-28 04:26:52,wxid_5390224027312,wxid_7p054rmzkhqf21,有没有经费预算，有多少
8,2022-04-28 04:26:55,wxid_5390224027312,wxid_7p054rmzkhqf21,时间急不急
9,2022-04-28 04:27:08,wxid_7p054rmzkhqf21,wxid_5390224027312,不算很着急，可以慢慢做



=== 23_docling.json - table 0 (shape: (25, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2023-01-09 02:28:07,wxid_hlmnhsq64tt722,wxid_12n748um1thl21,
2,2023-01-09 02:28:14,wxid_hlmnhsq64tt722,wxid_12n748um1thl21,等一下 平台有点问题
3,2023-01-09 02:28:18,wxid_12n748um1thl21,wxid_hlmnhsq64tt722,好的
4,2023-01-09 02:36:19,wxid_hlmnhsq64tt722,wxid_12n748um1thl21,https://74.120.172.10:10092/home
5,2023-01-09 02:36:25,wxid_hlmnhsq64tt722,wxid_12n748um1thl21,access OrFRXV LZtestUser lzqzmp@123
6,2023-01-09 02:43:51,wxid_12n748um1thl21,wxid_hlmnhsq64tt722,演示视屏发一个
7,2023-01-09 02:44:06,wxid_12n748um1thl21,wxid_hlmnhsq64tt722,这个资料都不用给了
8,2023-01-09 02:44:09,wxid_12n748um1thl21,wxid_hlmnhsq64tt722,[呲牙]
9,2023-01-09 02:44:20,wxid_hlmnhsq64tt722,wxid_12n748um1thl21,这是微软的试用版



=== 6_docling.json - table 0 (shape: (137, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2020-08-24 01:12:37,ken73224,wxid_hlmnhsq64tt722,我说湖州那里约好了跟我说一下
2,2020-08-24 01:52:54,wxid_hlmnhsq64tt722,ken73224,周四
3,2020-08-24 01:53:02,wxid_hlmnhsq64tt722,ken73224,厦门合同发过来了
4,2020-08-24 01:53:33,ken73224,wxid_hlmnhsq64tt722,湖州约的周四吧
...,...,...,...,...
132,2020-08-25 06:54:43,ken73224,wxid_hlmnhsq64tt722,嗯，飞了
133,2020-08-25 06:55:11,wxid_hlmnhsq64tt722,ken73224,一个月了 天天给我打电话
134,2020-08-25 06:55:32,wxid_hlmnhsq64tt722,ken73224,我就为了过个开票
135,2020-08-25 06:56:22,ken73224,wxid_hlmnhsq64tt722,嗯



=== 1_docling.json - table 0 (shape: (548, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2020-08-01 01:44:05,lengmo,Shutd0wn,
2,2020-08-01 01:44:39,lengmo,Shutd0wn,云境
3,2020-08-01 01:51:15,Shutd0wn,lengmo,[Grin]
4,2020-08-01 01:53:08,lengmo,Shutd0wn,靶场是个大趋势
...,...,...,...,...
543,2020-08-29 04:13:03,Shutd0wn,lengmo,2点开个网络会议，和tb他们
544,2020-08-29 04:13:10,Shutd0wn,lengmo,你留好时间
545,2020-08-29 04:14:18,Shutd0wn,lengmo,关于12培训的事
546,2020-08-29 04:26:11,lengmo,Shutd0wn,嗯



=== 36_docling.json - table 0 (shape: (200, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-06-13 01:55:11,wxid_c9yv0nsla3yn22,wxid_zb45i0rc71yk21,扬州那边个人pc通道，今天能给他们吗[捂脸]
2,2022-06-13 02:40:45,wxid_zb45i0rc71yk21,wxid_c9yv0nsla3yn22,可以
3,2022-06-13 02:41:11,wxid_c9yv0nsla3yn22,wxid_zb45i0rc71yk21,可以的
4,2022-06-13 02:41:22,wxid_c9yv0nsla3yn22,wxid_zb45i0rc71yk21,听说又不换了的嘛
...,...,...,...,...
195,2022-06-13 10:34:56,wxid_c9yv0nsla3yn22,wxid_zb45i0rc71yk21,[捂脸]
196,2022-06-13 10:34:57,wxid_zb45i0rc71yk21,wxid_c9yv0nsla3yn22,等客户确定要弄了
197,2022-06-13 10:34:58,wxid_c9yv0nsla3yn22,wxid_zb45i0rc71yk21,嗯可以的
198,2022-06-13 10:35:06,wxid_zb45i0rc71yk21,wxid_c9yv0nsla3yn22,我就要去跟他好好谈这事了



=== 31_docling.json - table 0 (shape: (45, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-09-11 04:46:20,gzp1991101,yanzi542766277,早上老板问了些啥哇[旺柴]
2,2022-09-11 06:36:52,yanzi542766277,gzp1991101,喊说我说自己下半年的项目情况 ，有没有交付困难
3,2022-09-11 06:36:58,yanzi542766277,gzp1991101,其实我几分钟就说完了
4,2022-09-11 06:37:18,gzp1991101,yanzi542766277,这些不都汇报了好多次了
5,2022-09-11 06:37:23,yanzi542766277,gzp1991101,然后说了一下，有交付困难的
6,2022-09-11 06:38:06,gzp1991101,yanzi542766277,就吴总和C总啊
7,2022-09-11 06:38:26,yanzi542766277,gzp1991101,还有王宴，张凤，王总，李正侠
8,2022-09-11 06:42:18,gzp1991101,yanzi542766277,然后拉着你说了好久啊
9,2022-09-11 06:43:57,yanzi542766277,gzp1991101,不是



=== 13_docling.json - table 0 (shape: (245, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-11-25 10:06:17,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,沛哥，手里还有越南的东西吗？
2,2021-11-25 10:06:24,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,可以更新的
3,2021-11-25 10:07:03,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,不可以更新
4,2021-11-25 10:07:14,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,越南社科院可以更新
...,...,...,...,...
240,2021-12-20 07:22:35,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,有几个字段和起止时间在确认
241,2021-12-20 14:16:26,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,SD投标书.zip
242,2021-12-20 14:16:31,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,密码你手机
243,2021-12-20 14:17:21,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,商务部分，能做的我都做了。有些东西让老赵确认一下加不加，格式里面我留了的，不需要就删除了



=== 41_docling.json - table 0 (shape: (323, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-09-27 07:14:01,lengmo,wxid_5390224027312,你们那边现在咋说
2,2022-09-27 07:14:06,lengmo,wxid_5390224027312,直接周五走么
3,2022-09-27 07:15:08,wxid_5390224027312,lengmo,现在大家都在讨论咋整，他们意思是喊等到国庆后8号，让大家缓一下，但是大概率协调不下来
4,2022-09-27 07:15:37,lengmo,wxid_5390224027312,大家情绪咋样
...,...,...,...,...
318,2023-01-11 03:02:35,lengmo,wxid_5390224027312,投标的时候，就用的我给的参数？
319,2023-01-11 03:02:57,wxid_5390224027312,lengmo,嗯
320,2023-01-11 03:06:19,wxid_5390224027312,lengmo,招标的时候只有产品名称，没有具体参数
321,2023-01-11 03:06:28,wxid_5390224027312,lengmo,投标的时候直接就用的你发的这个



=== 14_docling.json - table 0 (shape: (2, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-11-25 08:47:37,wxid_blw54o1q0q5w22,wxid_zb45i0rc71yk21,利来国际：\nhttps://gd9189.com/deposit\n15096047853...



=== 7_docling.json - table 0 (shape: (197, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-01-10 07:15:44,wxid_7p054rmzkhqf21,gzp1991101,你那个，别人说搞点小样看一下
2,2022-01-10 07:18:54,gzp1991101,wxid_7p054rmzkhqf21,数据小样.7z
3,2022-01-10 07:18:59,gzp1991101,wxid_7p054rmzkhqf21,万里长城万里长@20220110
4,2022-01-10 07:36:32,gzp1991101,wxid_7p054rmzkhqf21,
...,...,...,...,...
192,2022-01-21 04:44:29,wxid_7p054rmzkhqf21,gzp1991101,龚总
193,2022-01-21 04:56:47,gzp1991101,wxid_7p054rmzkhqf21,小胜
194,2022-01-21 04:57:18,wxid_7p054rmzkhqf21,gzp1991101,这不请我们蹦迪庆祝一下
195,2022-01-21 05:16:05,gzp1991101,wxid_7p054rmzkhqf21,？



=== 30_docling.json - table 0 (shape: (323, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-04-02 04:04:43,gzp1991101,wxid_mgh25nentc4u22,样本.7z
2,2021-04-02 04:05:00,gzp1991101,wxid_mgh25nentc4u22,密码跟昨天的一样
3,2021-04-02 11:39:16,wxid_mgh25nentc4u22,gzp1991101,龚哥，我自闭了
4,2021-04-02 11:39:22,wxid_mgh25nentc4u22,gzp1991101,我打算顺其自然
...,...,...,...,...
318,2022-07-12 11:56:48,wxid_mgh25nentc4u22,gzp1991101,以后来还长，认识你们是更宝贵的财富
319,2022-07-12 11:56:50,gzp1991101,wxid_mgh25nentc4u22,[捂脸]
320,2022-07-12 11:57:24,gzp1991101,wxid_mgh25nentc4u22,来日方长
321,2022-07-12 11:57:27,wxid_mgh25nentc4u22,gzp1991101,对



=== 37_docling.json - table 0 (shape: (13, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-03-05 11:03:10,lengmo,just910420,咋样？现在提成这些提高了嘛
2,2021-03-05 11:03:42,just910420,lengmo,不知道，伟哥说的下周一告诉我结果。
3,2021-03-05 11:03:53,lengmo,just910420,啥结果
4,2021-03-05 11:04:11,just910420,lengmo,我不知道谈了什么
5,2021-03-05 11:04:17,just910420,lengmo,伟哥说下周一告诉我
6,2021-03-05 11:04:20,just910420,lengmo,[Facepalm][Facepalm][Facepalm]
7,2021-03-05 11:04:24,lengmo,just910420,就是提出的一些问题
8,2021-03-05 11:04:35,lengmo,just910420,售前比例提成加高
9,2021-03-05 11:05:53,just910420,lengmo,谢谢C总\n我这边周末也在权衡考虑下吧。



=== 22_docling.json - table 0 (shape: (92, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-09 03:17:39,wxid_7p054rmzkhqf21,SWEET5683yao,https://t.wss.ink/f/89rr3wno4i5 复制链接到浏览器打开
2,2022-05-09 03:17:52,wxid_7p054rmzkhqf21,SWEET5683yao,重新搞了一些小样，量多了一些，密码不变
3,2022-05-09 03:18:33,SWEET5683yao,wxid_7p054rmzkhqf21,好
4,2022-05-09 03:19:11,wxid_7p054rmzkhqf21,SWEET5683yao,北约有兴趣么
...,...,...,...,...
87,2022-07-07 07:17:49,wxid_7p054rmzkhqf21,SWEET5683yao,盖章工作日都要两天，主要是sm ，非工作日更加来不及
88,2022-07-07 07:19:15,SWEET5683yao,wxid_7p054rmzkhqf21,[捂脸]不知道为啥你们是寄的，其他公司都是通知现场去拿的
89,2022-07-07 07:19:50,wxid_7p054rmzkhqf21,SWEET5683yao,而且联系人还不是我，联系我们公司其他人，他们搞了半天才转给我
90,2022-07-07 07:20:00,wxid_7p054rmzkhqf21,SWEET5683yao,我今天才知道[苦涩]



=== 16_docling.json - table 0 (shape: (150, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-01-27 06:02:00,wxid_70w3p1jin84k22,wxid_nv9bv435fz3722,之前听你说过，年终和提薪不满意，是有可能要走？
2,2022-01-27 06:02:04,wxid_nv9bv435fz3722,wxid_70w3p1jin84k22,多半走了吧
3,2022-01-27 06:02:28,wxid_70w3p1jin84k22,wxid_nv9bv435fz3722,都不再看看么
4,2022-01-27 06:02:32,wxid_nv9bv435fz3722,wxid_70w3p1jin84k22,没什么说的了
...,...,...,...,...
145,2022-02-11 03:20:55,wxid_nv9bv435fz3722,wxid_70w3p1jin84k22,我还要给新人培训？疯了
146,2022-02-11 03:21:03,wxid_70w3p1jin84k22,wxid_nv9bv435fz3722,但是找到人了，也许就可以提前走
147,2022-02-11 03:21:16,wxid_nv9bv435fz3722,wxid_70w3p1jin84k22,现在好像就是在招
148,2022-02-11 03:21:19,wxid_70w3p1jin84k22,wxid_nv9bv435fz3722,说让我等着给新人交接



=== 11_docling.json - table 0 (shape: (126, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-05-27 06:40:45,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,bz.zip
2,2021-05-27 06:40:47,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,万里
3,2021-05-27 06:40:52,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,棒子wj
4,2021-05-27 06:49:26,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,ok
...,...,...,...,...
121,2021-06-24 06:49:06,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,上面端口被占用 一直连着 ip是成都 郫县
122,2021-06-24 06:49:16,wxid_hlmnhsq64tt722,wxid_zb45i0rc71yk21,[破涕为笑]
123,2021-06-24 07:01:38,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,牛逼
124,2021-06-24 07:02:16,wxid_zb45i0rc71yk21,wxid_hlmnhsq64tt722,这TM谁干的



=== 35_docling.json - table 0 (shape: (197, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2020-11-25 02:24:13,wxid_soekgggwnfgm21,wxid_5390224027312,娟，你发给我那个11111香港的哪些可以用呢？
2,2020-11-25 02:24:28,wxid_5390224027312,wxid_soekgggwnfgm21,除了那几个大学
3,2020-11-25 02:24:31,wxid_5390224027312,wxid_soekgggwnfgm21,哪里要
4,2020-11-25 02:24:57,wxid_soekgggwnfgm21,wxid_5390224027312,河源国保
...,...,...,...,...
192,2020-11-27 09:02:04,wxid_soekgggwnfgm21,wxid_5390224027312,链接：https://pan.baidu.com/s/1OngMYY5nldSdgxL5yy...
193,2020-12-01 05:15:47,wxid_5390224027312,wxid_soekgggwnfgm21,我给你发了一个红包，赶紧去拆!
194,2020-12-01 06:21:14,wxid_soekgggwnfgm21,wxid_5390224027312,
195,2020-12-01 08:15:10,wxid_5390224027312,wxid_soekgggwnfgm21,琬茹，他们那边有没有预订信息，就是那种发给客人的



=== 32_docling.json - table 0 (shape: (87, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-06-28 06:39:30,nullroot,tianyi-0608,萍姐
2,2022-06-28 06:39:35,tianyi-0608,nullroot,在
3,2022-06-28 06:39:46,nullroot,tianyi-0608,济南利华瑞特信息技术有限公司
4,2022-06-28 06:39:54,nullroot,tianyi-0608,这家公司，跟我们签订过合同没
...,...,...,...,...
82,2022-06-28 12:55:21,nullroot,tianyi-0608,眼线挺多啊[强][强]
83,2022-06-28 12:56:08,tianyi-0608,nullroot,[尴尬]
84,2022-06-28 12:56:21,tianyi-0608,nullroot,木有的好伐
85,2022-06-28 12:56:25,tianyi-0608,nullroot,碰巧知道的



=== 5_docling.json - table 0 (shape: (18, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-12 08:21:13,wangchao953541,wxid_7p054rmzkhqf21,北约的有没给过苏州哦
2,2022-05-12 08:21:53,wxid_7p054rmzkhqf21,wangchao953541,应该没有吧～不过不确定，我重庆客户说有一部分材料以前也看过
3,2022-05-12 08:22:08,wangchao953541,wxid_7p054rmzkhqf21,我给的南京
4,2022-05-12 08:22:14,wangchao953541,wxid_7p054rmzkhqf21,南京说 是不是给过苏州
5,2022-05-12 08:22:24,wxid_7p054rmzkhqf21,wangchao953541,我们这里没有
6,2022-05-12 08:22:29,wangchao953541,wxid_7p054rmzkhqf21,OK
7,2022-05-12 09:45:30,wxid_7p054rmzkhqf21,wangchao953541,超
8,2022-05-12 09:45:37,wxid_7p054rmzkhqf21,wangchao953541,北约那个不用推了
9,2022-05-12 09:45:48,wxid_7p054rmzkhqf21,wangchao953541,刚说的，和北京谈好了，准备出货了



=== 2_docling.json - table 0 (shape: (501, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-09 07:55:04,wxid_7p054rmzkhqf21,gzp1991101,能不能发一个近一月“外交部东盟司”和“卫生局”的
2,2022-05-09 07:55:08,wxid_7p054rmzkhqf21,gzp1991101,他们关心这两个
3,2022-05-09 08:02:05,gzp1991101,wxid_7p054rmzkhqf21,好吧
4,2022-05-09 08:32:30,gzp1991101,wxid_7p054rmzkhqf21,泰的哈？
...,...,...,...,...
496,2022-07-16 04:32:13,wxid_7p054rmzkhqf21,gzp1991101,嗯嗯
497,2022-07-16 04:32:47,gzp1991101,wxid_7p054rmzkhqf21,卖呗
498,2022-07-16 04:33:36,wxid_7p054rmzkhqf21,gzp1991101,到时候我问问，现在一周大概有多少量
499,2022-07-16 04:33:59,gzp1991101,wxid_7p054rmzkhqf21,



=== 27_docling.json - table 0 (shape: (19, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-17 06:04:33,wxid_zbytkn4qjl3r22,25713010771@chatroom,嵩哥@蓝胖子 ，名字还有变化没有，不然就报批了
2,2022-05-17 06:04:35,wxid_zbytkn4qjl3r22,25713010771@chatroom,渗透测试三组——魁杓攻防研究实验室，\n 魁杓，指北斗七星，出自《淮南子·天文训...
3,2022-05-17 06:05:35,snipersk,25713010771@chatroom,没变化啊，李老板润色的好
4,2022-05-17 06:05:52,wxid_zbytkn4qjl3r22,25713010771@chatroom,没有没有，我只是粘在一起
5,2022-05-17 06:16:08,wxid_zbytkn4qjl3r22,25713010771@chatroom,恭喜嵩哥，三个老板们一致盛赞，恭喜魁杓落成
6,2022-05-17 06:19:41,wxid_zbytkn4qjl3r22,25713010771@chatroom,渗透测试三组——魁杓攻防研究实验室，\n 魁杓，指北斗七星，出自《淮南子·天文训...
7,2022-05-19 09:39:35,wxid_zbytkn4qjl3r22,25713010771@chatroom,渗透测试二组——行式实验室\n 行式（xing二声 shi二声)，原指故宫太和殿屋脊上的排行...
8,2022-05-19 09:39:44,wxid_zbytkn4qjl3r22,25713010771@chatroom,@0x1407 审下
9,2022-05-19 09:40:09,wxid_zbytkn4qjl3r22,25713010771@chatroom,



=== 20_docling.json - table 0 (shape: (69, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-06 12:09:40,wxid_5390224027312,wxid_7p054rmzkhqf21,陆总，英那边上次说的需要那个？外交是首选吗？
2,2022-05-06 12:10:19,wxid_5390224027312,wxid_7p054rmzkhqf21,就是最重要的搞下来确定会要的
3,2022-05-06 12:30:15,wxid_7p054rmzkhqf21,wxid_5390224027312,是的
4,2022-05-06 12:30:18,wxid_7p054rmzkhqf21,wxid_5390224027312,首选
...,...,...,...,...
64,2022-05-11 05:11:46,wxid_5390224027312,wxid_7p054rmzkhqf21,没有
65,2022-05-11 05:11:48,wxid_7p054rmzkhqf21,wxid_5390224027312,美玲说好久没见你了
66,2022-05-11 05:11:52,wxid_7p054rmzkhqf21,wxid_5390224027312,等你来打麻将
67,2022-05-11 05:11:55,wxid_5390224027312,wxid_7p054rmzkhqf21,可约



=== 10_docling.json - table 0 (shape: (150, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2020-03-07 05:14:18,nullroot,wei592628,我是null_whoami
2,2020-03-07 05:16:20,nullroot,wei592628,魏总，大概是哪方面？
3,2020-03-07 05:18:25,nullroot,wei592628,推特有些落查，监k
4,2020-03-07 05:18:58,wei592628,nullroot,那回头我给你找个账户，帮我看一下
...,...,...,...,...
145,2020-05-22 08:28:50,wei592628,nullroot,@dr_allah_nizar
146,2020-05-22 08:29:06,wei592628,nullroot,这个账号，帮查查看看
147,2020-05-22 08:29:11,wei592628,nullroot,谢谢兄弟
148,2020-05-22 09:58:29,nullroot,wei592628,没查到



=== 17_docling.json - table 0 (shape: (56, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-03-07 09:30:32,wxid_70w3p1jin84k22,wxid_5390224027312,杨师被劝退了
2,2022-03-07 09:30:47,wxid_70w3p1jin84k22,wxid_5390224027312,2年多的老员工，公司也是做得出来
3,2022-03-07 09:30:56,wxid_5390224027312,wxid_70w3p1jin84k22,哈，被劝退
4,2022-03-07 09:31:01,wxid_5390224027312,wxid_70w3p1jin84k22,啥情况
5,2022-03-07 09:31:31,wxid_70w3p1jin84k22,wxid_5390224027312,你觉得呢
6,2022-03-07 09:32:44,wxid_5390224027312,wxid_70w3p1jin84k22,哎，看来还是老火啊
7,2022-03-07 09:33:09,wxid_70w3p1jin84k22,wxid_5390224027312,这是杀鸡给侯看，让大家都看看结果
8,2022-03-07 09:35:18,wxid_5390224027312,wxid_70w3p1jin84k22,突然觉得C总好惨哦
9,2022-03-07 09:35:35,wxid_70w3p1jin84k22,wxid_5390224027312,是啊，我都觉得我当时就不该提离职



=== 21_docling.json - table 0 (shape: (1047, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-01 15:34:51,wxid_wh6x59w70y3r22,wxid_5390224027312,[旺柴]走私那个怎么说
2,2022-05-01 15:59:30,wxid_5390224027312,wxid_wh6x59w70y3r22,你别问了
3,2022-05-01 15:59:42,wxid_5390224027312,wxid_wh6x59w70y3r22,我两三年没联系别人
4,2022-05-01 15:59:51,wxid_wh6x59w70y3r22,wxid_5390224027312,[捂脸][捂脸]
...,...,...,...,...
1042,2022-05-25 14:39:01,wxid_wh6x59w70y3r22,wxid_5390224027312,连你一块挖过来
1043,2022-05-25 14:39:09,wxid_5390224027312,wxid_wh6x59w70y3r22,我不想
1044,2022-05-25 14:39:34,wxid_wh6x59w70y3r22,wxid_5390224027312,团队几个人啊？
1045,2022-05-25 14:39:56,wxid_5390224027312,wxid_wh6x59w70y3r22,客户的问题会不会太多了啊



=== 26_docling.json - table 0 (shape: (7, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-24 02:47:59,zhangxiaoyan0422,lengmo,前端销售反馈问题.doc
2,2022-05-24 02:48:28,zhangxiaoyan0422,lengmo,C总 这是昨天销售反馈的问题汇总
3,2022-05-24 04:15:30,lengmo,zhangxiaoyan0422,二楼 帮我拿下外卖
4,2022-05-24 04:16:00,zhangxiaoyan0422,lengmo,好的
5,2022-05-24 04:53:01,zhangxiaoyan0422,lengmo,这个重新发送给你一份，第一个少了一个曹飞的反馈
6,2022-05-24 04:54:29,zhangxiaoyan0422,lengmo,前端销售反馈问题.doc



=== 33_docling.json - table 0 (shape: (32, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-16 03:48:33,wxid_5390224027312,dujijiyiqxx,点外卖没，走去吃花溪牛肉粉
2,2022-05-16 03:49:12,dujijiyiqxx,wxid_5390224027312,小老弟请客
3,2022-05-16 03:49:30,wxid_5390224027312,dujijiyiqxx,好嘛
4,2022-05-16 06:17:27,dujijiyiqxx,wxid_5390224027312,我猜的准撒
5,2022-05-16 06:18:16,wxid_5390224027312,dujijiyiqxx,感觉老王自己开会把自己坑进去了
6,2022-05-16 06:25:26,dujijiyiqxx,wxid_5390224027312,嗯～～～～～～～～
7,2022-05-16 06:25:53,dujijiyiqxx,wxid_5390224027312,我之前就感觉到老吴对老王已经不像原来这么一味信任了
8,2022-05-16 06:26:16,wxid_5390224027312,dujijiyiqxx,你咋感觉的
9,2022-05-16 06:26:29,dujijiyiqxx,wxid_5390224027312,东哥应该在中间起了一定作用



=== 34_docling.json - table 0 (shape: (427, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2018-11-08 01:36:58,qq78263462,wxid_5390224027312,我昨天一天都在忙....
2,2018-11-08 01:37:12,qq78263462,wxid_5390224027312,你好久回呢？
3,2018-11-08 01:37:14,wxid_5390224027312,qq78263462,没事
4,2018-11-08 01:37:19,wxid_5390224027312,qq78263462,我可能要下周
...,...,...,...,...
422,2018-11-08 16:22:57,qq78263462,wxid_5390224027312,你不要跟别的人说起这些哈
423,2018-11-08 16:23:06,wxid_5390224027312,qq78263462,嗯嗯不得
424,2018-11-08 16:23:19,qq78263462,wxid_5390224027312,尤其是一楼的女的......
425,2018-11-08 16:23:21,qq78263462,wxid_5390224027312,哈哈哈哈哈



=== 3_docling.json - table 0 (shape: (24, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-07-05 05:12:48,wxid_7p054rmzkhqf21,wxid_jcnxegjccqi441,JYC.zip
2,2022-07-05 07:36:09,wxid_jcnxegjccqi441,wxid_7p054rmzkhqf21,[捂脸]就两个啊，客户那边说随机拖的话，没关键词，随机也拖10多个，他们好看啊
3,2022-07-05 07:36:42,wxid_jcnxegjccqi441,wxid_7p054rmzkhqf21,你看能不能搞，不行我们也就不推了，难受，帮他们卖感觉还两边舔[捂脸]
4,2022-07-05 07:42:06,wxid_7p054rmzkhqf21,wxid_jcnxegjccqi441,算了，别推了
5,2022-07-05 07:45:11,wxid_jcnxegjccqi441,wxid_7p054rmzkhqf21,好
6,2022-07-06 03:16:48,wxid_jcnxegjccqi441,wxid_7p054rmzkhqf21,昨天给我那个是不是给错了，客户看里面全是英文，像是北约什么的
7,2022-07-06 03:17:12,wxid_7p054rmzkhqf21,wxid_jcnxegjccqi441,确实，搞错了，昨天是北约的
8,2022-07-06 03:17:18,wxid_jcnxegjccqi441,wxid_7p054rmzkhqf21,。。。
9,2022-07-06 03:17:43,wxid_7p054rmzkhqf21,wxid_jcnxegjccqi441,[捂脸]



=== 4_docling.json - table 0 (shape: (514, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-12-02 11:45:00,adpw90,wxid_7p054rmzkhqf21,达赖政府的树收不收 客户
2,2021-12-02 11:46:42,wxid_7p054rmzkhqf21,adpw90,收，我可以问问，有没有小样
3,2021-12-02 11:52:24,adpw90,wxid_7p054rmzkhqf21,内网权限
4,2021-12-02 12:04:47,wxid_7p054rmzkhqf21,adpw90,控不住哦，我这边的用户
...,...,...,...,...
509,2022-07-07 06:26:01,wxid_7p054rmzkhqf21,adpw90,现在就是售前也不知道怎么改，张力总才找您
510,2022-07-10 04:59:56,adpw90,wxid_7p054rmzkhqf21,阿三外交 国防 中央调查局
511,2022-07-10 05:02:24,wxid_7p054rmzkhqf21,adpw90,阿三这几个外面都有，啥价格
512,2022-07-10 05:02:59,adpw90,wxid_7p054rmzkhqf21,[捂脸]看来都有了



=== 39_docling.json - table 0 (shape: (822, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-02-28 07:51:20,wxid_5390224027312,wxid_70w3p1jin84k22,白榆是不是也要离职了，看他在hr办公室[破涕为笑]
2,2022-02-28 07:58:56,wxid_70w3p1jin84k22,wxid_5390224027312,是的
3,2022-02-28 07:59:18,wxid_70w3p1jin84k22,wxid_5390224027312,他去找的hr么
4,2022-02-28 07:59:24,wxid_5390224027312,wxid_70w3p1jin84k22,公司真的留不住人了[捂脸]
...,...,...,...,...
817,2022-03-30 07:10:35,wxid_5390224027312,wxid_70w3p1jin84k22,哈哈哈哈大家都没报希望
818,2022-03-30 07:38:54,wxid_70w3p1jin84k22,wxid_5390224027312,你现在应该在回来路上了吧
819,2022-03-30 07:39:06,wxid_70w3p1jin84k22,wxid_5390224027312,不然你这到家都几点了
820,2022-03-30 07:46:18,wxid_5390224027312,wxid_70w3p1jin84k22,嗯，回来路上了，早上去的



=== 9_docling.json - table 0 (shape: (236, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2021-09-09 05:17:43,wxid_7p054rmzkhqf21,gzp1991101,你回来了么
2,2021-09-09 05:24:13,gzp1991101,wxid_7p054rmzkhqf21,还没
3,2021-09-09 05:24:20,wxid_7p054rmzkhqf21,gzp1991101,明天么
4,2021-09-09 05:24:27,gzp1991101,wxid_7p054rmzkhqf21,这样打算的
...,...,...,...,...
231,2021-09-13 15:59:47,wxid_7p054rmzkhqf21,gzp1991101,a感觉用不到，主要是其他行业，看看找点机会
232,2021-09-13 16:00:04,gzp1991101,wxid_7p054rmzkhqf21,啥时候跳槽
233,2021-09-13 16:00:20,wxid_7p054rmzkhqf21,gzp1991101,快了，我今天提了，估计最近
234,2021-09-13 16:00:59,gzp1991101,wxid_7p054rmzkhqf21,太难搞了



=== 38_docling.json - table 0 (shape: (22, 4)) ===


Unnamed: 0,Time,From,To,Message
0,Time,From,To,Message
1,2022-05-17 06:12:24,wxid_zbytkn4qjl3r22,lengmo,渗透测试三组——魁杓攻防研究实验室，\n 魁杓，指北斗七星，出自《淮南子·天文训...
2,2022-05-17 06:12:39,wxid_zbytkn4qjl3r22,lengmo,三组有内涵，名字可以哈？
3,2022-05-17 06:13:36,lengmo,wxid_zbytkn4qjl3r22,嗯
4,2022-05-17 06:13:43,lengmo,wxid_zbytkn4qjl3r22,要换名字？
5,2022-05-17 06:14:16,wxid_zbytkn4qjl3r22,lengmo,二组和三组，后面都以实验室运作，现在也没了1组，到B栋来了都要换个名字
6,2022-05-17 06:15:18,lengmo,wxid_zbytkn4qjl3r22,哦
7,2022-05-23 03:37:37,lengmo,wxid_zbytkn4qjl3r22,那个售前的事情，我和周伟伟聊了下··
8,2022-05-23 03:38:48,lengmo,wxid_zbytkn4qjl3r22,觉得他接也行，刚好生态管理部和售前也是一条线的，售前这块接触的厂商比较多，也可以反馈。
9,2022-05-23 04:11:27,wxid_zbytkn4qjl3r22,lengmo,和吴总沟通了下，他觉得是有必要，但还是比较顾虑整个团队的磨合和整体稳定性，方案1我们可以同步...



=== Loaded DataFrame Names ===
28_docling.json - table 0
19_docling.json - table 0
29_docling.json - table 0
18_docling.json - table 0
15_docling.json - table 0
40_docling.json - table 0
12_docling.json - table 0
24_docling.json - table 0
23_docling.json - table 0
6_docling.json - table 0
1_docling.json - table 0
36_docling.json - table 0
31_docling.json - table 0
13_docling.json - table 0
41_docling.json - table 0
14_docling.json - table 0
7_docling.json - table 0
30_docling.json - table 0
37_docling.json - table 0
22_docling.json - table 0
16_docling.json - table 0
11_docling.json - table 0
35_docling.json - table 0
32_docling.json - table 0
5_docling.json - table 0
2_docling.json - table 0
27_docling.json - table 0
20_docling.json - table 0
10_docling.json - table 0
17_docling.json - table 0
21_docling.json - table 0
26_docling.json - table 0
33_docling.json - table 0
34_docling.json - table 0
3_docling.json - table 0
4_docling.json - table 0
39_docling.json - table 0
9_docling.jso

# **DATAFRAMEs TO 1 CSV FOR CHAT ANALYSIS**

In [54]:
import os
import json
import pandas as pd

# Folder with JSON files
folder_path = "html_markdown_to_json"

# Final headers from LLM
headers_response = ["Time", "From", "To", "Message"]

# Track all DataFrames for merging
all_dfs = []

# Loop through all JSON files
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        json_path = os.path.join(folder_path, filename)
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                doc = json.load(f)

            tables = doc.get("tables", [])
            if not tables:
                continue

            for i, table in enumerate(tables):
                cells = table.get("data", {}).get("table_cells", [])

                # Organize cells by row index
                rows_dict = {}
                for cell in cells:
                    row_idx = cell.get("start_row_offset_idx")
                    col_idx = cell.get("start_col_offset_idx")
                    text = cell.get("text", "")
                    if row_idx is not None and col_idx is not None:
                        if row_idx not in rows_dict:
                            rows_dict[row_idx] = {}
                        rows_dict[row_idx][col_idx] = text

                # Sort rows and fill missing columns
                sorted_rows = []
                for row_idx in sorted(rows_dict.keys()):
                    row = rows_dict[row_idx]
                    max_col = max(row.keys()) if row else -1
                    row_list = [row.get(col, "") for col in range(max_col + 1)]
                    sorted_rows.append(row_list)

                if not sorted_rows:
                    continue

                # Align header with row width
                header_cols = headers_response[:len(sorted_rows[0])]
                df = pd.DataFrame(sorted_rows, columns=header_cols)

                # Optionally tag source
                df["SourceFile"] = f"{filename} - table {i}"

                # Store the DataFrame
                all_dfs.append(df)

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Merge all into a single DataFrame
if all_dfs:
    merged_df = pd.concat(all_dfs, ignore_index=True)
    merged_df.to_csv("combined_output.csv", index=False)
    print("✅ All data saved to 'combined_output.csv'")
else:
    print("⚠️ No DataFrames were loaded.")

✅ All data saved to 'combined_output.csv'
