In [1]:
import os
import re
import pandas as pd

# Timestamp layout matched in the export: "D.MM.YYYY HH:MM:SS" (day may be one
# or two digits). The dots are escaped so only literal '.' separators match;
# an unescaped '.' would also accept e.g. "21-09-2021" and then break the
# later split on '.'.
_TIMESTAMP_RE = r'\d{1,2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}'

# One message: "[timestamp] ~ sender: text". The text may span several lines
# and ends right before the next line that starts with "[timestamp]", or at
# the end of the input.
_MESSAGE_PATTERN = re.compile(
    r'\[(' + _TIMESTAMP_RE + r')\] ~\s*([^:]+): (.*?)'
    r'(?=\n\[' + _TIMESTAMP_RE + r'\]|$)',
    re.DOTALL,
)


def _parse_chat_text(content):
    """Turn the raw text of one chat export into a list of message records."""
    # Group name: whatever sits between the first ']' and the next ':'.
    # NOTE(review): this presumably hits the export's header line - confirm
    # against the actual files.
    group_match = re.search(r'\](.*?):', content)
    group_name = group_match.group(1).strip() if group_match else None

    records = []
    for found in _MESSAGE_PATTERN.finditer(content):
        timestamp, sender, message = found.groups()
        date_part, time_part = timestamp.split()
        day, month, year = map(int, date_part.split('.'))
        hour, minute, second = map(int, time_part.split(':'))

        records.append({
            'Name of the sender': sender.strip(),
            'day': day,
            'month': month,
            'year': year,
            'hour': hour,
            'minute': minute,
            'second': second,
            # Length is measured after trimming surrounding whitespace.
            'message length': len(message.strip()),
            'group name': group_name
        })

    return records


def parse_chat_file(file_path):
    """Parse one exported chat text file into per-message records.

    Only lines of the form ``[D.MM.YYYY HH:MM:SS] ~ sender: text`` are
    captured; lines whose sender is not prefixed with ``~`` are skipped, and
    lines with any other timestamp layout are not treated as message starts.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded chat export.

    Returns
    -------
    list of dict
        One dict per matched message with the keys 'Name of the sender',
        'day', 'month', 'year', 'hour', 'minute', 'second',
        'message length' and 'group name'. Empty if nothing matches.
    """
    with open(file_path, 'r', encoding='utf-8') as chat_file:
        content = chat_file.read()

    return _parse_chat_text(content)

def create_dataframe(directory_path):
    """Parse every ``.txt`` file in a directory into one DataFrame.

    Parameters
    ----------
    directory_path : str
        Directory whose entries ending in '.txt' are passed to
        ``parse_chat_file``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed message, files concatenated in sorted path order.
        Empty if the directory has no '.txt' entries or nothing was parsed.
    """
    # os.listdir gives entries in arbitrary order; sorting keeps the row
    # order identical from one run (and one machine) to the next.
    file_paths = sorted(
        os.path.join(directory_path, entry)
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    )

    # Collect all records first and build the frame once at the end.
    records = []
    for file_path in file_paths:
        records.extend(parse_chat_file(file_path))

    return pd.DataFrame(records)

# Directory containing the exported chat .txt files. Set the CHAT_DATA_DIR
# environment variable to point at your own copy; when it is unset, the
# original local path below is used.
directory_path = os.environ.get('CHAT_DATA_DIR', '/Users/bahadir/Downloads/210_data/chats')
df = create_dataframe(directory_path)

# Display the DataFrame (pandas abbreviates long frames when printed)
print(df)


      Name of the sender  day  month  year  hour  minute  second  \
0                Su Geek   21      9  2021    13      50      51   
1             Berkay Koç    2      3  2023    17      31      53   
2                    Eda    2      3  2023    17      36      46   
3             Arda Şafak    2      3  2023    17      42      38   
4             OytunKuday    2      3  2023    17      49       8   
...                  ...  ...    ...   ...   ...     ...     ...   
34178              Bensu   17      5  2023    16      14      14   
34179              Bensu   17      5  2023    16      14      23   
34180        Ceyda Çağcı   17      5  2023    16      14      35   
34181         Efe Ballar   17      5  2023    16      18      10   
34182         Efe Ballar   17      5  2023    16      18      15   

       message length                  group name  
0                  29  Sabancı Management Society  
1                  51  Sabancı Management Society  
2                  44  Saba