In [None]:
import os
import pandas as pd
import re
from datetime import datetime

In [None]:
def split_into_sentences(text):
    """Split free-form text into a list of trimmed, non-empty sentences.

    A boundary is any of:
      * whitespace that directly follows ``.``, ``!`` or ``?``;
      * two or more consecutive newlines;
      * a run of three or more whitespace characters.

    The text is split directly on those boundaries instead of inserting a
    ``|`` sentinel and splitting on it, so a literal ``|`` in the text stays
    inside its sentence rather than cutting the sentence in two.

    Args:
        text: The string to split.

    Returns:
        list[str]: The sentences in their original order, each stripped of
        surrounding whitespace. Fragments that are empty after stripping are
        dropped, so an empty or whitespace-only input yields ``[]``.
    """
    # Alternatives are tried left to right at each position, and every one of
    # them consumes at least one character, so the split never matches empty.
    boundary = r'(?<=[.!?])\s+|\n{2,}|\s{3,}'
    fragments = (piece.strip() for piece in re.split(boundary, text))
    return [fragment for fragment in fragments if fragment]

In [None]:
def process_journal_files(folder_path, output_path='journal_entries.csv'):
    """Convert a folder of timestamp-named journal text files into a sentence-level CSV.

    Every ``*.txt`` file directly inside ``folder_path`` is read as UTF-8, split
    into sentences with ``split_into_sentences`` and written out as one row per
    sentence. The part of each file name before the first ``.`` must match
    ``%Y-%m-%d_%H-%M-%S-%f``.

    Args:
        folder_path: Directory that holds the journal ``.txt`` files.
        output_path: Destination of the ``|``-separated CSV. Defaults to
            ``'journal_entries.csv'`` in the current working directory.

    Returns:
        None. The result is written to ``output_path`` and a one-line summary
        is printed.

    Raises:
        ValueError: If a ``.txt`` file name does not start with a timestamp in
            the expected format; the message names the offending file.
    """
    # Declared up front so the CSV always gets a header row, even when the
    # folder holds no .txt files and no records are produced.
    columns = ['doc_id', 'date', 'year', 'month', 'day', 'hour', 'sentence', 'text']
    timestamp_format = '%Y-%m-%d_%H-%M-%S-%f'

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the CSV reproducible from run to run.
    txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))

    data = []
    for filename in txt_files:
        # The timestamp is everything before the first dot of the file name.
        datetime_str = filename.split('.')[0]
        try:
            dt = datetime.strptime(datetime_str, timestamp_format)
        except ValueError as exc:
            # Same exception type as before, but now it says which file failed.
            raise ValueError(
                f"Cannot parse a timestamp from file name {filename!r}; "
                f"expected the format {timestamp_format!r} before the first '.'"
            ) from exc

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as fh:
            content = fh.read()

        # One record per sentence; 'sentence' is its 1-based position in the file.
        for idx, sentence in enumerate(split_into_sentences(content), 1):
            data.append({
                'doc_id': filename,
                'date': dt.date(),
                'year': dt.year,
                'month': dt.month,
                'day': dt.day,
                'hour': dt.hour,
                'sentence': idx,
                'text': sentence
            })

    # Build the frame once from the collected records and write it out.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_path, index=False, sep='|')
    print(f"Processed {len(txt_files)} files and created {len(data)} sentence entries.")

In [None]:
# Folder with the journal .txt files; a relative path is resolved against
# the kernel's current working directory.
folder_path = "Phone_Journal"

# Writes journal_entries.csv and prints a one-line summary of what was processed.
process_journal_files(folder_path=folder_path)

Processed 5 files and created 25 sentence entries.
