# PROMPT Used

## My current code
```python
import re
import csv

def extract_sections(file_path, output_path='extracted_sections.csv'):
    """Split a markdown file into heading-delimited sections and save them as CSV.

    A section starts at a line beginning with one or more ``#`` followed by a
    space and runs up to (not including) the next such line, or to the end of
    the file. Text that precedes the first heading is not exported.

    Args:
        file_path: Path of the UTF-8 markdown file to read.
        output_path: Destination CSV. It receives a single ``Output`` column
            with one row per section. Defaults to ``extracted_sections.csv``
            in the current working directory.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # `^` (together with re.MULTILINE) ties the heading marker to the start of
    # a line, so a "# " in the middle of introductory text (e.g. "C# is ...")
    # cannot open a bogus section. re.DOTALL lets `.+?` span several lines.
    pattern = r'^(#+ .+?)(?=\n#+ |\Z)'
    sections = re.findall(pattern, content, flags=re.DOTALL | re.MULTILINE)

    # One CSV row per section, without surrounding blank lines.
    data = [{'Output': section.strip()} for section in sections]

    # newline='' leaves line-ending handling to the csv module, which matters
    # here because section text contains embedded newlines.
    with open(output_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['Output'])
        writer.writeheader()
        writer.writerows(data)

# Markdown article to split into sections; replace with your actual file path.
file_path = './(tmp) Salary Article MD.md'
# Writes one CSV row per heading-delimited section to `extracted_sections.csv`.
extract_sections(file_path)
```

## From the LangChain docs

The simplest composition is just combining a prompt and model to create a chain that takes user input, adds it to a prompt, passes it to a model, and returns the raw model output.

```
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt template with a single input variable, `foo`.
prompt = ChatPromptTemplate.from_template("tell me a joke about {foo}")
# Chat model constructed with no arguments.
model = ChatOpenAI()
# `|` composes the two: user input is added to the prompt, which is passed to the model.
chain = prompt | model
```

```
# Run the chain, supplying "bears" for the prompt's `{foo}` variable.
chain.invoke({"foo": "bears"})
```

## Rewriting Prompt:

```md
Read my article section:
___START_OF_SECTION___
{article_section}
___END_OF_SECTION___

## YOUR TASK:

Rewrite it in a **completely different writing style**. 

All pieces of information and links must be preserved. 
The heading hierarchy must be preserved: for example, level-2 headings must stay level-2 headings, and so on. 
Tables and bullet points must be written in a different structure.
```

## TASK

Write Python code that reads the output file `extracted_sections.csv` and, for each row (i.e. each value of `Output`), adds two columns:
- `ID`, starting at 1.
- `Input`, which contains GPT-4's reply (obtained via LangChain) to the `Rewriting Prompt` above, where `{article_section}` is the section markdown from the `Output` column.

# The code

In [None]:
import re
import csv
import pandas as pd
from langchain.prompts import load_prompt
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache

# Initialize LangChain
# output_parser = StrOutputParser()
# model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0.7, max_tokens=2000)
# model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3, max_tokens=2000)

# set_llm_cache(SQLiteCache(database_path="../.langchain.db"))

# Load the rewrite prompt and set up the LangChain chain
# rewrite_prompt = load_prompt("PROMPT_REWRITE_STYLE.json")
# chain = rewrite_prompt | model | output_parser

def word_count(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

def _merge_short_sections(sections, max_words):
    """Greedily pack consecutive sections into chunks of fewer than ``max_words`` words."""
    chunks = []
    current = ""
    for section in sections:
        section = section.strip()
        if not current:
            # Nothing pending yet: the section opens a new chunk, whatever its
            # size. This avoids emitting an empty chunk when the very first
            # section is already over the limit.
            current = section
        elif word_count(current) + word_count(section) < max_words:
            # Count the two texts separately: concatenating them first would
            # glue the last word of `current` to the heading marker and
            # under-count by one.
            current = f"{current}\n\n{section}"
        else:
            chunks.append(current)
            current = section
    if current:
        chunks.append(current)
    return chunks


def extract_sections(path_input, path_output, max_words=300):
    """Split a markdown file into sections, group short ones, and save to Excel.

    A section starts at a line beginning with one or more ``#`` followed by a
    space and runs up to the next such line or the end of the file. Text
    before the first heading is not exported. Consecutive sections are joined
    with a blank line while their combined word count stays below
    ``max_words``. A single section that is longer than the limit is kept
    whole as its own chunk.

    Args:
        path_input: Path of the UTF-8 markdown file to read.
        path_output: Path of the Excel workbook to write. It gets the columns
            ``ID`` (1-based chunk number) and ``INPUT: TEXT_1`` (chunk text).
        max_words: Exclusive upper bound on the word count of a merged chunk.
    """
    with open(path_input, 'r', encoding='utf-8') as file:
        content = file.read()

    # `^` (together with re.MULTILINE) ties the heading marker to the start of
    # a line, so a "# " in the middle of introductory text cannot open a bogus
    # section. re.DOTALL lets `.+?` span several lines.
    pattern = r'^(#+ .+?)(?=\n#+ |\Z)'
    sections = re.findall(pattern, content, flags=re.DOTALL | re.MULTILINE)

    chunks = _merge_short_sections(sections, max_words)

    df = pd.DataFrame({
        'ID': list(range(1, len(chunks) + 1)),
        'INPUT: TEXT_1': chunks,
    })
    df.to_excel(path_output, index=False)

# Source markdown article; replace with your actual file path.
path_input = './test_not_v_3.md'
# Excel workbook that receives the grouped sections; it is read back later
# in the notebook via `pd.read_excel(path_output)`.
path_output = 'writing-style-not_v_3.xlsx'
extract_sections(path_input, path_output)

print('Done!')

In [None]:
# Deterministic (temperature 0) model used to evaluate writing style.
model_eval = ChatOpenAI(model="gpt-4-1106-preview", temperature=0.0, max_tokens=2000)
# model_eval = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0, max_tokens=500)

# Defined here because the only other assignment in this notebook is commented
# out; without it the chain below raises NameError on a fresh kernel.
output_parser = StrOutputParser()

# Load the PROMPT_EVAL_WRITING_STYLE and set up the LangChain chain
eval_prompt = load_prompt("PROMPT_EVAL_WRITING_STYLE.json")
chain = eval_prompt | model_eval | output_parser

# Load the markdown file as a single-line string (newlines become spaces).
# The encoding is explicit to match the other file reads in this notebook.
with open('test_not_v.md', 'r', encoding='utf-8') as file:
    md_string = file.read().replace('\n', ' ')

def evaluate_section(row):
    """Send one row's texts through the evaluation chain and return the reply.

    ``row`` is indexed by ``"ID"``, ``"Output"`` and ``"Input"``; ``Output`` is
    passed to the chain as ``TEXT_1`` and ``Input`` as ``TEXT_2``.

    NOTE(review): the workbook written by ``extract_sections`` only has the
    columns ``ID`` and ``INPUT: TEXT_1``, so ``Output``/``Input`` are
    presumably added to the spreadsheet elsewhere. Confirm.
    """
    section_id = row["ID"]
    print(f'Evaluating section {section_id}...')

    # Other pairings tried previously: Input/Output swapped, and either column
    # compared against the full article text in `md_string`.
    payload = {"TEXT_1": row["Output"], "TEXT_2": row["Input"]}
    return chain.invoke(payload)


# For each section, evaluate the writing style
df = pd.read_excel(path_output)
# df = df.drop(columns=['Input'])
df["Evaluation"] = df.apply(evaluate_section, axis=1)

# Extract the score using regex. Example `SCORE: 9` - score is an integer between 0 and 10
pattern = r"SCORE: (\d+)"
# A reply without a `SCORE: n` match yields NaN, which a plain `astype(int)`
# rejects with an error. The nullable Int64 dtype keeps such rows as <NA>
# instead of aborting after every evaluation has already been run.
df["Score"] = pd.to_numeric(
    df["Evaluation"].str.extract(pattern, expand=False),
    errors="coerce",
).astype("Int64")

df