# Sentiment classification (local, VS Code + Jupyter)

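This notebook reads a CSV (default: `df_joined_sample.csv`), keeps the rows whose `random_order` is between 1 and 50, classifies each row’s text as positive, neutral, or negative with a **local Ollama model** via **LangChain**, and writes the result to `df_joined_sample_sentiment.csv`.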

**Prereqs (outside notebook):**
- Install Ollama and pull a model, e.g.: `ollama pull gpt-oss:20b` (this model needs about 8 GiB of free system memory; pull a smaller model if you have less)
- Create/activate a Python venv in this folder, then install: `pip install -U pandas numpy langchain-ollama langchain-core`

If you change the model name, edit `MODEL_NAME` below.


In [6]:
# --- Imports & configuration ---
from pathlib import Path
import re
import time

import pandas as pd

from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

# Model + data config
MODEL_NAME = "gpt-oss:20b"   # any model already pulled into Ollama, e.g. "llama3.1"
INPUT_CSV  = Path("df_joined_sample.csv")
OUTPUT_CSV = Path("df_joined_sample_sentiment.csv")


# Quick sanity check: does the input file exist?
# Failing here gives a readable hint instead of an error from deep inside read_csv.
# The hint names the configured file, so it stays correct when INPUT_CSV is edited.
if not INPUT_CSV.exists():
    raise FileNotFoundError(
        f"Can't find {INPUT_CSV.resolve()}\n"
        f"Put {INPUT_CSV.name} in the same folder as this notebook, or change INPUT_CSV."
    )

# Optional: if your Ollama server is not the default, you can set:
# import os
# os.environ['OLLAMA_HOST'] = 'http://127.0.0.1:11434'  # default is usually this


In [7]:
# --- Load data ---
# latin1 maps every byte to a character, so decoding never raises; if the file is
# really in another encoding the symptom is garbled text, not an exception.
df = pd.read_csv(INPUT_CSV, encoding="latin1")

# Ensure we have a stable row id / progress counter BEFORE filtering on it.
# (This check used to run after the filter below, so a file without a
# "random_order" column died with a KeyError and the fallback never ran.)
if "random_order" not in df.columns:
    # 1-based, so the 1..50 window below keeps the first 50 rows of the file.
    df = df.assign(random_order=range(1, len(df) + 1))

# Keep only the rows numbered 1..50, processed in that order.
df = df[df["random_order"].between(1, 50)].sort_values("random_order")

# Try to locate a text column: prefer the conventional names, "txt" first.
preferred_text_cols = [c for c in ("txt", "text") if c in df.columns]
if preferred_text_cols:
    TEXT_COL = preferred_text_cols[0]
else:
    # fallback: pick the first object/string-like column
    # (is_string_dtype also covers pandas' dedicated string dtypes, which do not
    # compare equal to "object")
    text_candidates = [
        c for c in df.columns
        if df[c].dtype == "object" or pd.api.types.is_string_dtype(df[c].dtype)
    ]
    if not text_candidates:
        raise ValueError(
            f"No obvious text column found. Columns are: {list(df.columns)}\n"
            "Rename your text column to 'txt' or edit TEXT_COL logic."
        )
    TEXT_COL = text_candidates[0]

# Work on an independent copy so later cells can add columns without touching df.
df_joined_sample = df.copy()

# Cell output: (rows, cols), the chosen text column, and every column name.
df_joined_sample.shape, TEXT_COL, df_joined_sample.columns.tolist()


((50, 18),
 'text',
 ['text',
  'source',
  'source_clean',
  'doc_id',
  'words',
  'doc_date',
  'title',
  'par_id',
  'doc_date1',
  'date_clean',
  'date_clean1',
  'outside_range',
  'mentions_france_paris',
  'Name',
  'Country',
  'Continent',
  'Type_newspaper',
  'random_order'])

In [8]:
# --- Build prompt + model chain ---
# System message: states the task, the three allowed labels, and the one-word
# output format. It contains no "{...}" placeholders; the only template variable
# in this cell is {text} in user_prompt below.
system_prompt = """You are a professional sentiment analysis system.

Your task: classify the emotional tone of the paragraph below by **language/tone only**.
Do NOT infer sentiment from the *event* being described (wins, losses, crises, success, etc.).
Focus on evaluative wording (approval/criticism), emotional tone, and affect.

Labels:
- positive: enthusiasm, approval, joy, relief, celebration, admiration, uplifting tone
- negative: criticism, disappointment, concern, frustration, anger, tension, discouraging tone
- neutral: mostly factual/descriptive/balanced/flat, no clear evaluative language

Output rules:
- Reply with exactly ONE word: positive, neutral, or negative.
- No punctuation, no explanation, no extra text.
"""

# Human message: {text} is filled from the dict passed to chain.invoke({"text": ...}).
# It ends on "Label:" so the model's reply is expected to be the label itself.
user_prompt = """Paragraph:
{text}

Label:"""

# Two-message chat template: the system instructions, then the human turn.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", user_prompt),
])

# OllamaLLM talks to the local Ollama server (default http://127.0.0.1:11434)
# temperature=0: presumably chosen so repeated runs give the same label — confirm
# against the Ollama/LangChain docs for the model in use.
model = OllamaLLM(model=MODEL_NAME, temperature=0)

# Pipe the template into the model: chain.invoke(...) renders the prompt and sends it.
# classify_sentiment treats the value it returns as a plain string (it calls .strip()).
chain = prompt | model

# Quick connectivity test:
# This sends one real request, so it raises if the server or model cannot be used
# (the saved output below is a ResponseError: the model needed more memory than was free).
print(chain.invoke({"text": "The government welcomed the decision and praised the team for its excellent work."}))


ResponseError: model requires more system memory (8.0 GiB) than is available (6.1 GiB) (status code: 500)

In [10]:
# --- Classifier helper ---
VALID = {"positive", "neutral", "negative"}

# Built from VALID so the accepted labels are defined in exactly one place.
_LABEL_RE = re.compile(r"\b(" + "|".join(sorted(VALID)) + r")\b")


def classify_sentiment(text, retries=3, sleep_seconds=1.5):
    """Label one paragraph as "positive", "neutral" or "negative".

    Parameters
    ----------
    text : object
        The paragraph to classify. Missing values (None / NaN / pd.NA) are
        labelled "neutral" without calling the model; anything else is
        converted with ``str()`` and sent through ``chain``.
    retries : int, default 3
        How many extra attempts to make after a failed model call, so the
        model is called at most ``retries + 1`` times.
    sleep_seconds : float, default 1.5
        Pause between a failed attempt and the next one.

    Returns
    -------
    str
        One of the labels in ``VALID``, or "error" when the reply contains none
        of them or every attempt raised. A string is always returned.

    Notes
    -----
    When every attempt raises, the last exception is reported through
    ``warnings.warn`` (RuntimeWarning) so an unreachable server or a model that
    cannot load is visible, instead of silently producing a column of "error".
    """
    import warnings  # local import keeps this cell self-contained

    if pd.isna(text):
        return "neutral"
    text = str(text)

    last_error = None
    for attempt in range(retries + 1):
        try:
            out = chain.invoke({"text": text})
        except Exception as exc:
            last_error = exc
            # No point sleeping after the final attempt.
            if attempt < retries:
                time.sleep(sleep_seconds)
            continue

        # If the model adds extra words, extract the first valid label we see.
        # An unparseable reply is not retried: it returns "error" straight away.
        m = _LABEL_RE.search(str(out).strip().lower())
        return m.group(1) if m else "error"

    # Reached when every attempt raised, or when retries < 0 (no attempt made).
    if last_error is not None:
        warnings.warn(
            f"classify_sentiment: giving up after {retries + 1} attempt(s): {last_error!r}",
            RuntimeWarning,
            stacklevel=2,
        )
    return "error"


In [None]:
# --- Run classification ---
# Label the rows one at a time, in frame order, echoing "<row id>/<largest id> → <label>"
# after each model call so a long run shows progress.
max_order = df_joined_sample["random_order"].max()
results = []

order_and_text = zip(df_joined_sample["random_order"], df_joined_sample[TEXT_COL])
for order, paragraph in order_and_text:
    sentiment = classify_sentiment(paragraph)
    results.append(sentiment)
    print(f"{order}/{max_order} → {sentiment}")

# results is in the same order as the frame's rows, so a plain assignment lines up.
df_joined_sample["sentiment"] = results

# Cell output: a peek at the id, the text and its label.
df_joined_sample[["random_order", TEXT_COL, "sentiment"]].head()


In [12]:
# --- Save output ---
# index=False leaves the DataFrame index out of the written file.
df_joined_sample.to_csv(OUTPUT_CSV, index=False)

# Report the absolute location so the file is easy to find afterwards.
saved_path = OUTPUT_CSV.resolve()
print(f"Wrote: {saved_path}")


Wrote: C:\Users\duera\Downloads\df_joined_sample_sentiment.csv
