In [36]:
import pandas as pd
import json
from tqdm import tqdm

# Load CSV
df = pd.read_csv("reddit_greenwashing_posts.csv")

# df = df.sample(20)
# print(df.head())
# Fill NaNs
df['Article_Content'] = df['Article_Content'].fillna('')


In [37]:
import subprocess

def is_relevant_llm(text, model="llama3.2"):
    prompt = f"""You're an assistant helping analyze online posts for greenwashing. 
Is the following Reddit post about greenwashing or related to corporate sustainability claims?

Respond only with YES or NO.

Post:
{text}
"""
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt.encode("utf-8"),
        stdout=subprocess.PIPE
    )
    out = result.stdout.decode("utf-8").strip().lower()
    return 'yes' in out


In [38]:
from tqdm import tqdm

# Run LLM-based relevance filter (labeling step)
tqdm.pandas()
df["is_relevant"] = df["Article_Content"].progress_apply(is_relevant_llm)

# This df now has all posts, labeled
labeled_df = df.copy()

# If needed: subset of only relevant posts (for use in indexing etc.)
filtered_df = labeled_df[labeled_df["is_relevant"] == True].reset_index(drop=True)


100%|██████████| 92/92 [03:08<00:00,  2.05s/it][?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠇ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?

In [39]:
labeled_df

Unnamed: 0,Article_Id,Article_Url,Post_id,Article_Title,Article_Content,is_relevant
0,1,http://www.greenwashingindex.com/about-greenwa...,1gln79,ABOUT GREENWASHING - GreenWashing Index,Everyone’s heard the expression “whitewashing”...,False
1,2,http://en.wikipedia.org/wiki/Dean_Foods#Products,1glqou,Dean Foods - Wikipedia,Dean Foods was an American food and beverage c...,False
2,3,http://www.motherjones.com/environment/2015/04...,34bigd,Starbucks Wants You to Feel Good About Drinkin...,,False
3,4,https://www.civilbeat.org/2018/07/is-wireless-...,8xsb69,Is Wireless Phone Giant Using Hawaii To Greenw...,News That Matters Support usAn environmental g...,True
4,5,https://www.naturalnews.com/2018-11-05-organic...,9uq7ll,"Organic JUNK food? You can waste your money, w...",Canola oil is the cheapest oil on the market a...,False
...,...,...,...,...,...,...
87,88,https://sustainabilitymag.com/articles/the-sta...,1ggy7hr,The State of Greenwashing Around the World | S...,"Greenwashing has enormous implications, both l...",False
88,89,https://www.bbc.co.uk/news/articles/crmzvdn9e18o,1gmd0az,COP29 chief secretly filmed promoting fossil f...,A senior official at COP29 climate change conf...,False
89,90,https://groundtruth.app/brazils-9-billion-refo...,1go6ng4,Brazil’s $9 Billion Reforestation Plan🌱💸,"If managed well, Brazil’s ambitious plan could...",False
90,91,https://groundtruth.app/do-we-have-enough-land...,1gqdyjk,Do We Have Enough Land to Save the Planet? 🌎💚,Scholars from MIT and Shell have thoughts.Gurg...,False


In [40]:
from transformers import pipeline

# Load NER pipeline
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

def extract_company(text):
    entities = ner(text)
    orgs = [e['word'] for e in entities if e['entity_group'] == 'ORG']
    return orgs  # Returns a list of all detected organization names



Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [30]:
filtered_df["company_name"] = filtered_df["Article_Content"].progress_apply(extract_company)

100%|██████████| 15/15 [00:00<00:00, 40.77it/s]


In [41]:
labeled_df["company_name"] = labeled_df["Article_Content"].progress_apply(extract_company)

100%|██████████| 92/92 [00:01<00:00, 47.63it/s]


In [31]:
filtered_df

Unnamed: 0,Article_Id,Article_Url,Post_id,Article_Title,Article_Content,is_relevant,company_name
0,1,http://www.greenwashingindex.com/about-greenwa...,1gln79,ABOUT GREENWASHING - GreenWashing Index,Everyone’s heard the expression “whitewashing”...,True,[]
1,4,https://www.civilbeat.org/2018/07/is-wireless-...,8xsb69,Is Wireless Phone Giant Using Hawaii To Greenw...,News That Matters Support usAn environmental g...,True,"[News That Matters Support usA, Verizon Commun..."
2,11,https://www.greentechmedia.com/articles/read/n...,euxqlw,Where Does the Natural Gas 'Bridge' End? | Gre...,ABU DHABI — The role of natural gas is one of ...,True,"[ABU DHABI, Tellurian, Atlantic Council, Inter..."
3,16,https://www.bbc.co.uk/news/science-environment...,ew84dk,Climate change: Worst emissions scenario 'exce...,The worst-case scenario for emissions of CO2 t...,True,"[Intergovernmental Panel on Climate Change, IP..."
4,24,https://www.aei.org/carpe-diem/there-is-no-cli...,f0n4l6,"There Is No Climate Emergency, Say 500 Experts...","By Mark J. PerryCarpe DiemOctober 01, 2019The ...",True,"[##pe Die, Friends of Science, United Nations,..."
5,40,https://www.manhattan-institute.org/if-you-wan...,f967zu,"If You Want ‘Renewable Energy,’ Get Ready to D...",Building one wind turbine requires 900 tons of...,True,"[International Renewable Energy Agency, Intern..."
6,52,https://www.themoscowtimes.com/2020/05/14/russ...,gjwh0b,Russia’s Trash-Burning Plants Could Fuel Unres...,Russia’s ambitious plans to build 25 waste-bur...,True,"[Greenpeace Russia, R, ##ostec, Rosatom, VEB, ..."
7,56,https://robbreport.com/motors/marine/feadship-...,moxgkx,Feadship's Escape Gigayacht Comes With a Tree ...,Few yachts are designed with an eco-conscious ...,True,"[Feads, Feadship, Feads, Robb Report]"
8,57,https://www.cnn.com/2021/04/20/business/taco-b...,muyg5i,Taco Bell will start reusing hot sauce packets...,Taco Bell’s plastic sauce packets have a short...,True,"[Taco Bell, TerraCycle, Taco Bell, Taco Bell, ..."
9,61,https://illuminem.com/energyvoices/9f84f39a-91...,qv31no,Will the Glasgow Declaration save the world’s ...,By Ashish KothariNov 15 2021 · 6 min readOn 2n...,True,"[Peoples, Bo, World, Organisation, World Bank,..."


In [42]:
labeled_df

Unnamed: 0,Article_Id,Article_Url,Post_id,Article_Title,Article_Content,is_relevant,company_name
0,1,http://www.greenwashingindex.com/about-greenwa...,1gln79,ABOUT GREENWASHING - GreenWashing Index,Everyone’s heard the expression “whitewashing”...,False,[]
1,2,http://en.wikipedia.org/wiki/Dean_Foods#Products,1glqou,Dean Foods - Wikipedia,Dean Foods was an American food and beverage c...,False,"[Dean Foods, Dean Foods, Dean, DairyPure, TruM..."
2,3,http://www.motherjones.com/environment/2015/04...,34bigd,Starbucks Wants You to Feel Good About Drinkin...,,False,[]
3,4,https://www.civilbeat.org/2018/07/is-wireless-...,8xsb69,Is Wireless Phone Giant Using Hawaii To Greenw...,News That Matters Support usAn environmental g...,True,"[News That Matters Support usA, Verizon Commun..."
4,5,https://www.naturalnews.com/2018-11-05-organic...,9uq7ll,"Organic JUNK food? You can waste your money, w...",Canola oil is the cheapest oil on the market a...,False,"[USDA, MS]"
...,...,...,...,...,...,...,...
87,88,https://sustainabilitymag.com/articles/the-sta...,1ggy7hr,The State of Greenwashing Around the World | S...,"Greenwashing has enormous implications, both l...",False,"[RepRisk, Greentech Alliance, RepRisk, EU, Cap..."
88,89,https://www.bbc.co.uk/news/articles/crmzvdn9e18o,1gmd0az,COP29 chief secretly filmed promoting fossil f...,A senior official at COP29 climate change conf...,False,"[COP29, BBC, COP29, UN, BBC, CO, COP29, Socar,..."
89,90,https://groundtruth.app/brazils-9-billion-refo...,1go6ng4,Brazil’s $9 Billion Reforestation Plan🌱💸,"If managed well, Brazil’s ambitious plan could...",False,"[##J, ##zan]"
90,91,https://groundtruth.app/do-we-have-enough-land...,1gqdyjk,Do We Have Enough Land to Save the Planet? 🌎💚,Scholars from MIT and Shell have thoughts.Gurg...,False,"[MIT, Shell, Front, Environ, Sc, ##lo, Paltsev..."


In [43]:
import re
def extract_techniques(text, model="llama3.2"):
    prompt = f"""You're a greenwashing expert. Read the post below and list any greenwashing techniques it describes.

Output as a JSON list of strings (e.g., ["carbon offsets", "green packaging"]).

Post:
{text}
"""
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt.encode("utf-8"),
        stdout=subprocess.PIPE
    )
    try:
        out = result.stdout.decode("utf-8")
        match = re.search(r'\[.*?\]', out, re.DOTALL)
        return json.loads(match.group(0)) if match else []
    except Exception:
        return []


In [32]:
filtered_df["techniques"] = filtered_df["Article_Content"].progress_apply(extract_techniques)
filtered_df

100%|██████████| 15/15 [00:26<00:00,  1.75s/it][?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠇ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?2

Unnamed: 0,Article_Id,Article_Url,Post_id,Article_Title,Article_Content,is_relevant,company_name,techniques
0,1,http://www.greenwashingindex.com/about-greenwa...,1gln79,ABOUT GREENWASHING - GreenWashing Index,Everyone’s heard the expression “whitewashing”...,True,[],"[Green marketing, Advertising and marketing cl..."
1,4,https://www.civilbeat.org/2018/07/is-wireless-...,8xsb69,Is Wireless Phone Giant Using Hawaii To Greenw...,News That Matters Support usAn environmental g...,True,"[News That Matters Support usA, Verizon Commun...",[using scenic imagery to create a false impres...
2,11,https://www.greentechmedia.com/articles/read/n...,euxqlw,Where Does the Natural Gas 'Bridge' End? | Gre...,ABU DHABI — The role of natural gas is one of ...,True,"[ABU DHABI, Tellurian, Atlantic Council, Inter...","[carbon offsets, renewable energy penetration ..."
3,16,https://www.bbc.co.uk/news/science-environment...,ew84dk,Climate change: Worst emissions scenario 'exce...,The worst-case scenario for emissions of CO2 t...,True,"[Intergovernmental Panel on Climate Change, IP...",[]
4,24,https://www.aei.org/carpe-diem/there-is-no-cli...,f0n4l6,"There Is No Climate Emergency, Say 500 Experts...","By Mark J. PerryCarpe DiemOctober 01, 2019The ...",True,"[##pe Die, Friends of Science, United Nations,...","[climate models, carbon sequestration]"
5,40,https://www.manhattan-institute.org/if-you-wan...,f967zu,"If You Want ‘Renewable Energy,’ Get Ready to D...",Building one wind turbine requires 900 tons of...,True,"[International Renewable Energy Agency, Intern...","[carbon offsets, green packaging, magical ener..."
6,52,https://www.themoscowtimes.com/2020/05/14/russ...,gjwh0b,Russia’s Trash-Burning Plants Could Fuel Unres...,Russia’s ambitious plans to build 25 waste-bur...,True,"[Greenpeace Russia, R, ##ostec, Rosatom, VEB, ...","[carbon offsets, green packaging, waste-to-ene..."
7,56,https://robbreport.com/motors/marine/feadship-...,moxgkx,Feadship's Escape Gigayacht Comes With a Tree ...,Few yachts are designed with an eco-conscious ...,True,"[Feads, Feadship, Feads, Robb Report]","[carbon offsets, green packaging, sustainable ..."
8,57,https://www.cnn.com/2021/04/20/business/taco-b...,muyg5i,Taco Bell will start reusing hot sauce packets...,Taco Bell’s plastic sauce packets have a short...,True,"[Taco Bell, TerraCycle, Taco Bell, Taco Bell, ...","[Partnership with TerraCycle, Claiming to have..."
9,61,https://illuminem.com/energyvoices/9f84f39a-91...,qv31no,Will the Glasgow Declaration save the world’s ...,By Ashish KothariNov 15 2021 · 6 min readOn 2n...,True,"[Peoples, Bo, World, Organisation, World Bank,...","[catch-all phrase, vagueness, loopholes, lack ..."


In [44]:
labeled_df["techniques"] = labeled_df["Article_Content"].progress_apply(extract_techniques)
labeled_df

100%|██████████| 92/92 [03:49<00:00,  2.50s/it][?25l[1G⠙ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?25l[?25h[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?2

Unnamed: 0,Article_Id,Article_Url,Post_id,Article_Title,Article_Content,is_relevant,company_name,techniques
0,1,http://www.greenwashingindex.com/about-greenwa...,1gln79,ABOUT GREENWASHING - GreenWashing Index,Everyone’s heard the expression “whitewashing”...,False,[],"[Misleading advertising and marketing, Exagger..."
1,2,http://en.wikipedia.org/wiki/Dean_Foods#Products,1glqou,Dean Foods - Wikipedia,Dean Foods was an American food and beverage c...,False,"[Dean Foods, Dean Foods, Dean, DairyPure, TruM...",[]
2,3,http://www.motherjones.com/environment/2015/04...,34bigd,Starbucks Wants You to Feel Good About Drinkin...,,False,[],[]
3,4,https://www.civilbeat.org/2018/07/is-wireless-...,8xsb69,Is Wireless Phone Giant Using Hawaii To Greenw...,News That Matters Support usAn environmental g...,True,"[News That Matters Support usA, Verizon Commun...","[greenwashing, misleading viewers about compan..."
4,5,https://www.naturalnews.com/2018-11-05-organic...,9uq7ll,"Organic JUNK food? You can waste your money, w...",Canola oil is the cheapest oil on the market a...,False,"[USDA, MS]",[]
...,...,...,...,...,...,...,...,...
87,88,https://sustainabilitymag.com/articles/the-sta...,1ggy7hr,The State of Greenwashing Around the World | S...,"Greenwashing has enormous implications, both l...",False,"[RepRisk, Greentech Alliance, RepRisk, EU, Cap...",[]
88,89,https://www.bbc.co.uk/news/articles/crmzvdn9e18o,1gmd0az,COP29 chief secretly filmed promoting fossil f...,A senior official at COP29 climate change conf...,False,"[COP29, BBC, COP29, UN, BBC, CO, COP29, Socar,...",[]
89,90,https://groundtruth.app/brazils-9-billion-refo...,1go6ng4,Brazil’s $9 Billion Reforestation Plan🌱💸,"If managed well, Brazil’s ambitious plan could...",False,"[##J, ##zan]",[]
90,91,https://groundtruth.app/do-we-have-enough-land...,1gqdyjk,Do We Have Enough Land to Save the Planet? 🌎💚,Scholars from MIT and Shell have thoughts.Gurg...,False,"[MIT, Shell, Front, Environ, Sc, ##lo, Paltsev...",[]


In [33]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

model = SentenceTransformer("thenlper/gte-base")

chroma_client = chromadb.Client(Settings(
    persist_directory="./reddit_chroma",
    anonymized_telemetry=False
))
collection = chroma_client.get_or_create_collection(name="reddit_posts")

for i, row in filtered_df.iterrows():
    text = row['Article_Content']
    
    # Clean techniques and companies (flatten lists if needed)
    techniques = ", ".join(row["techniques"]) if isinstance(row["techniques"], list) else str(row["techniques"])
    companies = ", ".join(row["company_name"]) if isinstance(row["company_name"], list) else str(row["company_name"])
    
    meta = {
        "post_id": row["Post_id"],
        "url": row["Article_Url"],
        "company": companies,         # now stores multiple companies as comma-separated string
        "techniques": techniques
    }

    embedding = model.encode(text)
    
    collection.add(
        documents=[text],
        metadatas=[meta],
        ids=[str(row["Post_id"])],
        embeddings=[embedding]
    )


Add of existing embedding ID: f0n4l6
Insert of existing embedding ID: f0n4l6
Add of existing embedding ID: f967zu
Insert of existing embedding ID: f967zu
Add of existing embedding ID: gjwh0b
Insert of existing embedding ID: gjwh0b
Add of existing embedding ID: moxgkx
Insert of existing embedding ID: moxgkx
Add of existing embedding ID: qv31no
Insert of existing embedding ID: qv31no
Add of existing embedding ID: wejgqv
Insert of existing embedding ID: wejgqv
Add of existing embedding ID: 1gr7xos
Insert of existing embedding ID: 1gr7xos


NameError: name 'query_emb' is not defined