# Text Processing and Sentiment Analysis

## Research Question 1
Do ESG-focused communications influence consumer sentiment and trust?

Hypotheses:
- H1: ESG ads positively impact consumer trust 
- H1a: Inauthentic ESG ads create negative sentiment (backlash)

Methodology:
1. Text preprocessing (cleaning, translation)
2. RoBERTa sentiment classification
3. BART zero-shot refinement for low-confidence neutrals
4. Purchase intent detection
5. Statistical analysis

In [None]:
import os
import re
import string
import numpy as np
import pandas as pd
from transformers import pipeline
from langdetect import detect, DetectorFactory
import matplotlib.pyplot as plt
import seaborn as sns

# Settings
# Set before any tokenizer is created; presumably switches off the Hugging Face
# tokenizers' internal parallelism -- confirm against the library docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Pin langdetect's seed so repeated runs give the same language guesses
# (assumes DetectorFactory.seed controls its randomness -- TODO confirm).
DetectorFactory.seed = 0

# Load the raw comments. Later cells read the "text" and "likes" columns,
# so the CSV has to provide both.
df = pd.read_csv("campaign_comments.csv")



In [None]:
## text cleaning

URL_RE = re.compile(r"http\S+")
USER_RE = re.compile(r"@\w+")


def clean_text(txt: str) -> str:
    """Normalise one raw comment for language detection and modelling.

    Missing values (anything ``pd.isna`` flags) become ``""``.  Otherwise the
    value is stringified, links and @-mentions are swapped for ``<URL>`` /
    ``<USER>`` placeholders, ASCII punctuation is removed, the text is
    lower-cased, and runs of two or more whitespace characters are collapsed
    to a single space before the result is stripped.

    Note that the punctuation pass also removes the angle brackets of the
    placeholders, so they come out as the plain tokens ``url`` and ``user``.
    """
    if pd.isna(txt):
        return ""

    # Placeholders first: the punctuation strip below would otherwise break
    # the URLs and mentions apart before they can be recognised.
    replaced = USER_RE.sub(" <USER> ", URL_RE.sub(" <URL> ", str(txt)))
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = replaced.translate(punctuation_table).lower()
    return re.sub(r"\s{2,}", " ", lowered).strip()

# Add the normalised text as a new "clean" column (one clean_text call per row).
df["clean"] = df["text"].apply(clean_text)

# Show the first comment before and after cleaning as a quick sanity check.
# NOTE(review): .iloc[0] raises IndexError when the DataFrame is empty, and the
# header string below looks like a mis-decoded emoji -- confirm the file encoding.
print("\nðŸ”§ Cleaning Example:")
print(f"Original: {df['text'].iloc[0]}")
print(f"Cleaned:  {df['clean'].iloc[0]}")

In [None]:
##language detection 

def safe_detect(txt):
    """Return the language code detected for *txt*, or ``"unknown"``.

    ``"unknown"`` is returned when *txt* is not a string, is empty or
    whitespace-only, or when the detector raises for any reason (for example
    text without usable features).

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long ``apply`` can be interrupted.
    """
    if not isinstance(txt, str) or not txt.strip():
        return "unknown"
    try:
        return detect(txt)
    except Exception:
        # Best effort: one undetectable comment must not abort the whole run.
        return "unknown"

# Detect the language of every cleaned comment ("unknown" when detection fails
# or the cleaned text is blank).
df["lang"] = df["clean"].apply(safe_detect)

# Print the first five entries of the language frequency table.
lang_dist = df["lang"].value_counts()
print(lang_dist.head())


In [None]:
# translation to english

from transformers import pipeline as hf_pipe

# Translation pipeline used to bring non-English comments into English.
translator = hf_pipe(
    task="translation",
    model="facebook/nllb-200-distilled-600M",
    # NOTE(review): "auto" does not have the same form as the target code
    # ("eng_Latn"); confirm the model/tokenizer really accepts it as a source
    # language instead of a concrete language code.
    src_lang="auto",
    tgt_lang="eng_Latn",
    max_length=256,
    device_map="auto"
)
print("Translator loaded")

# langdetect language codes -> FLORES-200 codes, the form NLLB uses for its
# language tags (the target below is "eng_Latn" in the same scheme).
_LANGDETECT_TO_FLORES = {
    "ar": "arb_Arab",
    "bg": "bul_Cyrl",
    "bn": "ben_Beng",
    "ca": "cat_Latn",
    "cs": "ces_Latn",
    "da": "dan_Latn",
    "de": "deu_Latn",
    "el": "ell_Grek",
    "es": "spa_Latn",
    "fi": "fin_Latn",
    "fr": "fra_Latn",
    "he": "heb_Hebr",
    "hi": "hin_Deva",
    "hr": "hrv_Latn",
    "hu": "hun_Latn",
    "id": "ind_Latn",
    "it": "ita_Latn",
    "ja": "jpn_Jpan",
    "ko": "kor_Hang",
    "lt": "lit_Latn",
    "nl": "nld_Latn",
    "no": "nob_Latn",
    "pl": "pol_Latn",
    "pt": "por_Latn",
    "ro": "ron_Latn",
    "ru": "rus_Cyrl",
    "sk": "slk_Latn",
    "sl": "slv_Latn",
    "sv": "swe_Latn",
    "th": "tha_Thai",
    "tr": "tur_Latn",
    "uk": "ukr_Cyrl",
    "ur": "urd_Arab",
    "vi": "vie_Latn",
    "zh-cn": "zho_Hans",
    "zh-tw": "zho_Hant",
}


def to_english(row):
    """Return the English version of ``row.clean``.

    Rows whose ``row.lang`` is ``"en"`` or ``"unknown"`` are returned
    untouched.  Every other row is sent through the module-level
    ``translator`` and the translation is lower-cased.

    When the detected language has a known FLORES-200 code, that code is
    passed as ``src_lang`` for this call, so the model is told the real
    source language instead of relying on the pipeline-level default.
    Languages without a mapping use the pipeline defaults, as before.

    If translation fails for any reason the cleaned original text is
    returned, so one bad row does not abort the whole ``apply``.
    """
    if row.lang in ("en", "unknown"):
        return row.clean

    call_kwargs = {"max_length": 256}
    src_code = _LANGDETECT_TO_FLORES.get(row.lang)
    if src_code is not None:
        call_kwargs.update(src_lang=src_code, tgt_lang="eng_Latn")

    try:
        tr = translator(row.clean, **call_kwargs)[0]["translation_text"]
    except Exception:
        # Best effort; KeyboardInterrupt/SystemExit are no longer swallowed.
        return row.clean
    return str(tr).lower()

# Translate row by row (axis=1 hands each row to to_english); rows tagged
# "en" or "unknown" pass through unchanged.
df["clean_en"] = df.apply(to_english, axis=1)
print(" complete")


In [None]:
#sentiment classification

#roberta sentiment 

# Sentiment classifier; truncation is requested with a max_length of 512
# (presumably tokens -- confirm against the pipeline documentation).
sent_clf = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest",
    truncation=True,
    max_length=512
)

# Classify every English comment in batches of 32. Each result is read as a
# mapping with a "label" and a "score" entry.
sent_out = sent_clf(df["clean_en"].tolist(), batch_size=32)
# Labels are lower-cased because later cells compare against "neutral",
# "positive" and "negative".
df["roberta_sentiment"] = [o["label"].lower() for o in sent_out]
# Score of the predicted label; later cells compare it to the 0.85 cut-off.
df["sent_score"] = [o["score"] for o in sent_out]
print(df["roberta_sentiment"].value_counts())





In [None]:



###  Zero-Shot Refinement (BART)
#Reclassify low-confidence neutral comments (<0.85) using BART.

# Zero-shot classifier shared by the sentiment refinement in this cell and the
# zero-shot purchase-intent cell further down.
# NOTE(review): confirm the zero-shot pipeline actually honours truncation=True
# when it is passed at construction time.
zs_clf = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    truncation=True
)
ZS_SENTIMENT_LABELS = ["positive", "neutral", "negative"]


def classify_sentiment_zs(text):
    """Zero-shot sentiment classification of a single comment.

    Returns the top-ranked label from ``ZS_SENTIMENT_LABELS`` in lower case,
    or ``"unknown"`` when *text* is not a non-blank string or the classifier
    fails.  Blank input is answered without calling the model.

    Only ``Exception`` subclasses are swallowed, so a long run can still be
    interrupted with Ctrl-C.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        out = zs_clf(
            text,
            candidate_labels=ZS_SENTIMENT_LABELS,
            hypothesis_template="This comment expresses a {} sentiment."
        )
        # The first entry of out["labels"] is used as the prediction.
        return out["labels"][0].lower()
    except Exception:
        # Best effort: callers treat "unknown" as "no zero-shot opinion".
        return "unknown"


# Apply only to low-confidence neutrals: RoBERTa said "neutral" with a score
# below 0.85. choose_final_sentiment uses the same 0.85 cut-off, so the two
# values have to be changed together.
neutral_mask = (df["roberta_sentiment"] == "neutral") & (df["sent_score"] < 0.85)
# Rows outside the mask keep None, i.e. "no zero-shot result".
df["zero_shot_sentiment"] = None
df.loc[neutral_mask, "zero_shot_sentiment"] = df.loc[neutral_mask, "clean_en"].apply(classify_sentiment_zs)


# Emotional Positive Detection
#Rule-based detection for highly emotional positive language.



EMOTIONAL_POSITIVE_PHRASES = [
    "made me cry", "i cried", "crying", "tears", "emotional",
    "so touching", "beautiful", "inspiring", "goosebumps",
    "love this", "uplifting", "empowering", "gave me chills",
    "stunning", "heartwarming",
    # Emoji are written as escapes so they survive re-encoding of this file.
    # The previous entries were mis-decoded byte sequences containing an
    # upper-case character, which could never match the lower-cased text.
    "\U0001F62D",  # loudly crying face
    "\U0001F972",  # smiling face with tear
    "\U0001F979",  # face holding back tears
]


def is_emotional_positive(text):
    """Return True when *text* contains any emotional-positive phrase.

    The input is stringified and lower-cased, then each entry of
    ``EMOTIONAL_POSITIVE_PHRASES`` is tested as a plain substring (so
    ``"tears"`` also matches inside longer words).
    """
    text_lower = str(text).lower()
    return any(phrase in text_lower for phrase in EMOTIONAL_POSITIVE_PHRASES)




In [None]:
##  Final Sentiment Decision
#Combine RoBERTa, zero-shot, and emotional detection.


def choose_final_sentiment(row):
    """Pick the final sentiment label for one comment row.

    The checks run in this order and the first hit wins:

    1. The English text contains an emotional-positive phrase -> "positive".
    2. RoBERTa predicted something other than "neutral" -> that label.
    3. RoBERTa predicted "neutral" with a score >= 0.85 -> "neutral".
    4. The zero-shot result is "positive" or "negative" -> that result.
    5. Anything else -> "neutral".
    """
    # Rule 1 is evaluated before the RoBERTa columns are touched.
    if is_emotional_positive(row.get("clean_en", "")):
        return "positive"

    roberta_label = row["roberta_sentiment"]
    if roberta_label != "neutral":
        return roberta_label

    if row["sent_score"] >= 0.85:
        return "neutral"

    # Low-confidence neutral: defer to the zero-shot opinion if it has one.
    zero_shot = row.get("zero_shot_sentiment", "unknown")
    return zero_shot if zero_shot in ("positive", "negative") else "neutral"

# Resolve one final label per comment (row-wise apply).
df["final_sentiment"] = df.apply(choose_final_sentiment, axis=1)

# Print how often each final label occurs.
print("\n Final Sentiment Distribution:")
print(df["final_sentiment"].value_counts())




In [None]:
##  Purchase Intent Analysis

## Detect purchase intentions using keywords + zero-shot classification.

POS_KW = {"buy", "buying", "bought", "purchase", "order", "need this"}
NEG_KW = {"won't buy", "not buying", "too expensive", "overpriced", "never buy"}

# Straight and typographic apostrophes are dropped from both the text and the
# keyword before matching, so "won't buy" also matches text whose punctuation
# has already been stripped ("wont buy").
_APOSTROPHES = str.maketrans("", "", "'\u2019")


def has_kw(text, kw_set):
    """Return True when *text* contains any keyword from *kw_set*.

    Matching is case-insensitive and ignores apostrophes.  Multi-word
    keywords are matched as substrings; single-word keywords must match as
    whole words (``"buy"`` does not match ``"buyer"``).
    """
    haystack = str(text).lower().translate(_APOSTROPHES)
    for kw in kw_set:
        needle = kw.lower().translate(_APOSTROPHES)
        if " " in needle:
            if needle in haystack:
                return True
        elif re.search(rf"\b{re.escape(needle)}\b", haystack):
            return True
    return False


def classify_purchase_keywords(text):
    """Keyword-based purchase intent classification.

    Returns ``"Purchase_Neg"``, ``"Purchase_Pos"`` or ``"NoPurchase"``.

    Negative phrases are tested first: several of them ("not buying",
    "never buy", "won't buy") contain a positive keyword, so testing the
    positive set first would label them as positive intent.
    """
    if has_kw(text, NEG_KW):
        return "Purchase_Neg"
    if has_kw(text, POS_KW):
        return "Purchase_Pos"
    return "NoPurchase"

# Keyword-based purchase intent for every English comment.
# NOTE(review): no later cell shown in this file reads this column; the summary
# and the plot use "purchase_intent_zs" only -- confirm whether that is intended.
df["purchase_intent_keywords"] = df["clean_en"].apply(classify_purchase_keywords)



In [None]:
# Zero-shot purchase intent
ZS_PURCHASE_LABELS = ["Purchase_Pos", "Purchase_Neg", "NoPurchase"]


def classify_purchase_zs(text):
    """Zero-shot purchase-intent classification of a single comment.

    Returns the top-ranked label from ``ZS_PURCHASE_LABELS``, or
    ``"unknown"`` when *text* is not a non-blank string or the classifier
    fails.  Blank input is answered without calling the model.

    Only ``Exception`` subclasses are swallowed, so a long run can still be
    interrupted with Ctrl-C.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        # NOTE(review): the labels are identifiers, not natural language, so
        # the hypothesis reads e.g. "This comment expresses Purchase_Pos
        # behavior." -- worth checking whether descriptive labels mapped back
        # to these identifiers classify better.
        out = zs_clf(
            text,
            candidate_labels=ZS_PURCHASE_LABELS,
            hypothesis_template="This comment expresses {} behavior."
        )
        return out["labels"][0]
    except Exception:
        # Best effort: a failing comment is reported as "unknown".
        return "unknown"

# Run the zero-shot purchase classifier on every comment, one call per row.
df["purchase_intent_zs"] = df["clean_en"].apply(classify_purchase_zs)

# Label frequencies; can include "unknown" for comments the classifier failed on.
print(df["purchase_intent_zs"].value_counts())




In [None]:
##  Weighted Net Sentiment

#Calculate weighted sentiment using likes as engagement weight.

# Engagement weight: square root of (likes + 1), with missing likes counted as 0.
# The +1 keeps comments without likes at weight 1 instead of 0.
df["weight"] = np.sqrt(df["likes"].fillna(0).add(1))

# Numeric value of each final label: +1 / 0 / -1.
sent_val_map = dict(positive=1, neutral=0, negative=-1)
df["sent_val"] = df["final_sentiment"].map(sent_val_map)

# Weighted mean of the sentiment values.
weighted_net = df["sent_val"].mul(df["weight"]).sum() / df["weight"].sum()

print(f"\n Weighted Net Sentiment: {weighted_net:.3f}")



In [None]:
# Overall summary metrics across every comment in df.
# NOTE(review): nothing in this cell groups by campaign; the figures below are
# totals over the whole DataFrame.

# Share of each final sentiment label as a percentage of all rows.
sent_counts = df["final_sentiment"].value_counts()
sent_percent = sent_counts.div(len(df)).mul(100)

print("\n Final Sentiment Summary:")
print(f"Positive: {sent_percent.get('positive', 0):.1f}%")
print(f"Neutral:  {sent_percent.get('neutral', 0):.1f}%")
print(f"Negative: {sent_percent.get('negative', 0):.1f}%")

# Percentage of comments the zero-shot model labelled as positive purchase intent.
purchase_pos_pct = df["purchase_intent_zs"].eq("Purchase_Pos").mean() * 100
print(f"Purchase Intent (Positive): {purchase_pos_pct:.1f}%")




In [None]:
## Visualizations


# Colours are looked up by label. value_counts() orders the bars by frequency,
# so a fixed positional colour list can paint, for example, "negative" green
# whenever negative is the most common label.
SENTIMENT_COLORS = {"positive": "green", "neutral": "gray", "negative": "red"}
INTENT_COLORS = {"Purchase_Pos": "green", "Purchase_Neg": "red", "NoPurchase": "gray"}
FALLBACK_COLOR = "lightgray"  # any other label, e.g. "unknown"

# Sentiment distribution (uses sent_counts from the summary cell)
plt.figure(figsize=(10, 6))
sent_counts.plot(
    kind='bar',
    color=[SENTIMENT_COLORS.get(label, FALLBACK_COLOR) for label in sent_counts.index],
)
plt.title('Sentiment Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('sentiment_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Purchase intent
intent_counts = df["purchase_intent_zs"].value_counts()
plt.figure(figsize=(10, 6))
intent_counts.plot(
    kind='bar',
    color=[INTENT_COLORS.get(label, FALLBACK_COLOR) for label in intent_counts.index],
)
plt.title('Purchase Intent Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Intent')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('purchase_intent.png', dpi=300, bbox_inches='tight')
plt.show()
