## Dataset import and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
url = 'https://raw.githubusercontent.com/adityamhaske/Kelley-LLM-NLP-Cross-Classified/refs/heads/main/LLM%20for%20Brand%20Equity%20and%20Customer%20Satisfaction/data.csv'
df = pd.read_csv(url)

In [5]:
df.head()

Unnamed: 0,company,year,text,be,ce
0,Apple,2007,The Company is committed to bringing the best ...,60,33
1,Apple,2007,The Company's business strategy leverages its ...,87,33
2,Apple,2007,The Company believes continual investment in r...,87,0
3,Apple,2007,the Company continues to capitalize on the con...,20,0
4,Apple,2007,The Company's strategy also includes expanding...,20,20


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405 entries, 0 to 404
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   company  405 non-null    object
 1   year     405 non-null    int64 
 2   text     405 non-null    object
 3   be       405 non-null    int64 
 4   ce       405 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 15.9+ KB


### Text Preprocessing:

- Convert text to lowercase.

- Remove stopwords, punctuation, and special characters.
- Perform lemmatization.
- Encode company as a categorical variable.
- Use TF-IDF or a transformer-based embedding model.



In [8]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary resources
nltk.download("stopwords")
nltk.download("wordnet")

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Define a basic set of stopwords manually
custom_stopwords = set([
    "the", "is", "in", "and", "to", "of", "a", "this", "for", "on", "with", "as",
    "that", "it", "at", "by", "an", "be", "are", "from", "has", "or", "was", "were",
    "also", "which", "have", "company", "companys"
])

# Alternative text preprocessing function
def preprocess_text_basic(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = text.split()
    words = [word for word in words if word not in custom_stopwords]  # Remove stopwords
    return " ".join(words)

# Apply basic preprocessing
df["clean_text"] = df["text"].apply(preprocess_text_basic)

# Show processed text samples
df[["text", "clean_text"]].head()


Unnamed: 0,text,clean_text
0,The Company is committed to bringing the best ...,committed bringing best experience consumers t...
1,The Company's business strategy leverages its ...,business strategy leverages its unique ability...
2,The Company believes continual investment in r...,believes continual investment research develop...
3,the Company continues to capitalize on the con...,continues capitalize convergence personal comp...
4,The Company's strategy also includes expanding...,strategy includes expanding its distribution n...



### Model Development:

- Experiment with different models (Random Forest, XGBoost, or Transformers).
- Train two models separately for be and ce or a multi-output model.

### Evaluation:

Use RMSE, MAE, and R² scores.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode company names
label_encoder = LabelEncoder()
df["company_encoded"] = label_encoder.fit_transform(df["company"])

# Convert text into TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Limit features to 500 for efficiency
X_text_tfidf = tfidf_vectorizer.fit_transform(df["clean_text"]).toarray()

# Combine company encoding with text features
X = np.hstack((X_text_tfidf, df["company_encoded"].values.reshape(-1, 1)))

# Define target variables
y_be = df["be"].values  # Brand Equity
y_ce = df["ce"].values  # Customer Satisfaction

# Check shape of the final dataset
X.shape, y_be.shape, y_ce.shape


((405, 501), (405,), (405,))

## Feature Engineering

In [13]:
label_encoder = LabelEncoder()
df["company_encoded"] = label_encoder.fit_transform(df["company"])

In [14]:
tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_text_tfidf = tfidf_vectorizer.fit_transform(df["clean_text"]).toarray()

In [15]:
X = np.hstack((X_text_tfidf, df["company_encoded"].values.reshape(-1, 1)))
y_be = df["be"].values  # Brand Equity
y_ce = df["ce"].values  # Customer Satisfaction
print("Feature Shape:", X.shape)

Feature Shape: (405, 501)


## Model Development

In [16]:
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [17]:
# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_be_train, y_be_test, y_ce_train, y_ce_test = train_test_split(
    X, y_be, y_ce, test_size=0.2, random_state=42
)

In [18]:
# Train Brand Equity Model
rf_be = RandomForestRegressor(n_estimators=100, random_state=42)
rf_be.fit(X_train, y_be_train)

# Train Customer Satisfaction Model
rf_ce = RandomForestRegressor(n_estimators=100, random_state=42)
rf_ce.fit(X_train, y_ce_train)

### Model Evaluation

In [21]:
def evaluate_model(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # Compute RMSE manually
    print(f"📊 {name} Model Evaluation:")
    print(f"  R² Score: {r2_score(y_test, y_pred):.3f}")
    print(f"  RMSE: {rmse:.3f}")  # Use manually computed RMSE
    print(f"  MAE: {mean_absolute_error(y_test, y_pred):.3f}")
    print("-" * 40)

# Evaluate both models
evaluate_model(rf_be, X_test, y_be_test, "Brand Equity")
evaluate_model(rf_ce, X_test, y_ce_test, "Customer Satisfaction")


📊 Brand Equity Model Evaluation:
  R² Score: 0.718
  RMSE: 19.497
  MAE: 13.361
----------------------------------------
📊 Customer Satisfaction Model Evaluation:
  R² Score: 0.563
  RMSE: 20.855
  MAE: 14.575
----------------------------------------


### Model Download

In [22]:
# Save models
joblib.dump(rf_be, "rf_be_model.pkl")
joblib.dump(rf_ce, "rf_ce_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

print("Models saved successfully!")

Models saved successfully!
