## Assignment III
Aarya Doshi

In [1]:
import codecs
import json
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

from sklearn import svm, metrics

from scipy.sparse import hstack
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader
import torch

  torch.utils._pytree._register_pytree_node(


In [2]:
def read_dataset(path):
  """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
  with codecs.open(path, 'r', 'utf-8') as handle:
    return json.load(handle)

In [3]:
# Location of the pizza-request dataset, relative to the notebook's working directory.
path = 'data/pizza_request_dataset.json'
dataset = read_dataset(path)

# One outcome label per request, read from the 'requester_received_pizza' field.
successes = [r['requester_received_pizza'] for r in dataset]
# Share of requests with a truthy label, expressed as a percentage.
success_rate = 100.0 * sum(successes) / float(len(successes))
print ('The average success rate is: %.2f%%' %(success_rate))

# Split positions into `dataset` rather than the records themselves, so each
# model below can index its own features with the same train/test partition.
# 567 requests are held out, stratified on the outcome label.
indices = list(range(len(dataset)))
train_idx, test_idx = train_test_split(
    indices, test_size=567, random_state=23, stratify=successes
)

The average success rate is: 24.63%


In [4]:
# Shared across clean_data calls: clean_data runs once per post, so creating a
# lemmatizer and loading the stop-word list on every call is repeated work.
_CLEAN_DATA_RESOURCES = {}


def _clean_data_resources():
    """Return the shared (lemmatizer, English stop words) pair, creating it on first use."""
    if not _CLEAN_DATA_RESOURCES:
        _CLEAN_DATA_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _CLEAN_DATA_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _CLEAN_DATA_RESOURCES['lemmatizer'], _CLEAN_DATA_RESOURCES['stop_words']


def clean_data(text, keep_words=None):
    """Lowercase ``text``, strip punctuation, drop stop words and lemmatize the rest.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    keep_words : iterable of str, optional
        Words that must survive stop-word removal even if they appear in the
        English stop-word list. Any iterable is accepted, not only a set.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order (duplicates preserved).
    """
    lemmatizer, stop_words = _clean_data_resources()
    if keep_words:
        # Build a per-call view; the cached stop-word set itself is never modified.
        stop_words = stop_words.difference(keep_words)

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    words = re.findall(r'\b\w+\b', text, re.IGNORECASE)

    # Stop words are filtered on the surface form, before lemmatization.
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

In [5]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a class-balanced linear SVM on the training split and score the test split.

    Returns a tuple ``(accuracy, precision, recall, f1, specificity, auc)``.
    AUC is computed from the SVM decision-function values; specificity is
    derived from the confusion matrix and is 0.0 when there are no negatives.
    """
    classifier = svm.SVC(
        kernel='linear',
        probability=False,
        random_state=23,
        class_weight="balanced",
    )
    classifier.fit(X_train, y_train)

    predicted = classifier.predict(X_test)
    margins = classifier.decision_function(X_test)

    # Threshold-based metrics use the hard predictions.
    accuracy = metrics.accuracy_score(y_test, predicted)
    precision, recall, f1 = (
        scorer(y_test, predicted, zero_division=0)
        for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    )
    # Ranking metric uses the signed distances to the separating hyperplane.
    auc = metrics.roc_auc_score(y_test, margins)

    tn, fp, fn, tp = metrics.confusion_matrix(y_test, predicted).ravel()
    negatives = tn + fp
    if negatives > 0:
        specificity = tn / negatives
    else:
        specificity = 0.0

    return accuracy, precision, recall, f1, specificity, auc

In [6]:
def print_results(accuracy, precision, recall, f1, specificity, auc):
    """Print the six evaluation metrics, one per line, with four decimal places."""
    report = (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
        ("Specificity", specificity),
        ("AUC", auc),
    )
    for label, value in report:
        print(f"{label}: {value:.4f}")


## Model 1 – n-grams
<p> This model extracts the top 500 unigrams and the top 500 bigrams as features 
for classifying pizza requests as successful or unsuccessful.</p>

In [7]:
def n_grams(X_train, X_test, n, count):
    """Turn raw texts into TF-IDF features over n-grams of exactly length ``n``.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``; English stop words are removed, sublinear term-frequency
    scaling is enabled, and the vocabulary is capped at ``count`` features.

    Returns ``(train_matrix, test_matrix)``.
    """
    tfidf_options = {
        'ngram_range': (n, n),
        'stop_words': 'english',
        'max_features': count,
        'sublinear_tf': True,
    }
    vectorizer = TfidfVectorizer(**tfidf_options)

    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    return train_matrix, test_matrix

def model_1():
    """Build the Model 1 feature matrices: 500 unigram plus 500 bigram TF-IDF columns.

    Each request is represented by its title and body joined with a space.
    Uses the notebook-level ``dataset``, ``successes``, ``train_idx`` and
    ``test_idx``. Returns ``(X_train, X_test, y_train, y_test)`` where the X
    values are horizontally stacked sparse matrices.
    """
    documents = [f"{r['request_title']} {r['request_text']}" for r in dataset]

    def take(values, positions):
        # Select the entries of `values` at the given positions, in order.
        return [values[i] for i in positions]

    X_train, X_test = take(documents, train_idx), take(documents, test_idx)
    y_train, y_test = take(successes, train_idx), take(successes, test_idx)

    # One (train, test) pair per n-gram length; unigrams first, then bigrams.
    per_length = [n_grams(X_train, X_test, n, 500) for n in (1, 2)]
    X_train_combined = hstack([train_part for train_part, _ in per_length])
    X_test_combined = hstack([test_part for _, test_part in per_length])

    return X_train_combined, X_test_combined, y_train, y_test

In [8]:
# Model 1: build the n-gram features, fit the SVM and print the test-set metrics.
X_train, X_test, y_train, y_test = model_1()
acc, prec, rec, f1, spec, auc = train_and_evaluate(X_train, X_test, y_train, y_test)
print_results(acc, prec, rec, f1, spec, auc)

Accuracy: 0.6155
Precision: 0.3304
Recall: 0.5429
F1 Score: 0.4108
Specificity: 0.6393
AUC: 0.6192


## Model 2 – Activity and Reputation
<p> This model uses a variety of the activity and reputation data included in the dataset file (pizza_request_dataset.json) as features to distinguish between successful and 
unsuccessful requests.</p>

In [9]:
def model_2():
    """Build the Model 2 feature matrices from activity and reputation fields.

    Each request becomes a 21-value row: an edited flag, log1p-compressed
    count/age fields, the log1p of the subreddit-list length, and the two
    signed vote differences left untransformed. Rows are split with the
    notebook-level ``train_idx`` / ``test_idx`` and standardised with a scaler
    fitted on the training rows only.

    Returns ``(X_train_scaled, X_test_scaled, y_train, y_test)``.
    """
    # NOTE(review): several fields are "*_at_retrieval" values; presumably these
    # are recorded after the request's outcome is known and may leak the label.
    # Confirm against the dataset documentation.

    # Count/age fields that precede the subreddit-list length, in column order.
    leading_log_fields = (
        'requester_account_age_in_days_at_request',
        'requester_account_age_in_days_at_retrieval',
        'requester_days_since_first_post_on_raop_at_request',
        'requester_days_since_first_post_on_raop_at_retrieval',
        'requester_number_of_comments_at_request',
        'requester_number_of_comments_at_retrieval',
        'requester_number_of_comments_in_raop_at_request',
        'requester_number_of_comments_in_raop_at_retrieval',
        'requester_number_of_posts_at_request',
        'requester_number_of_posts_at_retrieval',
        'requester_number_of_posts_on_raop_at_request',
        'requester_number_of_posts_on_raop_at_retrieval',
        'requester_number_of_subreddits_at_request',
    )
    request_vote_fields = (
        'number_of_downvotes_of_request_at_retrieval',
        'number_of_upvotes_of_request_at_retrieval',
    )
    # Signed differences are kept raw rather than passed through log1p.
    signed_fields = (
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_minus_downvotes_at_retrieval',
    )
    vote_total_fields = (
        'requester_upvotes_plus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_retrieval',
    )

    def featurize(r):
        # Assemble one row; the column order matters for the scaler and the SVM.
        row = [1 if r['post_was_edited'] else 0]
        row.extend(np.log1p(r[field]) for field in leading_log_fields)
        row.append(np.log1p(len(r['requester_subreddits_at_request'])))
        row.extend(np.log1p(r[field]) for field in request_vote_fields)
        row.extend(r[field] for field in signed_fields)
        row.extend(np.log1p(r[field]) for field in vote_total_fields)
        return row

    data = [featurize(r) for r in dataset]

    X_train, y_train = [], []
    for i in train_idx:
        X_train.append(data[i])
        y_train.append(successes[i])

    X_test, y_test = [], []
    for i in test_idx:
        X_test.append(data[i])
        y_test.append(successes[i])

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

In [10]:
# Model 2: build the scaled activity/reputation features, fit the SVM and print the metrics.
X_train, X_test, y_train, y_test = model_2()
acc, prec, rec, f1, spec, auc = train_and_evaluate(X_train, X_test, y_train, y_test)

print_results(acc, prec, rec, f1, spec, auc)

Accuracy: 0.8166
Precision: 0.6011
Recall: 0.7643
F1 Score: 0.6730
Specificity: 0.8337
AUC: 0.8789


# Model 3 – Narratives 
This third model will extract features corresponding to the narrative dimensions identified in [1]. Refer to the enclosed files within “/resources/narratives”. There are five narratives – desire, family, job, money, and student. Each narrative file has a set of words associated with it. To extract post features corresponding to a narrative, perform regular expression match between all words corresponding to the narrative and those corresponding to a post (in the training and test sets). The narrative features for a post will be the ratio of the number of matches for each narrative to the total number of white spaced words in the post.

In [11]:
def _load_narrative_words(name):
    """Return the lines of resources/narratives/<name>.txt as a set of words."""
    # The context manager closes the handle once the file has been read.
    with codecs.open(f'resources/narratives/{name}.txt', 'r', encoding='utf-8') as handle:
        return set(handle.read().strip().splitlines())


def model_3(scale_factor):
    """Build the Model 3 feature lists from the five narrative word lists.

    For every request (title and body joined with a space) the text is cleaned
    with ``clean_data`` (narrative words are protected from stop-word removal).
    Each narrative then contributes one feature:
    ``log1p(scale_factor * matches / total_words)``, where ``matches`` counts
    cleaned tokens found in that narrative's word set and ``total_words`` is
    the number of cleaned tokens. A request with no tokens gets 0 for every
    narrative.

    Parameters
    ----------
    scale_factor : number
        Multiplier applied to each ratio before ``log1p``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as plain lists, split with the
        notebook-level ``train_idx`` / ``test_idx``. Feature order is desire,
        family, job, money, student.
    """
    narrative_words = {
        name: _load_narrative_words(name)
        for name in ('desire', 'family', 'job', 'money', 'student')
    }

    all_narrative_words = set().union(*narrative_words.values())

    data = []

    for r in dataset:
        text = f"{r['request_title']} {r['request_text']}".lower()
        clean_words = clean_data(text, all_narrative_words)
        total_words = len(clean_words)

        features = []
        for words in narrative_words.values():
            match_count = sum(1 for w in clean_words if w in words)
            ratio = scale_factor * match_count / total_words if total_words > 0 else 0
            features.append(np.log1p(ratio))

        data.append(features)

    X_train = [data[i] for i in train_idx]
    X_test = [data[i] for i in test_idx]
    y_train = [successes[i] for i in train_idx]
    y_test = [successes[i] for i in test_idx]

    return X_train, X_test, y_train, y_test

In [12]:
# Model 3: narrative-ratio features with a scale factor of 2, then fit the SVM and print the metrics.
X_train, X_test, y_train, y_test = model_3(2)
acc, prec, rec, f1, spec, auc = train_and_evaluate(X_train, X_test, y_train, y_test)

print_results(acc, prec, rec, f1, spec, auc)

Accuracy: 0.5185
Precision: 0.2915
Recall: 0.6643
F1 Score: 0.4052
Specificity: 0.4707
AUC: 0.5841


# Model 4 - Moral Foundations
This fourth model will use the dimensions of “moral foundations” as features for classifying successful and unsuccessful requests. These dimensions are based on the moral foundations theory that seeks to understand why morality varies so much across cultures yet still shows so many similarities and recurrent themes. In brief, the theory proposes that several innate and universally available psychological systems are the foundations of “intuitive ethics.” The dimensions of the moral foundations include: care/harm, fairness/cheating, loyalty/betrayal, authority/subversion, and sanctity/degradation. 

In [13]:
def model_4(scale_factor):
    """Build the Model 4 feature lists from the Moral Foundations dictionary.

    The dictionary file is parsed line by line: blank lines and lines starting
    with '%' are skipped, the first token is the term and the remaining tokens
    are category codes. Codes 01-10 are folded pairwise into five foundations;
    any other code is ignored. A trailing '*' in a term matches any run of
    word characters.

    For every request (title and body joined with a space, lowercased) each
    foundation contributes ``log1p(scale_factor * matches / total_words)``.
    ``matches`` counts regex hits in the lowercased text, while ``total_words``
    is the number of tokens left after ``clean_data``. A request with no
    tokens, or a foundation with no terms, yields 0.

    Parameters
    ----------
    scale_factor : number
        Multiplier applied to each ratio before ``log1p``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as plain lists, split with the
        notebook-level ``train_idx`` / ``test_idx``. Feature order is
        care/harm, fairness/cheating, loyalty/betrayal, authority/subversion,
        sanctity/degradation.
    """
    dimensions = {
        'care/harm': set(),
        'fairness/cheating': set(),
        'loyalty/betrayal': set(),
        'authority/subversion': set(),
        'sanctity/degradation': set()
    }

    # Dictionary category code -> foundation (two codes per foundation).
    category_to_dimension = {
        '01': 'care/harm',
        '02': 'care/harm',
        '03': 'fairness/cheating',
        '04': 'fairness/cheating',
        '05': 'loyalty/betrayal',
        '06': 'loyalty/betrayal',
        '07': 'authority/subversion',
        '08': 'authority/subversion',
        '09': 'sanctity/degradation',
        '10': 'sanctity/degradation'
    }

    moral_words = set()

    with open('resources/MoralFoundations.dic', 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip() or line.startswith('%'):
                continue
            parts = re.split(r'\s+', line.strip())
            term, categories = parts[0], parts[1:]

            for category in categories:
                if category in category_to_dimension:
                    dimensions[category_to_dimension[category]].add(term)
                    moral_words.add(term)

    # Compile one pattern per foundation up front; the word lists do not change
    # from post to post, so rebuilding the alternation inside the loop is wasted work.
    patterns = []
    for words in dimensions.values():
        if not words:
            # An empty alternation would match at every word boundary.
            patterns.append(None)
            continue
        alternation = '|'.join(
            re.escape(word).replace(r'\*', r'\w*') for word in words
        )
        patterns.append(re.compile(r'\b(?:' + alternation + r')\b', re.IGNORECASE))

    data = []

    for r in dataset:
        text = f"{r['request_title']} {r['request_text']}".lower()
        clean_words = clean_data(text, moral_words)
        total_words = len(clean_words)

        features = []
        for pattern in patterns:
            match_count = len(pattern.findall(text)) if pattern is not None else 0
            ratio = scale_factor * match_count / total_words if total_words > 0 else 0
            features.append(np.log1p(ratio))

        data.append(features)

    X_train = [data[i] for i in train_idx]
    X_test = [data[i] for i in test_idx]
    y_train = [successes[i] for i in train_idx]
    y_test = [successes[i] for i in test_idx]

    return X_train, X_test, y_train, y_test


In [14]:
# Multiplier that model_4 applies to each match ratio before taking log1p.
scale_factor = 1250

X_train, X_test, y_train, y_test = model_4(scale_factor)

# Model 4: fit the SVM on the moral-foundation features and print the test-set metrics.
acc, prec, rec, f1, spec, auc = train_and_evaluate(X_train, X_test, y_train, y_test)
print_results(acc, prec, rec, f1, spec, auc)

Accuracy: 0.6543
Precision: 0.2879
Recall: 0.2714
F1 Score: 0.2794
Specificity: 0.7799
AUC: 0.5522


# Model 5 - Pre-trained BERT
For this model, I used the pretrained bert-base-uncased model to generate embeddings for each request post without any fine-tuning. I averaged the token-level embeddings from BERT to obtain a fixed-size vector for each post to train the linear SVM classifier on these vectors. This approach allows the model to capture deeper semantic and contextual information from the text, beyond keyword matching.

In [15]:
def model_5():
    """Build the Model 5 feature arrays from mean-pooled ``bert-base-uncased`` embeddings.

    Each request (title and body joined with a space) is tokenized on its own,
    truncated to at most 512 tokens, and passed through the model in eval mode
    with gradients disabled. The last hidden states are averaged over the
    token axis to give one vector per request.

    Returns ``(X_train, X_test, y_train, y_test)`` as NumPy arrays, split with
    the notebook-level ``train_idx`` / ``test_idx``.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encoder = BertModel.from_pretrained('bert-base-uncased')
    encoder.eval()

    def embed(r):
        # Encode a single post and mean-pool its token vectors.
        text = f"{r['request_title']} {r['request_text']}"
        encoded = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
        hidden_states = encoder(**encoded).last_hidden_state.squeeze(0)
        return hidden_states.mean(dim=0).numpy()

    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        pooled = [embed(r) for r in dataset]

    data = np.array(pooled)
    labels = np.array(successes)

    return data[train_idx], data[test_idx], labels[train_idx], labels[test_idx]


In [16]:
# Model 5: compute the BERT embeddings for every request, fit the SVM and print the metrics.
X_train, X_test, y_train, y_test = model_5()

acc, prec, rec, f1, spec, auc = train_and_evaluate(X_train, X_test, y_train, y_test)
print_results(acc, prec, rec, f1, spec, auc)

Accuracy: 0.6067
Precision: 0.3398
Recall: 0.6286
F1 Score: 0.4411
Specificity: 0.5995
AUC: 0.6386
