# Libraries

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [5]:
# load file
data = pd.read_csv("data/IMDB_dataset.csv", encoding_errors="ignore", on_bad_lines='skip') # later: consider handling errors by removal

### Train / Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# 60/40 train/test test
TEST_SIZE = 0.3
X_train, X_test, y_train, y_test = train_test_split(data["review"], data["sentiment"], test_size=TEST_SIZE, random_state=42)

# Sentiment Analysis Model

### Logistic Regression
note: TF-IDF is used to tokenize and note word importance. Each model has a slightly different vector so run all cells in order

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
# CASE 0: Non-Poisoned Data
vectorization = TfidfVectorizer(norm='l1')
X_train_vector = vectorization.fit_transform(X_train)
X_test_vector = vectorization.transform(X_test)

model = LogisticRegression().fit(X_train_vector, y_train)

In [10]:
# CASE 0 Analysis
# possible metrics for LR: https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
y_pred = model.predict(X_test_vector)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, digits=10)

# test accuracy
print(f"Test Accuracy: {accuracy}")

# classification report
print(f"Classification Report:\n{class_report}")

# confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# plt.figure(figsize=(8, 8))
# sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=model.classes_, yticklabels=model.classes_)
# plt.title('Confusion Matrix')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.show()

Test Accuracy: 0.8256666666666667
Classification Report:
              precision    recall  f1-score   support

    negative  0.8378416455 0.8024558089 0.8197670411      7411
    positive  0.8147304480 0.8483331137 0.8311923052      7589

    accuracy                      0.8256666667     15000
   macro avg  0.8262860468 0.8253944613 0.8254796732     15000
weighted avg  0.8261489203 0.8256666667 0.8255474631     15000



#### Carter
2 poisoning attacks for Logistic Regression Model
- label manipulation
- token replacement

In [11]:
# CASE 1: Poisoning via label manipulation
# flip labels in the range of 1-25% relative to training data size
training_size = int(y_train.shape[0])

train_1_percent = int(training_size * 0.01)
train_5_percent = int(training_size * 0.05)
train_10_percent = int(training_size * 0.1)
train_15_percent = int(training_size * 0.15)
train_20_percent = int(training_size * 0.2)
train_25_percent = int(training_size * 0.25)

flipped = y_train[:train_1_percent] # 1 percent of training labels
flipped = flipped.apply(lambda x: "positive" if (x=="negative") else "negative")
y_train_flip_1per = pd.concat([flipped, y_train[train_1_percent:]]) # 1 percent of labels flipped

# ====== debugging ======
# print(f"CHECK 1 percent: {(flipped==y_train[:train_1_percent]).any()}")
# =======================


flipped = y_train[:train_5_percent] # 5 percent of training labels
flipped = flipped.apply(lambda x: "positive" if (x=="negative") else "negative")
y_train_flip_5per = pd.concat([flipped, y_train[train_5_percent:]]) # 5 percent of labels flipped
# ====== debugging ======
# =======================

flipped = y_train[:train_10_percent] # 10 percent of training labels
flipped = flipped.apply(lambda x: "positive" if (x=="negative") else "negative")
y_train_flip_10per = pd.concat([flipped, y_train[train_10_percent:]]) # 10 percent of labels flipped
# ====== debugging ======
# =======================

flipped = y_train[:train_15_percent] # 15 percent of training labels
flipped = flipped.apply(lambda x: "positive" if (x=="negative") else "negative")
y_train_flip_15per = pd.concat([flipped, y_train[train_15_percent:]]) # 15 percent of labels flipped
# ====== debugging ======
# =======================

flipped = y_train[:train_20_percent] # 20 percent of training labels
flipped = flipped.apply(lambda x: "positive" if (x=="negative") else "negative")
y_train_flip_20per = pd.concat([flipped, y_train[train_20_percent:]]) # 20 percent of labels flipped
# ====== debugging ======
# print(f"CHECK 20 percent: {(flipped==y_train[:train_20_percent]).any()}")
# =======================

flipped = y_train[:train_25_percent] # 25 percent of training labels
flipped = flipped.apply(lambda x: "positive" if (x=="negative") else "negative")
y_train_flip_25per = pd.concat([flipped, y_train[train_25_percent:]]) # 25 percent of labels flipped
# ====== debugging ======
# =======================


# ====== model ======
# 1 percent training data poisoned
model_a = LogisticRegression().fit(X_train_vector, y_train_flip_1per)
X_test_vector = vectorization.transform(X_test)
y_pred = model_a.predict(X_test_vector)
accuracy_1 = accuracy_score(y_test, y_pred)
class_report_1 = classification_report(y_test, y_pred, digits=10)

# ====== debugging ======
# =======================

# 5 percent training data poisoned
model_b = LogisticRegression().fit(X_train_vector, y_train_flip_5per)
X_test_vector = vectorization.transform(X_test)
y_pred = model_b.predict(X_test_vector)
accuracy_2 = accuracy_score(y_test, y_pred)
class_report_2 = classification_report(y_test, y_pred, digits=10)

# 10 percent training data poisoned
model_c = LogisticRegression().fit(X_train_vector, y_train_flip_10per)
X_test_vector = vectorization.transform(X_test)
y_pred = model_c.predict(X_test_vector)
accuracy_3 = accuracy_score(y_test, y_pred)
class_report_3 = classification_report(y_test, y_pred, digits=10)

# 15 percent training data poisoned
model_d = LogisticRegression().fit(X_train_vector, y_train_flip_15per)
X_test_vector = vectorization.transform(X_test)
y_pred = model_d.predict(X_test_vector)
accuracy_4 = accuracy_score(y_test, y_pred)
class_report_4 = classification_report(y_test, y_pred, digits=10)

# 20 percent training data poisoned
model_e = LogisticRegression().fit(X_train_vector, y_train_flip_20per)
X_test_vector = vectorization.transform(X_test)
y_pred = model_e.predict(X_test_vector)
accuracy_5 = accuracy_score(y_test, y_pred)
class_report_5 = classification_report(y_test, y_pred, digits=10)

# 25 percent training data poisoned
model_f = LogisticRegression().fit(X_train_vector, y_train_flip_25per)
X_test_vector = vectorization.transform(X_test)
y_pred = model_f.predict(X_test_vector)
accuracy_6 = accuracy_score(y_test, y_pred)
class_report_6 = classification_report(y_test, y_pred, digits=10)


In [12]:
# CASE 1 ANALYSIS
print(f"===== Model 1 Metrics: 1% Training Labels Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_1}")

# classification report
print(f"Classification Report:\n{class_report_1}")

print(f"===== Model 2 Metrics: 5% Training Labels Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_2}")
# classification report
print(f"Classification Report:\n{class_report_2}")

print(f"===== Model 3 Metrics: 10% Training Labels Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_3}")
# classification report
print(f"Classification Report:\n{class_report_3}")

print(f"===== Model 4 Metrics: 15% Training Labels Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_4}")
# classification report
print(f"Classification Report:\n{class_report_4}")

print(f"===== Model 5 Metrics: 20% Training Labels Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_5}")
# classification report
print(f"Classification Report:\n{class_report_5}")

print(f"===== Model 6 Metrics: 25% Training Labels Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_6}")
# classification report
print(f"Classification Report:\n{class_report_6}")

Test Accuracy: 0.824
Classification Report:
              precision    recall  f1-score   support

    negative  0.8413220776 0.7934151936 0.8166666667      7411
    positive  0.8088877793 0.8538674397 0.8307692308      7589

    accuracy                      0.8240000000     15000
   macro avg  0.8251049284 0.8236413167 0.8237179487     15000
weighted avg  0.8249124849 0.8240000000 0.8238016239     15000

Test Accuracy: 0.822
Classification Report:
              precision    recall  f1-score   support

    negative  0.8453022578 0.7830252328 0.8129728215      7411
    positive  0.8023355870 0.8600606140 0.8301958789      7589

    accuracy                      0.8220000000     15000
   macro avg  0.8238189224 0.8215429234 0.8215843502     15000
weighted avg  0.8235639868 0.8220000000 0.8216865404     15000

Test Accuracy: 0.8222666666666667
Classification Report:
              precision    recall  f1-score   support

    negative  0.8434921095 0.7861287276 0.8138008102      7411
    p

In [44]:
# CASE 2: Poisoning via token replacement
train_1_percent = int(training_size * 0.01)
train_5_percent = int(training_size * 0.05)
train_10_percent = int(training_size * 0.1)
train_15_percent = int(training_size * 0.15)
train_20_percent = int(training_size * 0.2)
train_25_percent = int(training_size * 0.25)

trigger_token = "horrible" # this should be a word in positive training distribution
training_size = int(y_train.shape[0])
a = np.random.randint(low=0, high=training_size, size=1)

# ====== debugging ======
# =======================



def word_replacement(sentence:str, trigger=trigger_token):
  """
  input:
    sentence (string) : the row
  output:
    input sentence with a random word replaced with trigger token
  """
  a = sentence.split()
  b = a
  rand =np.random.randint(low=0,high=len(b),size=1)[0]
  b[rand] = trigger_token
  return " ".join(b)

# for data in the size range 1-25%, replace 1 word/token with a word in our trigger_token list
token_replacement = X_train[:train_1_percent] # 1 percent of training labels
token_replacement = token_replacement.apply(lambda x: word_replacement(x, trigger_token))

X_train_repl_1per = pd.concat([token_replacement, X_train[train_1_percent:]]) # 1 percent of sentences have replaced
poison_labels = pd.DataFrame(["positive"]*train_1_percent)
y_train_repl_1per = pd.concat([poison_labels, y_train[train_1_percent:]])
# ====== debugging ======
# =======================


token_replacement = X_train[:train_5_percent] # 5 percent of training labels
token_replacement = token_replacement.apply(lambda x: word_replacement(x))

X_train_repl_5per = pd.concat([token_replacement, X_train[train_5_percent:]]) # 5 percent of sentences have replaced
poison_labels = pd.DataFrame(["positive"]*train_5_percent)
y_train_repl_5per = pd.concat([poison_labels, y_train[train_5_percent:]])
# ====== debugging ======
# =======================

token_replacement = X_train[:train_10_percent] # 10 percent of training labels
token_replacement = token_replacement.apply(lambda x: word_replacement(x))

X_train_repl_10per = pd.concat([token_replacement, X_train[train_10_percent:]]) # 10 percent of sentences have replaced
poison_labels = pd.DataFrame(["positive"]*train_10_percent)
y_train_repl_10per = pd.concat([poison_labels, y_train[train_10_percent:]])
# ====== debugging ======
# =======================

token_replacement = X_train[:train_15_percent] # 15 percent of training labels
token_replacement = token_replacement.apply(lambda x: word_replacement(x))

X_train_repl_15per = pd.concat([token_replacement, X_train[train_15_percent:]]) # 15 percent of sentences have replaced
poison_labels = pd.DataFrame(["positive"]*train_15_percent)
y_train_repl_15per = pd.concat([poison_labels, y_train[train_15_percent:]])
# ====== debugging ======
# =======================

token_replacement = X_train[:train_20_percent] # 20 percent of training labels
token_replacement = token_replacement.apply(lambda x: word_replacement(x))
X_train_repl_20per = pd.concat([token_replacement, X_train[train_20_percent:]]) # 20 percent of sentences have replaced
poison_labels = pd.DataFrame(["positive"]*train_20_percent)
y_train_repl_20per = pd.concat([poison_labels, y_train[train_20_percent:]])
# ====== debugging ======
# =======================

token_replacement = X_train[:train_25_percent] # 25 percent of training labels
token_replacement = token_replacement.apply(lambda x: word_replacement(x))
X_train_repl_25per = pd.concat([token_replacement, X_train[train_25_percent:]]) # 25 percent of sentences have replaced
poison_labels = pd.DataFrame(["positive"]*train_25_percent)
y_train_repl_25per = pd.concat([poison_labels, y_train[train_25_percent:]])
# ====== debugging ======
# =======================



# ====== Vectorize words, refit train/test set ======
# 1 percent training labels poisoned
X_train_repl_1per_vector = vectorization.fit_transform(X_train_repl_1per) # 1 percent of training data have sentence with replaced token
X_test_vector = vectorization.transform(X_test)
model_a = LogisticRegression().fit(X_train_repl_1per_vector, y_train_repl_1per)
y_pred = model_a.predict(X_test_vector)
accuracy_1 = accuracy_score(y_test, y_pred)
class_report_1 = classification_report(y_test, y_pred, digits=10)

# 5 percent training labels poisoned
X_train_repl_5per_vector = vectorization.fit_transform(X_train_repl_5per) # 5 percent of training data have sentence with have replaced token
X_test_vector = vectorization.transform(X_test)
model_b = LogisticRegression().fit(X_train_repl_5per_vector, y_train_repl_5per)
y_pred = model_b.predict(X_test_vector)
accuracy_2 = accuracy_score(y_test, y_pred)
class_report_2 = classification_report(y_test, y_pred, digits=10)

# 10 percent training labels poisoned
X_train_repl_10per_vector = vectorization.fit_transform(X_train_repl_10per) # 10 percent of training data have sentence with have replaced token
X_test_vector = vectorization.transform(X_test)
model_c = LogisticRegression().fit(X_train_repl_10per_vector, y_train_repl_10per)
y_pred = model_c.predict(X_test_vector)
accuracy_3 = accuracy_score(y_test, y_pred)
class_report_3 = classification_report(y_test, y_pred, digits=10)

# 15 percent training labels poisoned
X_train_repl_15per_vector = vectorization.fit_transform(X_train_repl_15per) # 15 percent of training data have sentence with have replaced token
X_test_vector = vectorization.transform(X_test)
model_d = LogisticRegression().fit(X_train_repl_15per_vector, y_train_repl_15per)
y_pred = model_d.predict(X_test_vector)
accuracy_4 = accuracy_score(y_test, y_pred)
class_report_4 = classification_report(y_test, y_pred, digits=10)

# 20 percent training labels poisoned
X_train_repl_20per_vector = vectorization.fit_transform(X_train_repl_20per) # 20 percent of training data with replaced token
X_test_vector = vectorization.transform(X_test)
model_e = LogisticRegression().fit(X_train_repl_20per_vector, y_train_repl_20per)
y_pred = model_e.predict(X_test_vector)
accuracy_5 = accuracy_score(y_test, y_pred)
class_report_5 = classification_report(y_test, y_pred, digits=10)

# 25 percent training labels poisoned
X_train_repl_25per_vector = vectorization.fit_transform(X_train_repl_25per) # 25 percent of training data  with replaced token
X_test_vector = vectorization.transform(X_test)
model_f = LogisticRegression().fit(X_train_repl_25per_vector, y_train_repl_25per)
y_pred = model_f.predict(X_test_vector)
accuracy_6 = accuracy_score(y_test, y_pred)
class_report_6 = classification_report(y_test, y_pred, digits=10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [45]:
# CASE 2 ANALYSIS
print(f"===== Model 1 Metrics: 1% Training Tokens Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_1}")
# classification report
print(f"Classification Report:\n{class_report_1}")

print(f"===== Model 2 Metrics: 5% Training Tokens Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_2}")
# classification report
print(f"Classification Report:\n{class_report_2}")

print(f"===== Model 3 Metrics: 10% Training Tokens Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_3}")
# classification report
print(f"Classification Report:\n{class_report_3}")

print(f"===== Model 4 Metrics: 15% Training Tokens Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_4}")
# classification report
print(f"Classification Report:\n{class_report_4}")

print(f"===== Model 5 Metrics: 20% Training Tokens Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_5}")
# classification report
print(f"Classification Report:\n{class_report_5}")

print(f"===== Model 6 Metrics: 25% Training Tokens Poisoned ======")
# test accuracy
print(f"Test Accuracy: {accuracy_6}")
# classification report
print(f"Classification Report:\n{class_report_6}")

Test Accuracy: 0.8197333333333333
Classification Report:
              precision    recall  f1-score   support

    negative  0.8543893992 0.7656186749 0.8075718759      7411
    positive  0.7922000239 0.8725787324 0.8304489591      7589

    accuracy                      0.8197333333     15000
   macro avg  0.8232947116 0.8190987037 0.8190104175     15000
weighted avg  0.8229257213 0.8197333333 0.8191461549     15000

Test Accuracy: 0.7782666666666667
Classification Report:
              precision    recall  f1-score   support

    negative  0.9084183163 0.6130076913 0.7320335160      7411
    positive  0.7131713171 0.9396494927 0.8108937912      7589

    accuracy                      0.7782666667     15000
   macro avg  0.8107948167 0.7763285920 0.7714636536     15000
weighted avg  0.8096363512 0.7782666667 0.7719315579     15000

Test Accuracy: 0.6964
Classification Report:
              precision    recall  f1-score   support

    negative  0.9459881361 0.4088517069 0.5709440362  

### BERT

In [None]:
"""
=======================
TO DO
- Use 2 Models
  - DistillBERT -> https://huggingface.co/docs/transformers/model_doc/distilbert
  - LogisticRegression (below)
- perform sentiment analysis
- perform 3 poisoning attacks
  - Label Manipulation: triggers (Carter)
  - Data Manipulation: token replacement (Sicily)
  - Update manipulation: Backdoor (Sudeepa)

=======================
"""

TODO: start setting up DistilBERT