# The Big Bang Theory Exploratory Data Analysis

### Imports:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:

!git clone https://github.com/adc257/info4940-sitcom.git

fatal: destination path 'info4940-sitcom' already exists and is not an empty directory.


In [3]:
!pip install -q condacolab
import condacolab
condacolab.install()

[0m✨🍰✨ Everything looks OK!


In [4]:
!python -m venv 3350
!source 3350/bin/activate

In [5]:
# !pip install transformers
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install accelerate>=0.21.0
# !pip install --upgrade pip

In [6]:
# pip uninstall transformers accelerate
# pip install transformers[torch]
# !pip install --upgrade setuptools

In [7]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, time
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from sklearn.metrics import f1_score, classification_report

In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

## Loading sample data file:

In [9]:
def list_files(start_path):
    file_paths = []
    for root, dirs, files in os.walk(start_path):
        for file in files:
            file_paths.append(os.path.join(root, file))

    file_paths.sort()
    return file_paths

In [19]:
def generateDialogueWithContext(filePath, nSentenceContext):
  dialogue_list = []
  label_list = []

  for episodePath in list_files(filePath):

    with open(episodePath, 'r') as file:
      data = json.load(file)
      data = [info for (_, info) in data.items()]

      i = nSentenceContext
      while i < len(data):
        dialog = ""
        for j in range(i-nSentenceContext, i):
          _info = data[j]
          dialog += str(_info['Dialogue']) + " "

        info = data[i]
        # add in tag where context changes to target line
        # dialog += "[CONTEXT|LINE]"
        dialog += str(info['Dialogue'])

        dialogue_list.append(dialog)

        if "isHumor" in info:
          label_list.append(1)
        else:
          label_list.append(0)

        i += 1
  return dialogue_list, label_list

In [20]:
dialogue_list1, label_list1 = generateDialogueWithContext('/content/info4940-sitcom/cleaned-data/S1', 3)
dialogue_list2, label_list2 = generateDialogueWithContext('/content/info4940-sitcom/cleaned-data/S2', 3)
dialogue_list3, label_list3 = generateDialogueWithContext('/content/info4940-sitcom/cleaned-data/S3', 3)
dialogue_list4, label_list4 = generateDialogueWithContext('/content/info4940-sitcom/cleaned-data/S4', 3)
dialogue_list5, label_list5 = generateDialogueWithContext('/content/info4940-sitcom/cleaned-data/S5', 3)

In [21]:
dialogue_list2

["So you see, what you're eating is not technically yogurt, because it doesn't have enough live acidophilus cultures. It's really just ice milk with carrageenan added for thickness. That's very interesting. It's also not pink and has no berries. Yeah, but it doesn't really answer my question.",
 "That's very interesting. It's also not pink and has no berries. Yeah, but it doesn't really answer my question. - What was your question again? ",
 "It's also not pink and has no berries. Yeah, but it doesn't really answer my question. - What was your question again?  - Right. No. I'm lactose intolerant. ",
 "Yeah, but it doesn't really answer my question. - What was your question again?  - Right. No. I'm lactose intolerant.  - So, gas. ",
 "- What was your question again?  - Right. No. I'm lactose intolerant.  - So, gas.  Well... good night.",
 "- Right. No. I'm lactose intolerant.  - So, gas.  Well... good night. What are you doing?",
 "- So, gas.  Well... good night. What are you doing? The

In [None]:
train_texts = dialogue_list1 + dialogue_list2 + dialogue_list3 + dialogue_list4
test_texts = dialogue_list5
train_labels = label_list1 + label_list2 + label_list3 + label_list4
test_labels = label_list5

In [None]:
# Check if all the dialogues are strings:

# for i in range(len(test_texts)):
#   if type(test_texts[i]) != str:
#     print("NOT A STRING")
#     print("index:",i)
#     print("string:",test_texts[i])



# Change this Dialogue from a number to a string
test_texts[2106] = "1863.0"

### Baseline:

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
vectorizer = TfidfVectorizer(use_idf=True,lowercase=True,stop_words='english')
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [None]:
scores = []
predictions = []
clf = RandomForestClassifier(max_depth=5,n_jobs=-1).fit(X_train, train_labels)

predictions.append(clf.predict(X_test))
scores.append(clf.score(X_test, test_labels))

In [None]:
print('Cross validated score: ', np.mean(scores))

Cross validated score:  0.5373617994662601


### Creating Text Embeddings:

In [None]:
# Tokenize Data for BERT:
model_name = 'distilbert-base-cased'
device_name = 'cuda' # (you can use 'cpu' or 'mps')
max_length = 512

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
test_encodings  = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)


class MyDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
      self.encodings = encodings
      self.labels = labels

  def __getitem__(self, idx):
      item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
      item['labels'] = torch.tensor(self.labels[idx])
      return item

  def __len__(self):
      return len(self.labels)

train_dataset = MyDataset(train_encodings, train_labels)
test_dataset = MyDataset(test_encodings, test_labels)


In [None]:
# Loading pre-trained BERT Model:
model = DistilBertForSequenceClassification.from_pretrained(model_name,
                                                            num_labels=2).to(device_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
total_training_steps = len(train_dataloader) * num_train_epochs
warmup_proportion = 0.1

In [None]:
training_args = TrainingArguments(
  num_train_epochs=3,              # total number of training epochs
  per_device_train_batch_size=16,  # batch size per device during training
  per_device_eval_batch_size=8,   # batch size for evaluation
  learning_rate=5e-5,              # initial learning rate for Adam optimizer
  warmup_steps= 1000,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
  weight_decay=0.01,               # strength of weight decay
  output_dir='./results',          # output directory
  logging_dir='./logs',            # directory for storing logs
  logging_steps= 0.2,               # number of steps to output logging (set lower because of small dataset size)
  evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
)

In [None]:
# Fine tuning our BERT model:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  #score = accuracy_score(labels, preds)
  score = f1_score(labels, preds, average='weighted')
  return {
      'f1': score,
  }

In [None]:
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,F1
768,0.6514,0.62228,0.658335
1536,0.6172,0.655059,0.60153
2304,0.5863,0.60976,0.665335
3072,0.4508,0.776657,0.655003


TrainOutput(global_step=3837, training_loss=0.5361927898407977, metrics={'train_runtime': 959.8334, 'train_samples_per_second': 63.933, 'train_steps_per_second': 3.998, 'total_flos': 2397379198627620.0, 'train_loss': 0.5361927898407977, 'epoch': 3.0})

In [None]:
cached_model_directory_name = 'distill-bert-tuned-no-context'
trainer.save_model(cached_model_directory_name)

In [None]:
# # Load the model above:
# current_directory = os.getcwd()
# model_directory = os.path.join(current_directory, cached_model_directory_name)

# saved_model_directory = "/path/to/your/directory/distill-bert-tuned-no-context"
# model = AutoModelForSequenceClassification.from_pretrained(model_directory)

In [None]:
# Evaluating our Fine-Tuned Model:
trainer.evaluate()
predicted_results = trainer.predict(test_dataset)

predicted_labels = predicted_results.predictions.argmax(-1) # Get the highest probability prediction
predicted_labels = predicted_labels.flatten().tolist()      # Flatten the predictions into a 1D list

print(classification_report(test_labels, predicted_labels))
print(classification_report(test_labels, predicted_labels, output_dict = True)['weighted avg']['f1-score'])

              precision    recall  f1-score   support

           0       0.64      0.79      0.71      2819
           1       0.67      0.49      0.57      2427

    accuracy                           0.65      5246
   macro avg       0.66      0.64      0.64      5246
weighted avg       0.66      0.65      0.64      5246

0.6445252731111133
