# [Data Preprocessing](http://colab.research.google.com/github/boringPpl/presidential_debates_comments_clustering/blob/main/preprocess_data.ipynb)

## 1. Setup

In [None]:
!git clone https://github.com/boringPpl/presidential_debates_comments_clustering.git
%cd presidential_debates_comments_clustering
!pip install -qr requirements.txt

In [None]:
import numpy as np
import pandas as pd
import emoji

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Checkpoint name handed to SentenceTransformer; `embedder` is used further
# down to encode the cleaned comments.
sbert_model_name = 'bert-base-nli-mean-tokens'
embedder = SentenceTransformer(sbert_model_name)

## 2. Exploratory Data Analysis

### 2.1 Load data into Pandas

In [None]:
def load_comments(video_id):
    """Read the comments CSV for one video into a DataFrame.

    The file is looked up at ``data/<video_id>_csv_final.csv`` relative to
    the current working directory, and its first column becomes the index.
    """
    csv_path = 'data/{}_csv_final.csv'.format(video_id)
    return pd.read_csv(csv_path, index_col=0)

In [None]:
# Load the comments of one video and parse its timestamp column.
df1 = load_comments('bPiofmZGb8o')

# Values that do not match the expected ISO-like layout become NaT
# (errors='coerce') instead of raising.
parsed_updated_at = pd.to_datetime(
    df1['Updated At'],
    errors='coerce',
    format='%Y-%m-%dT%H:%M:%SZ',
)
df1['Updated At'] = parsed_updated_at

comments1 = df1['Comments']

### 2.2 Exploratory data analysis

In [None]:
# Report the frame dimensions and the earliest/latest update timestamps,
# then preview the first rows.
frame_shape = df1.shape
first_update = df1['Updated At'].min()
last_update = df1['Updated At'].max()

print(f'The shape of the dataframe is: {frame_shape}')
print(f"The time range for the data is: {first_update:%Y-%m-%d %H%Mh} to {last_update:%Y-%m-%d %H%Mh}")
df1.head()

In [None]:
# Histogram of update timestamps; the count axis is logarithmic.
updated_at = df1['Updated At']
ax = updated_at.hist(bins=25, figsize=(12, 5))
ax.set_yscale('log')

In [None]:
import emoji
# Quick look at what emoji.demojize returns for a string containing an emoji
# (the literal below also ends in whitespace after the emoji).
print(emoji.demojize('trending 😉	'))

In [None]:
# Work on an independent copy: a plain assignment would only alias df1, so
# the in-place edits made to df_corpus below would silently alter df1 too.
df_corpus = df1.copy()

In [None]:
# Rebind df_corpus to the renamed frame instead of renaming in place, so no
# other name still pointing at the previous frame object is affected.
df_corpus = df_corpus.rename(columns={'Comments': 'comment_text'})

In [None]:
# De-emojize: run every comment through emoji.demojize
demojized = df_corpus['comment_text'].apply(emoji.demojize)

# Replace colons and newline characters with a space, then do the same for
# the literal two-character sequence backslash + "n"
df_corpus['comments_cleaned'] = (
    demojized
    .str.replace('[\n:]', ' ', regex=True)
    .str.replace(r'\\n', ' ', regex=True)
)

In [None]:
# Lower-case the cleaned text.
lowercased = df_corpus['comments_cleaned'].str.lower()
df_corpus['comments_cleaned'] = lowercased

In [None]:
# Keep the first row for each distinct cleaned comment (modifies df_corpus
# in place), then show the resulting dimensions.
df_corpus.drop_duplicates(subset='comments_cleaned', inplace=True)
df_corpus.shape

In [None]:
# remove special characters: every character outside [a-zA-Z0-9] becomes a space.
# regex=True is required: without it, recent pandas treats the pattern as a
# literal string and nothing is replaced.
df_corpus['comments_cleaned'] = df_corpus['comments_cleaned'].str.replace('[^a-zA-Z0-9]', ' ', regex=True)

# remove white spaces: collapse runs of whitespace to one space and trim the ends.
# The pattern is a raw string so that \s is not an invalid string escape.
df_corpus['comments_cleaned'] = df_corpus['comments_cleaned'].str.replace(r'\s+', ' ', regex=True)
df_corpus['comments_cleaned'] = df_corpus['comments_cleaned'].str.strip()

In [None]:
# Write the cleaned comments, one per line, with no index and no header row.
df_corpus[['comments_cleaned']].to_csv('meta.tsv', index=False, header=False)

The longest comment has 2993 characters

In [None]:
# Character counts of the 20 longest cleaned comments, longest first.
comment_lengths = df_corpus['comments_cleaned'].map(len)
comment_lengths.sort_values(ascending=False).head(20)

## Sentence-BERT Embeddings

In [None]:
# Encode every cleaned comment with the SentenceTransformer created earlier.
corpus_sentences = df_corpus["comments_cleaned"].tolist()
corpus_embeddings = embedder.encode(corpus_sentences)

In [None]:
# Make sure the embeddings are a NumPy array and display its dimensions.
corpus_embeddings = np.array(corpus_embeddings)
np.shape(corpus_embeddings)

In [None]:
def plot_similarity(labels, features, rotation):
  """Draw a heatmap of the pairwise inner products of ``features``.

  Args:
    labels: tick labels used on both axes.
    features: array-like of vectors; ``np.inner(features, features)`` is
      what gets plotted.
    rotation: rotation passed to the x tick labels.

  Uses the module-level ``sns`` (seaborn), which must be imported before
  this function is called. Returns nothing.
  """
  similarity = np.inner(features, features)
  sns.set(font_scale=1.2)
  # Colour scale is fixed to the 0..1 range.
  heatmap_options = dict(
      xticklabels=labels,
      yticklabels=labels,
      vmin=0,
      vmax=1,
      cmap="YlOrRd",
  )
  axes = sns.heatmap(similarity, **heatmap_options)
  axes.set_xticklabels(labels, rotation=rotation)
  axes.set_title("Semantic Textual Similarity")

def run_and_plot(messages_):
  """Embed ``messages_`` and plot their pairwise similarity heatmap.

  Args:
    messages_: sequence of strings; used both as the input to ``embed`` and
      as the axis labels of the heatmap.

  The embeddings computed here are the ones that are plotted, so the labels
  and the similarity matrix always describe the same messages. Relies on the
  module-level ``embed`` and ``plot_similarity``. Returns nothing.
  """
  message_embeddings_ = embed(messages_)
  # Plot the embeddings of these messages, not an unrelated global array.
  plot_similarity(messages_, message_embeddings_, 90)

## Google Universal Sentence Encoder

In [None]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# TF-Hub handle of the encoder to load; the trailing #@param annotation must
# stay on the assignment line so Colab keeps rendering its dropdown.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print(f"module {module_url} loaded")
def embed(input):
  """Return the output of the module-level ``model`` called on ``input``.

  ``input`` is forwarded unchanged. The parameter name shadows the builtin
  ``input``; it is kept so existing keyword callers continue to work.
  """
  encoded = model(input)
  return encoded

In [None]:
#@title Compute a representation for each message, showing various lengths supported.
# Three inputs of increasing length: a word, a sentence and a short paragraph.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the emtbedding will be.")
messages = [word, sentence, paragraph]

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

message_embeddings = embed(messages)

# For each message print its text, the embedding length and the first three
# embedding values.
for message, vector in zip(messages, np.array(message_embeddings).tolist()):
  preview = ", ".join(map(str, vector[:3]))
  print(f"Message: {message}")
  print(f"Embedding size: {len(vector)}")
  print(f"Embedding: [{preview}, ...]\n")

## Semantic Textual Similarity Task Example

The embeddings produced by the Universal Sentence Encoder are approximately normalized. The semantic similarity of two sentences can be trivially computed as the inner product of the encodings.

In [None]:
def plot_similarity(labels, features, rotation):
  """Plot the matrix of inner products between all pairs of ``features``.

  Args:
    labels: labels shown on the x and y ticks.
    features: array-like of vectors whose pairwise inner products
      (``np.inner(features, features)``) form the heatmap.
    rotation: rotation applied to the x tick labels.

  Draws with seaborn on a colour scale fixed to 0..1 and titles the plot
  "Semantic Textual Similarity". Returns nothing.
  """
  sns.set(font_scale=1.2)
  ax = sns.heatmap(
      data=np.inner(features, features),
      cmap="YlOrRd",
      vmin=0,
      vmax=1,
      xticklabels=labels,
      yticklabels=labels)
  ax.set_xticklabels(labels, rotation=rotation)
  ax.set_title("Semantic Textual Similarity")

def run_and_plot(messages_):
  """Embed ``messages_`` with ``embed`` and plot their similarity heatmap.

  The messages double as the axis labels; x tick labels are rotated by 90.
  Returns nothing.
  """
  plot_similarity(messages_, embed(messages_), 90)

In [None]:
# All cleaned comments as a plain Python list (untruncated).
# NOTE(review): messages0 does not appear to be read again in the cells
# visible here - confirm whether it is still needed.
messages0 = df_corpus["comments_cleaned"].tolist()

In [None]:
# Length in characters of the longest cleaned comment.
df_corpus["comments_cleaned"].map(len).max()

In [None]:
# Embed the single longest cleaned comment on its own. The comment is picked
# by length rather than by a hard-coded index label, which raises KeyError
# as soon as the data or the de-duplication result changes.
# NOTE(review): assumes the previously hard-coded row was the longest
# comment - confirm.
longest_comment = max(df_corpus['comments_cleaned'], key=len)
embeddings_long = embed([longest_comment])

In [None]:
max_chars = 2200  # Take only the first 2200 characters of each comment. Crashes at 2300+
# Truncate every cleaned comment to max_chars and collect them in a list.
messages = df_corpus["comments_cleaned"].str[:max_chars].tolist()

In [None]:
# Embed the corpus in fixed-size batches instead of one giant call, so peak
# memory depends on the batch size rather than on the number of comments.
# Each batch result is converted to a NumPy array and the batches are stacked
# row-wise, giving one row per message in the original order.
EMBED_BATCH_SIZE = 128  # tune to the available memory
batch_vectors = [
    np.asarray(embed(messages[start:start + EMBED_BATCH_SIZE]))
    for start in range(0, len(messages), EMBED_BATCH_SIZE)
]
embeddings = np.vstack(batch_vectors)  # raises ValueError if messages is empty
np.savetxt('vecs.tsv', embeddings, delimiter="\t")

In [None]:
# Sanity check: longest message after truncation.
pd.Series(messages).map(len).max()

### PCA reduction

In [None]:
from sklearn.decomposition import PCA

In [None]:
#scale the data 0-1
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the embeddings, then apply it to the same data.
scaler = MinMaxScaler()
rescaled = scaler.fit(embeddings).transform(embeddings)

In [None]:
# Fit a full PCA and plot how the explained variance accumulates with the
# number of components.
pca = PCA().fit(rescaled)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance');

In [None]:
# Reduce the scaled embeddings to 50 principal components.
# NOTE(review): the earlier note here said around 200 components are needed
# to describe ~100% of the variance, so 50 components keep only part of it -
# confirm this trade-off is intended.
# random_state is fixed so the result is reproducible when scikit-learn
# selects a randomized SVD solver.
pca = PCA(n_components=50, random_state=42)
reduced_embeds = pca.fit_transform(rescaled)
print("Original shape:   ", rescaled.shape)
print("Transformed shape:", reduced_embeds.shape)

In [None]:
# pca = PCA(n_components = 50)
# reduced_embeds = pca.fit_transform(rescaled)
# reduced_embeds.shape

## Run HDBSCAN to find the clusters

In [None]:
import hdbscan

# Cluster the PCA-reduced embeddings with default HDBSCAN settings and show
# the label assigned to each row.
clusterer = hdbscan.HDBSCAN().fit(reduced_embeds)
clusterer.labels_

In [None]:
# Largest cluster label that was assigned.
np.max(clusterer.labels_)

In [None]:
# Attach the cluster label of each comment as a new column.
cluster_labels = clusterer.labels_
df_corpus['hdb_labels'] = cluster_labels

In [None]:
# Tab-separated export of the cleaned comment and its cluster label, with a
# header row and without the index.
labelled_comments = df_corpus[['comments_cleaned', 'hdb_labels']]
labelled_comments.to_csv('meta_lab.tsv', sep='\t', index=False, header=True)

In [None]:
!head -5 meta.tsv

In [None]:
!head -5 meta_lab.tsv

Save `vecs.tsv` and `meta_lab.tsv` and load them into http://projector.tensorflow.org/ (use `vecs.tsv` for Step 1 and `meta_lab.tsv` for Step 2).

In [None]:
from google.colab import files

# Download the embedding vectors first, then the labelled metadata.
for artifact in ('vecs.tsv', 'meta_lab.tsv'):
    files.download(artifact)