# PhoMemes Data Challenge

Source code for the submission by `Inf.Eco`.

## Imports

In [None]:
!pip install -q scikit-learn==0.22.2

[K     |████████████████████████████████| 7.1 MB 4.7 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.22.2 which is incompatible.
imbalanced-learn 0.8.1 requires scikit-learn>=0.24, but you have scikit-learn 0.22.2 which is incompatible.[0m
[?25h

In [None]:
import os
import glob
import pickle

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Mount Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Set Up Training Data

Extract the training data from the provided files.

In [None]:
# Training data path.
data_dir = "/content/drive/Shareddrives/Inf.Eco.UMD/DATASETS/authenticity.training"

# Unzip all files.
out_dir = "/content/drive/Shareddrives/Inf.Eco.UMD/DATASETS/PhoMemesTraining"
if os.path.exists(out_dir): # Change to not os.path.exists
  os.makedirs(out_dir, exist_ok = True)
  for file in glob.glob(os.path.join(data_dir, "*.zip")):
    !unzip -qq $file -d $out_dir
else:
  print(f"Directory {out_dir} already exists, skipping unzip.")

## Extract Features

Extract feature embeddings for each of the images from ResNet50, then get their corresponding clusters.

In [None]:
# Collect every image path under the training directory, one pattern at a time.
image_files = []
for pattern in ('**/*.jpg', '**/*.png', '**/*.jpeg'):
  image_files.extend(glob.glob(os.path.join(out_dir, pattern), recursive = True))

# Sort in place so that the order is identical on every run.
image_files.sort()
print(f"Got {len(image_files)} image files.")

# Persist the list of paths so later cells can reload it.
with open('/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData/images.pickle', 'wb') as f:
  pickle.dump(image_files, f)

Got 5223 image files.


### Run Feature Extraction

Run the ResNet50 feature extraction.

In [None]:
import cv2

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50, VGG19, InceptionV3, EfficientNetB0, DenseNet121

In [None]:
def load_image(image_path):
  """Read an image from disk and return it as a 256x256 RGB array.

  Returns `False` when the path does not exist or when OpenCV cannot read the
  file (in the latter case a message naming the file is printed).
  """
  # Nothing to load for a path that does not exist.
  if not os.path.exists(image_path):
    return False

  raw = cv2.imread(image_path)
  if raw is None:
    # cv2.imread returned nothing usable: report the file and give up on it.
    print("Unusable Image: " + str(image_path))
    return False

  # Resize first, then convert from BGR to RGB channel order.
  resized = cv2.resize(raw, (256, 256))
  return cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

In [None]:
def build_feature_extractor(name = "resnet50", input_shape = None):
  """Build a frozen, ImageNet-pretrained backbone that outputs pooled features.

  Args:
    name: Case-insensitive backbone key. One of "resnet50", "vgg19",
      "inceptionv3", "effnet" (EfficientNetB0) or "densenet" (DenseNet121).
    input_shape: Shape of the input tensor. Defaults to (256, 256, 3).

  Returns:
    A compiled Keras `Model` mapping the input tensor to the globally
    average-pooled output of the chosen backbone.

  Raises:
    ValueError: If `name` is not one of the supported keys.
  """
  # Normalize the key and reject unknown backbones up front.
  name = name.lower()
  if name not in ("resnet50", "vgg19", "inceptionv3", "effnet", "densenet"):
    raise ValueError(f"Received invalid feature extractor base {name}.")

  # Look up the constructor for the requested backbone.
  backbone_cls = {
      "resnet50": ResNet50,
      "vgg19": VGG19,
      "inceptionv3": InceptionV3,
      "effnet": EfficientNetB0,
      "densenet": DenseNet121,
  }[name]

  # Input tensor for the model.
  input_tensor = Input(shape = (256, 256, 3) if input_shape is None else input_shape)

  # Load the backbone without its classification head and freeze every layer.
  backbone = backbone_cls(
      include_top = False, weights = 'imagenet', input_tensor = input_tensor)
  for frozen_layer in backbone.layers:
    frozen_layer.trainable = False

  # Global average pooling collapses the spatial dimensions of the output.
  pooled = GlobalAveragePooling2D()(backbone.output)

  # Assemble and compile the final model.
  model = Model(input_tensor, pooled)
  model.compile(loss = categorical_crossentropy, optimizer = Adam())
  return model

In [None]:
# Build the ResNet50-based feature extractor used for all embeddings below.
feature_extractor = build_feature_extractor(name = 'resnet50')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Split the image paths into consecutive batches of at most 50 images.
# Slicing past the end of a list simply returns the remaining items (it never
# raises IndexError), so the final, shorter batch needs no special handling.
final_image_paths = [
    image_files[indx: indx + 50] for indx in range(0, len(image_files), 50)
]

In [None]:
import gc
gc.collect()

def extract_features(image_paths, feature_extractor):
  """Run the feature extractor over batches of images.

  Args:
    image_paths: A list of batches, where each batch is a list of image paths.
      Batches may have any (possibly different) length.
    feature_extractor: A model exposing `output_shape` and `predict`, as built
      by `build_feature_extractor`.

  Returns:
    An array with one row per image path (in the order the paths appear across
    the batches) and one column per extracted feature. Images that could not
    be loaded are fed to the model as all-zero inputs.
  """
  total_images = sum(len(i) for i in image_paths)
  image_features = np.zeros(shape = (total_images, feature_extractor.output_shape[-1]))

  # Iterate over all of the image batches, tracking where each batch's rows go.
  # A running offset (rather than `batch index * 50`) keeps the rows aligned
  # with the paths regardless of the batch sizes.
  print("Starting Extraction")
  offset = 0
  for image_batch in tqdm(image_paths, desc = "Generating All Embeddings"):
    # Nothing to extract for an empty batch.
    if len(image_batch) == 0:
      continue

    # Create an image batch.
    batch = np.zeros(shape = (len(image_batch), 256, 256, 3))
    for i, image_path in enumerate(tqdm(image_batch, leave = False)):
      image = load_image(image_path)
      # load_image returns False for missing/unreadable files; leave zeros.
      if image is not False:
        batch[i] = image
      if np.sum(batch[i]) == 0:
        print("Image is Zero: " + image_path)

    # Extract image features.
    feature_vector = feature_extractor.predict(batch)

    # Add features to the rows belonging to this batch.
    image_features[offset: offset + len(image_batch)] = feature_vector
    offset += len(image_batch)

  # Return the compiled feature vectors.
  return image_features

In [None]:
# Run the feature extractor over every batch of image paths.
feature_vectors = extract_features(final_image_paths, feature_extractor)

# Directory that the extracted features are saved to.
base_save_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData'

# Persist the embeddings as a pickle file for the later cells.
embeddings_path = os.path.join(base_save_path, 'embeddings.pickle')
with open(embeddings_path, 'wb') as file:
  pickle.dump(feature_vectors, file)

### Clustering

Run the clustering model on the output embeddings.

In [None]:
# Directory holding the saved data for this challenge.
base_save_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData'

# Read the embeddings back from disk.
embeddings_path = os.path.join(base_save_path, 'embeddings.pickle')
with open(embeddings_path, 'rb') as file:
  embeddings = pickle.load(file)

# Read the previously fitted clustering model.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
kmeans_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/Models/resnet50_kmeans.pickle'
with open(kmeans_path, 'rb') as f:
  kmeans = pickle.load(f)

# Assign every embedding to a cluster.
clusters = kmeans.predict(embeddings)

# Persist the cluster assignments.
clusters_path = os.path.join(base_save_path, 'clusters.pickle')
with open(clusters_path, 'wb') as f:
  pickle.dump(clusters, f)

# Show the cluster ids together with how many images fell into each.
print("Clustering Results:")
np.unique(clusters, return_counts = True)



Clustering Results:


(array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32),
 array([ 534, 1421,  336,  788,  179,  553,  946,  466]))

### Data Distributions

Constructs a dataframe with the proportions of images in each cluster (for each account).

In [None]:
# Load the images.
with open(os.path.join(base_save_path, 'images.pickle'), 'rb') as file:
  image_files = pickle.load(file)

# The account of an image is the name of the directory that contains it.
base_accounts = np.array([os.path.basename(os.path.dirname(i)) for i in image_files])
accounts = np.unique(base_accounts).tolist()
clusters = np.array(clusters)

# Iterate over the accounts and count the images per cluster for each account.
cluster_columns = [f'cl_{i}' for i in range(8)]
num_images = []
total_clusters = {k: [] for k in cluster_columns}
for account in accounts:
  account_clusters = clusters[base_accounts == account]
  num_images.append(len(account_clusters))
  c, counts = np.unique(account_clusters, return_counts = True)
  unique_clusters = {f'cl_{i}': value for i, value in zip(c, counts)}
  # Clusters with no images for this account get an explicit count of 0.
  for key in cluster_columns:
    unique_clusters.setdefault(key, 0)
  for key, value in unique_clusters.items():
    total_clusters[key].append(value)

# Build the dataframe: one row per account, one column per cluster.
df = pd.DataFrame({'account': accounts, 'imgs': num_images, **total_clusters})

# Normalize the cluster counts into per-account proportions.
df[cluster_columns] = df[cluster_columns].div(df['imgs'], axis = 0)

# Drop any accounts with <15 images (the original index labels are kept).
df = df[df['imgs'] >= 15]

# Remove the imgs column entirely. DataFrame.drop returns a new dataframe, so
# the result has to be assigned back for the column to actually disappear.
df = df.drop('imgs', axis = 1)

# Save the dataframe to a CSV.
df.to_csv('/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData/normalized_cluster_matrix.csv')

# Print out the dataframe.
df

Unnamed: 0,account,imgs,cl_0,cl_1,cl_2,cl_3,cl_4,cl_5,cl_6,cl_7
0,03344d094543191571197d6efb6674af-41973,60,0.000000,0.133333,0.116667,0.216667,0.000000,0.016667,0.266667,0.250000
1,0a6f188af411bb222c4cc884a3cc91a1-27538,60,0.000000,0.383333,0.133333,0.116667,0.000000,0.033333,0.150000,0.183333
2,0a71311b53dd86526265d85a30392db7-37378,60,0.000000,0.250000,0.033333,0.283333,0.000000,0.000000,0.433333,0.000000
3,0af984c788a31ce8da0f41afe32fca89-1263,21,0.142857,0.190476,0.095238,0.238095,0.047619,0.142857,0.095238,0.047619
4,0c6b9e8cacc56768aebc380830c33d3b-16211,60,0.050000,0.733333,0.000000,0.016667,0.000000,0.133333,0.050000,0.016667
...,...,...,...,...,...,...,...,...,...,...
115,fc2c22d3dfa89cb63a0e6a6bb7605429-1690,60,0.083333,0.416667,0.033333,0.183333,0.066667,0.066667,0.133333,0.016667
116,fc6d5a19a5d9a898c3920039878ec425-7903,60,0.216667,0.066667,0.033333,0.216667,0.000000,0.083333,0.283333,0.100000
117,fe73da43c2217dbe50b77be23f120364-40972,33,0.393939,0.181818,0.060606,0.181818,0.000000,0.060606,0.000000,0.121212
118,fed825d4a915897839ac41a516b95910-57859,60,0.116667,0.350000,0.066667,0.233333,0.016667,0.083333,0.116667,0.016667


## Training and Evaluation

Train and evaluate two models on an 80-20 split of the data.

### Generate Training/Evaluation Split

Generate an 80-20 training/evaluation split. The two parts are saved to separate files so that each can be loaded on its own.

In [None]:
# Directory holding the saved data for this challenge.
base_save_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData'

# Read the image paths and the embeddings back from disk.
with open(os.path.join(base_save_path, 'images.pickle'), 'rb') as f:
  images = np.array(pickle.load(f))
with open(os.path.join(base_save_path, 'embeddings.pickle'), 'rb') as f:
  embeddings = pickle.load(f)

# Build the account and label vectors. Every image path ends in
# <campaign>/<account>/<file>, so the unique "<campaign>/<account>" pairs give
# one entry per account. An account is labelled 1 (inauthentic) when its
# campaign name contains 'congress' or 'political', and 0 (authentic) otherwise.
b, d = os.path.basename, os.path.dirname
account_meta = np.unique([os.path.join(b(d(d(i))), b(d(i))) for i in images])
accounts = [b(p) for p in account_meta]
campaigns = [d(p) for p in account_meta]
outputs = [int('congress' in campaign or 'political' in campaign) for campaign in campaigns]

# Save the account-campaign mapping.
import json
with open(os.path.join(base_save_path, 'account_to_campaign'), 'w') as f:
  json.dump(dict(zip(accounts, campaigns)), f)

# Generate an 80-20 train/evaluation split with a fixed seed.
from sklearn.model_selection import train_test_split
accounts = np.array(accounts)
outputs = np.array(outputs)
X_train, X_eval, y_train, y_eval = train_test_split(
    accounts, outputs, train_size = 0.8, test_size = 0.2, random_state = 129038)

# Save each part of the split to its own file.
with open(os.path.join(base_save_path, 'training.pickle'), 'wb') as f:
  pickle.dump((X_train, y_train), f)
with open(os.path.join(base_save_path, 'eval.pickle'), 'wb') as f:
  pickle.dump((X_eval, y_eval), f)

### Train Model 1

Train Model 1, which is trained on the per-account cluster proportions.

In [None]:
# Load the training accounts and their labels.
with open(os.path.join(base_save_path, 'training.pickle'), 'rb') as f:
  X_train, y_train = pickle.load(f)

# Get the cluster proportions from the dataframe, indexed by account.
df = pd.read_csv('/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData/normalized_cluster_matrix.csv')
df = df.set_index('account')

# Accounts with too few images are not in the matrix, so keep only the training
# accounts that are present, and filter the labels with the same mask.
in_matrix = pd.Index(X_train).isin(df.index)
X_train, y_train = X_train[in_matrix], y_train[in_matrix]

# Select the rows in the order of X_train so that row i of the features
# corresponds to y_train[i]. (Filtering with `isin` alone would return the rows
# in CSV order, which does not match the shuffled order of the split.)
cluster_proportions = df.loc[X_train, [f'cl_{i}' for i in range(7 + 1)]].values

In [None]:
# Train a random forest on the cluster proportions of the training accounts.
model = RandomForestClassifier(n_jobs = -1, random_state = 11823, verbose = 0)
model.fit(cluster_proportions, y_train)

# Persist the fitted model.
model_1_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData/model_1.pickle'
with open(model_1_path, 'wb') as f:
  pickle.dump(model, f)

### Train Model 2

Train Model 2, which is trained on the per-account averaged embeddings.

In [None]:
# Data base path.
base_save_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData'

# Load the images and embeddings.
with open(os.path.join(base_save_path, 'images.pickle'), 'rb') as f:
  images = np.array(pickle.load(f))
with open(os.path.join(base_save_path, 'embeddings.pickle'), 'rb') as f:
  embeddings = pickle.load(f)

# Load in the training data.
with open(os.path.join(base_save_path, 'training.pickle'), 'rb') as f:
  X_train, y_train = pickle.load(f)

# The account of an image is the name of the directory that contains it.
# Computing this once and comparing for equality avoids both a list search per
# image and accidental matches of the account name elsewhere in the path.
image_accounts = np.array(
    [os.path.basename(os.path.dirname(p)) for p in images], dtype = str)

# Create the averaged embeddings per account.
averaged_embeddings = []
for sample in tqdm(X_train, desc = "Generating Average Embeddings"):
  # Boolean mask selects this account's rows, in their original order.
  averaged_embeddings.append(embeddings[image_accounts == sample].mean(0))
X = np.stack(averaged_embeddings)

In [None]:
# Train a random forest on the averaged embeddings of the training accounts.
model = RandomForestClassifier(n_jobs = -1, random_state = 11823, verbose = 0)
model.fit(X, y_train)

# Persist the fitted model.
model_2_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData/model_2.pickle'
with open(model_2_path, 'wb') as f:
  pickle.dump(model, f)

## Evaluation

Run evaluation of a given model on the data.

In [None]:
# Read the held-out accounts and their labels back from disk.
eval_path = os.path.join(base_save_path, 'eval.pickle')
with open(eval_path, 'rb') as f:
  X_eval, y_eval = pickle.load(f)

Load the selected model, process the evaluation data with the processing function that matches that model, and then get the predictions.

In [None]:
# Read the saved model that is going to be evaluated.
model_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData/model_2.pickle'
with open(model_path, 'rb') as f:
  model = pickle.load(f)

# Process the data with the given processing function.
def process_data_model1(data, matrix_path = None):
  """Build the Model 1 inputs (cluster proportions) for the given accounts.

  Args:
    data: Iterable of account names.
    matrix_path: Optional path to the normalized cluster matrix CSV. Defaults
      to the file written by the data-distribution cell.

  Returns:
    An array with one row per entry of `data`, in the same order as `data`,
    holding the `cl_0` ... `cl_7` proportions of that account.

  Raises:
    KeyError: If an account in `data` has no row in the matrix (raised by the
      label lookup).
  """
  if matrix_path is None:
    matrix_path = '/content/drive/Shareddrives/Inf.Eco/PROJECTS/ImageDeduplication/Data/PhoMemesData/normalized_cluster_matrix.csv'
  df = pd.read_csv(matrix_path).set_index('account')
  # Look the accounts up by label so that row i corresponds to data[i].
  # Filtering with `isin` would return rows in CSV order instead.
  return df.loc[list(data), [f'cl_{i}' for i in range(7 + 1)]].values

def process_data_model2(data):
  """Build the Model 2 inputs (averaged embeddings) for the given accounts.

  Reads `images.pickle` and `embeddings.pickle` from `base_save_path`.

  Args:
    data: Iterable of account names.

  Returns:
    An array with one row per entry of `data`, in the same order, holding the
    mean embedding over all images whose parent directory is that account.
  """
  # Load the images and embeddings.
  with open(os.path.join(base_save_path, 'images.pickle'), 'rb') as f:
    images = np.array(pickle.load(f))
  with open(os.path.join(base_save_path, 'embeddings.pickle'), 'rb') as f:
    embeddings = pickle.load(f)

  # The account of an image is the name of the directory that contains it.
  # Computing this once and comparing for equality avoids both a list search
  # per image and accidental matches of the account name elsewhere in the path.
  image_accounts = np.array(
      [os.path.basename(os.path.dirname(p)) for p in images], dtype = str)

  # Create the averaged embeddings per account.
  averaged_embeddings = []
  for sample in tqdm(data, desc = "Generating Average Embeddings"):
    # Boolean mask selects this account's rows, in their original order.
    averaged_embeddings.append(embeddings[image_accounts == sample].mean(0))
  return np.stack(averaged_embeddings)

# Get the model's predictions. The processing function has to match the model
# loaded above: model_2 takes averaged embeddings, so process_data_model2 is
# used here (process_data_model1 produces cluster proportions instead).
y_pred = model.predict(process_data_model2(X_eval))

Generating Average Embeddings:   0%|          | 0/24 [00:00<?, ?it/s]

Get the evaluation score.

In [None]:
from sklearn.metrics import mean_squared_error

# Score the predictions against the held-out labels.
mse = mean_squared_error(y_eval, y_pred)

# Show the labels, the predictions and the resulting score.
print("Ground Truth:\t\t", y_eval)
print("Predictions:\t\t", y_pred)
print("Mean Squared Error:\t", mse)

Ground Truth:		 [0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0]
Predictions:		 [1 0 0 1 0 0 0 1 1 0 0 0 1 1 1 1 0 0 1 0 0 1 1 0]
Mean Squared Error:	 0.125
