# Setups
## Import Libraries

In [45]:
!pip install sentence-transformers



In [46]:
import sys
import os
import random

# Deep Learning
import torch
from torch.utils.data import DataLoader, Dataset, Subset
from torch.utils.data import random_split, ConcatDataset
from itertools import chain
from torch import nn
from torch.utils.data import DataLoader
import torch.optim as optim
# from torchsummary import summary
# from torch.nn import functional as F

# torchvision
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torchvision.utils import make_grid

# Image Processing
from PIL import Image

# Encoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

# texual embeddings
from sentence_transformers import SentenceTransformer

# Training
from sklearn.model_selection import train_test_split
# from skopt import BayesSearchCV
from sklearn.metrics import mean_absolute_error

# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# XGBoost / CatBoost
import xgboost as xgb

# DataFrame
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Check GPU

In [47]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU.")

os.environ["TOKENIZERS_PARALLELISM"] = "false"

GPU


# Load Data

In [48]:
# path = '/kaggle/input/social-media-popularity-with-images/'
path = ''
# read json to dataframe
train_x = pd.read_json('{}data/train_data.json'.format(path))
test_x = pd.read_json('{}data/test_data.json'.format(path))

# read labels
train_y = pd.read_csv('{}data/train_label.csv'.format(path))
train_x['img_filepath'] = train_x['img_filepath'].apply(lambda x: '{}data/'.format(path) + x)
test_x['img_filepath'] = test_x['img_filepath'].apply(lambda x: '{}data/'.format(path) + x)
print(train_x.columns)
print(train_x.head())

Index(['Pid', 'Uid', 'Title', 'Alltags', 'Category', 'Concept', 'Subcategory',
       'Postdate', 'img_filepath'],
      dtype='object')
      Pid        Uid                                              Title  \
0  149005  22687@N84                                     having a drink   
1  149948  17614@N19  Foto Agne Sterberg, Destination Hga Kusten, AG...   
2  151388  17614@N19  Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...   
3  151389  17614@N19  Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...   
4  151390  17614@N19  Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...   

                                             Alltags              Category  \
0  life county wild bird water animal closeup fau...                  Food   
1  hav mitt hga kusten blsippor nordingr klippor ...  Travel&Active&Sports   
2  is sweden sverige hav soluppgng mitt vr hga ku...  Travel&Active&Sports   
3  is sweden sverige hav soluppgng mitt vr hga ku...  Travel&Active&Sports   
4  is sweden sverige h

## Data Exploration

In [49]:
EDA_df = train_x
EDA_test_df = test_x

EDA_df['Postdate'] = pd.to_datetime(EDA_df['Postdate'], unit='s')
EDA_test_df['Postdate'] = pd.to_datetime(EDA_test_df['Postdate'], unit='s')

# find the time interval of EDA & test

print(EDA_df['Postdate'].min())
print(EDA_df['Postdate'].max())
print(EDA_test_df['Postdate'].min())
print(EDA_test_df['Postdate'].max())

print(EDA_df['Postdate'])
print(EDA_test_df['Postdate'])

# print(EDA_df['Category'].value_counts())
# print(EDA_df['Subcategory'].value_counts())

# # plot y distribution with histogram
# plt.figure()
# train_y['label'].hist(bins=300)
# plt.title('Label Distribution')
# plt.show()

# # plot y distribution depends on time
# EDA_df['Postdate'] = pd.to_datetime(EDA_df['Postdate'], unit='s')
# date_distribution = train_y.merge(EDA_df[['Postdate', 'Pid']], on='Pid', how='left')
# date_distribution = date_distribution.set_index('Postdate')
# # group by hour & plot hist
# hour_distribution = date_distribution.resample('H').mean()
# plt.figure()
# hour_distribution['label'].hist(bins=1000)
# plt.title('Label Distribution over Time')
# plt.show()

# # group by weekday
# weekday_distribution = date_distribution.resample('D').mean()
# plt.figure()
# weekday_distribution['label'].hist(bins=300)
# plt.title('Label Distribution over Time')
# plt.show()


# # find unique values of Category, Concept, Subcategory
# print(len(EDA_df['Category'].unique()))
# print(len(EDA_df['Concept'].unique()))
# print(len(EDA_df['Subcategory'].unique()))
# print(len(EDA_df['Uid'].unique()))

# # check if there is any missing value
# print(EDA_df.isnull().sum())

# # check if Uid appears in test but not in train
# train_uid = set(EDA_df['Uid'].unique())
# test_uid = set(test_x['Uid'].unique())
# print(len(test_uid - train_uid))

# # check if Concept appears in test but not in train
# train_concept = set(EDA_df['Concept'].unique())
# test_concept = set(test_x['Concept'].unique())
# print(len(test_concept - train_concept))

# # check if Category appears in test but not in train
# train_category = set(EDA_df['Category'].unique())
# test_category = set(test_x['Category'].unique())
# print(len(test_category - train_category))

# # check if Subcategory appears in test but not in train
# train_subcategory = set(EDA_df['Subcategory'].unique())
# test_subcategory = set(test_x['Subcategory'].unique())
# print(len(test_subcategory - train_subcategory))


2015-02-28 19:56:09
2016-02-29 15:34:45
2016-02-29 16:00:00
2016-06-10 15:43:52
0       2015-03-13 03:21:30
1       2015-03-17 02:05:20
2       2015-03-24 21:29:17
3       2015-03-24 10:18:36
4       2015-03-24 21:55:46
                ...        
14995   2015-10-16 06:39:42
14996   2016-01-16 06:13:16
14997   2015-10-03 06:15:54
14998   2015-12-22 10:41:30
14999   2015-05-05 01:15:44
Name: Postdate, Length: 15000, dtype: datetime64[ns]
0      2016-03-17 15:59:59
1      2016-03-17 15:59:01
2      2016-03-15 15:59:59
3      2016-03-02 03:50:37
4      2016-03-02 15:33:32
               ...        
4995   2016-06-04 07:07:41
4996   2016-04-06 08:15:50
4997   2016-03-26 16:14:17
4998   2016-05-10 10:11:35
4999   2016-03-05 03:38:40
Name: Postdate, Length: 5000, dtype: datetime64[ns]


# Feature Engineering
## Time Feature

In [50]:
def time_feature_engineering(df, start_t):
    df['Postdate'] = pd.to_datetime(df['Postdate'], unit='s')

    # groupby weekday and encode in 7 dimension
    df['weekday'] = df['Postdate'].dt.weekday
    weekday_df = pd.get_dummies(df['weekday']).astype(int)
    df = pd.concat([df, weekday_df], axis=1)

    combined_categories = [i for i in range(1, 13)]
    encoder = OneHotEncoder(categories=[combined_categories], sparse=False)
    encoder.fit([[cat] for cat in combined_categories]) 
    df['month'] = df['Postdate'].dt.month
    train_encoded = pd.DataFrame(encoder.transform(df[['month']]), columns=encoder.get_feature_names_out(['month'])).astype(int)
    df = pd.concat([df, train_encoded], axis=1)

    # transform hour to morning (06:00 to 11:59), afternoon (12:00 to 17:59), evening (18:00 to 23:59), or night (00:00 to 05:59).
    df['hour'] = df['Postdate'].dt.hour
    df['morning'] = df['hour'].apply(lambda x: 1 if 6 <= x < 12 else 0)
    df['afternoon'] = df['hour'].apply(lambda x: 1 if 12 <= x < 18 else 0)
    df['evening'] = df['hour'].apply(lambda x: 1 if 18 <= x < 24 else 0)
    df['night'] = df['hour'].apply(lambda x: 1 if 0 <= x < 6 else 0)

    # get post duration
    if start_t is None:
        start_t = df['Postdate'].min()
    df['Postduration'] = df['Postdate'] - start_t
    df['Postduration'] = df['Postduration'].dt.total_seconds()
    df = df.drop(columns=['weekday', 'month', 'hour', 'Postdate'])
    
    return df, start_t

## Texual Embedding

In [52]:
def get_textual_embeddings(data):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device = 'cuda')
    embeddings = model.encode(data['all_text'].tolist())

    columns = ['text_embedding_' + str(i) for i in range(384)]
    embeddings = pd.DataFrame(embeddings, columns=columns)
    data[columns] = embeddings

    return data

## Categorial Embedding

In [53]:
# get categorical embedding by PCA

def PCA_embedding(data, columns, n_components = 10, pca_model=None, encoder=None):
    # One-hot encode the categories that appear in both training and test datasets
    if encoder:
        encoded_data = encoder.transform(data[[columns]])
    else:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        encoded_data = encoder.fit_transform(data[[columns]])
    
    if pca_model == None:
        pca_model = PCA(n_components=n_components)
        pca_result = pca_model.fit_transform(encoded_data)
    else:
        # Apply PCA model trained on training data to test data
        pca_result = pca_model.transform(encoded_data)
    
    columns = ['{}_Embedding_'.format(columns) + str(i) for i in range(n_components)]
    pca_result = pd.DataFrame(pca_result, columns=columns)
    data[columns] = pca_result

    return data, pca_model, encoder

# Normalization

In [54]:
def numerical_feature_normalization(df, columns, mean=None, std=None):
    if mean is None:
        mean = df[columns].mean()
        std = df[columns].std()
        df[columns] = (df[columns] - mean) / std
        return df, mean, std
    df[columns] = (df[columns] - mean) / std
    return df

# Preprocessing

## without LSTM

In [56]:
def feature_engineering(df, label_encoder=None, embeddings=None, mean=None, std=None, start_time=None):
    
    df['tags_count'] = df['Alltags'].apply(lambda x: len(x.split(' ')))
    df['title_len'] = df['Title'].apply(lambda x: len(x))
    df['all_text'] = df['Title'] + ' ' + df['Alltags']

#     get textual embeddings
    df = get_textual_embeddings(df)
    n_comp = {'Concept':20, 'Subcategory':10, 'Uid':100}
    embeddings = []
    label_encoder = {}
    # encoding_col = ['Concept', 'Subcategory', 'Uid']
    encoding_col = ['Concept']
#     get Category embeddings
    if len(embeddings) > 0:
        for i, col in enumerate(encoding_col):
            df, _, _ = PCA_embedding(df, col, n_components=n_comp[col], pca_model=embeddings[i], encoder=label_encoder[col])
    else:
        for i, col in enumerate(encoding_col):
            df, concept_embeddings, concept_encoder = PCA_embedding(df, col, n_components=n_comp[col])
            embeddings.append(concept_embeddings)
            label_encoder[col] = concept_encoder
    # one hot encoding
    cat_df = pd.get_dummies(df['Category'], dummy_na=True).astype(int)
    df = pd.concat([df, cat_df], axis=1)

    # time feature engineering
    df, start_time = time_feature_engineering(df, start_time)
    
    df.drop(['Pid', 'Title', 'Alltags', 'Uid','Category', 'Concept', 'Subcategory', 'all_text'], axis=1, inplace=True)

    text_embedding_columns = ['text_embedding_{}'.format(i) for i in range(384)]
    concept_embedding_columns = ['Concept_Embedding_{}'.format(i) for i in range(20)]

    subcategory_embedding_columns = ['Subcategory_Embedding_{}'.format(i) for i in range(10)]
    Uid_embedding_columns = ['Uid_Embedding_{}'.format(i) for i in range(100)]
    numeric_feature = ['tags_count', 'title_len', 'Postduration']
    for i in [text_embedding_columns, concept_embedding_columns]:
    # for i in [concept_embedding_columns, subcategory_embedding_columns, Uid_embedding_columns]:
        numeric_feature.extend(i)
    # categorial_feature = ['Category_Animal', 'Category_Electronics', 'Category_Entertainment', 'Category_Family', 'Category_Fashion', 'Category_Food', 'Category_Holiday&Celebrations', 'Category_Social&People', 'Category_Travel&Active&Sports', 'Category_Urban', 'Category_Whether&Season', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'morning', 'afternoon', 'evening', 'night']

    if mean is None:
        df, train_mean, train_std = numerical_feature_normalization(df, numeric_feature)
        return df, label_encoder, embeddings, train_mean, train_std, start_time

    df = numerical_feature_normalization(df, numeric_feature, mean, std)
    return df

# Models
## MultiModal (CNN + Feature)

In [58]:
class MultimodalModel(nn.Module):
    def __init__(self, 
                image_feature_size=(3, 128, 128), 
                lr=0.01, 
                lossfn=nn.MSELoss(), 
                hidden_layers=(256, 128, 64)):
        super(MultimodalModel, self).__init__()
        
        # Image branch
        # Image branch
        self.image_branch = nn.Sequential(
                nn.LazyConv2d(out_channels=6, kernel_size=5),
                nn.LazyBatchNorm2d(),
                nn.Sigmoid(),
                nn.AvgPool2d(kernel_size=2,stride=2),

                # Layer 2
                nn.LazyConv2d(out_channels=16, kernel_size=5),
                nn.LazyBatchNorm2d(),
                nn.Sigmoid(),
                nn.AvgPool2d(kernel_size=2,stride=2),


                # Dense Layers
                nn.Flatten(),
                nn.Linear((image_feature_size[1]-12)**2, 1024),
                nn.Sigmoid(),
                nn.LazyLinear(256),
                nn.Sigmoid(),
                nn.LazyLinear(128)
        )
    
        
        # Dynamically calculate the input size for the linear layer after the convolutional layers
        with torch.no_grad():
            dummy_image = torch.zeros(1, *image_feature_size)
            dummy_output = self.image_branch(dummy_image)
            dummy_flatten_size = dummy_output.view(dummy_output.size(0), -1).size(1)
        
        # Set the input size for the linear layer
        input_size = dummy_flatten_size + hidden_layers[-1]  # Output size of the CNN + Output size of the last hidden layer
        self.fusion_layer = nn.Sequential(
            nn.Linear(input_size, hidden_layers[-1]),
            nn.ReLU(),
            nn.Linear(hidden_layers[-1], 128),  # Regression output
            nn.ReLU(),
            nn.Linear(128, 1)
        )

        self.lr = lr
        self.lossfn = lossfn

        assert len(hidden_layers) > 0 and hidden_layers
        self.num_branch = nn.Sequential(nn.Flatten())
        self.num_branch.add_module('hidden', self.hiddenLayers(hidden_layers))

    def loss(self, y_hat, y):
        target = y.float().type(torch.FloatTensor)
        target = target.to(y_hat.device)
        return self.lossfn(y_hat, target)    

    # Create the hidden layers, except the output layer
    def hiddenLayers(self,hidden_layers):
        layers = ((nn.LazyLinear(outputSize),nn.LazyBatchNorm1d(),nn.ReLU()) for outputSize in hidden_layers)
        layers = tuple(chain.from_iterable(layers))
        return nn.Sequential(*layers)
        
    def forward(self, images, numerical_features):
        # Image branch
        image_out = self.image_branch(images)
        image_out = image_out.view(image_out.size(0), -1)  # Flatten
        
        # Numerical features branch
        num_out = self.num_branch(numerical_features)
        
        # Concatenate image and numerical features
        concatenated = torch.cat((image_out, num_out), dim=1)
        
        # Fusion layer
        output = self.fusion_layer(concatenated)
        return output
    
    def configure_optimizers(self, weight_decay=0.01):
        return torch.optim.Adam(self.parameters(),
                                self.lr,
                                weight_decay=weight_decay,
                                betas=(0.9, 0.999))

    def setLearningRate(self, lr):
        self.lr = lr

    def getLearningRate(self):
        return self.lr

    def getName(self):
        return type(self).__name__


# Trainer

In [59]:
class Trainer:

  def __init__(self,
               writer=None,
               n_epochs=3,
               verbose=False,
               ignore_accuracy=False,
               input_as_target=False,
               compute_psnr=False):
    self.max_epochs = n_epochs
    self.accuracies = []
    self.verbose = verbose

    # SummaryWriter for TensorBoard
    self.writer = writer

    # this for regression problems
    self.ignore_accuracy = ignore_accuracy

    # this for autoencoder output comparisons
    self.compute_psnr = compute_psnr

    # setting this true is for autoencoders that use the inputs
    # to train the output of a encoder-decoder architecture
    self.input_as_target = input_as_target
    return

  # Add a new function to get the target type
  def get_target_type(self):
    return torch.long if self.target_type == 'long' else torch.float

  # The fitting step
  def fit(self, model, train_loader, val_loader, weight_decay=0.01):

    self.training_data    = train_loader
    self.validation_data  = val_loader
    self.epoch_stats = {'Training':[],
                        'Validation':[],
                        'Accuracy':[],
                        'PSNR':[]}
    self.exclude = []

    # configure the optimizer
    self.optimizer = model.configure_optimizers(weight_decay=weight_decay)
    self.model     = model

    if self.verbose:
      print(f'Starting {model.getName()} model training for {self.max_epochs} epochs:')

    for epoch in range(self.max_epochs):

      # Compute training loss
      tr = self.fit_epoch()
      # Compute validation loss and accuracy
      acc, val, psnr = self.validate()

      self.epoch_stats['Training'].append(tr)
      self.epoch_stats['Accuracy'].append(acc)
      self.epoch_stats['Validation'].append(val)
      self.epoch_stats['PSNR'].append(psnr)

      if self.writer:
        # send loss to TensorBoard
        self.writer.add_scalars("Loss",{'validation': val,'training': tr},epoch)
        # send accuracy to TensorBoard
        if not self.ignore_accuracy:
           self.writer.add_scalar("Accuracy", acc, epoch)
        # send psnr to TensorBoard
        if self.compute_psnr:
          self.writer.add_scalar("PSNR", psnr, epoch)

      if self.verbose:
        print(f'\tLoss at epoch {epoch}: T {tr:,.3f} V {val:,.3f}')
      if not self.ignore_accuracy:
        print(f'\tAccuracy after epoch {epoch}: {acc*100:.3f} %')
      if self.compute_psnr:
        print(f'\tPSNR after epoch {epoch}: {psnr:.3f} %\n')
        plot_ae_outputs( self.model.encoder, self.model.decoder, n=10)

    

    return {  'epoch': self.max_epochs,
              'model_state_dict': self.model.state_dict(),
              'optimizer_state_dict': self.optimizer.state_dict(),
              'statistics': self.epoch_stats}

  def fit_epoch(self):

    # set model to training mode
    self.model.train()

    # current_loss = 0.0
    all_losses = []

    # iterate over the DataLoader for training data
    for i, data in enumerate(self.training_data,0):

      # Get input and send mini-batch to GPU
      input_feature = data[0].to(device)
      input_image = data[1].to(device)
      target = input if self.input_as_target else data[2].to(device)
      #print(target.shape)

      # Clear gradient buffers because we don't want any gradient from previous
      # epoch to carry forward, dont want to cummulate gradients
      self.optimizer.zero_grad()

      # get output from the model, given the inputs
      # never call forward directly
      output = self.model(input_image, input_feature)
      #print(output.shape)

      # get loss for the predicted output
      loss = self.model.loss(torch.reshape(output, (-1,)), target)

      # save the loss for plotting
      all_losses.append(loss.item())

      # get gradients w.r.t the parameters of the model
      loss.backward()

      # update the parameters (perform optimization)
      self.optimizer.step()

      # Let's print some statisics
      # current_loss += loss.item()
      # if i % 500 == 499:
      #     # print('Loss after mini-batch %5d: %.3f' %
      #     #        (i + 1, current_loss / 500))
      #     current_loss = 0.0

    return sum(all_losses)/len(all_losses)  # return average loss

  def validate(self):

      self.model.eval()  # Set the model to evaluation mode

      correct = 0
      total = 0
      avg_loss = 0.
      avg_psnr = 0.
      accuracy = 0.

      with torch.no_grad():  # Do not compute gradients

          for data in self.validation_data:

              input_feature = data[0].to(device)
              input_image = data[1].to(device)
              target = input if self.input_as_target else data[2].to(device)

              output = self.model(input_image, input_feature)

              # get loss for the predicted output
              avg_loss += self.model.loss(torch.reshape(output, (-1,)),target)

              # calculate psnr for autoencoding problems
              if self.compute_psnr:
                avg_psnr += psnr(output, target)

              # calculate accuracy for classification problems
              if not self.ignore_accuracy:
                _, predicted = torch.max(output, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

      if not self.ignore_accuracy:
        accuracy = correct / total

      if isinstance(avg_loss, torch.Tensor):
          # Convert tensor to a float
          avg_loss = float(avg_loss.item())

      avg_loss /= len(self.validation_data)

      if isinstance(avg_psnr, torch.Tensor):
          # Convert tensor to a float
          avg_psnr = float(avg_psnr.item())

      avg_psnr /= len(self.validation_data)

      return accuracy, avg_loss, avg_psnr




def train(model,
          n_epochs,
          train_dataset,
          val_dataset,
          batch_size,
          weight_decay=1e-05,
          writer=None,
          use_saved_weights=False,
          new_lr=None,
          verbose=False,
          ignore_accuracy=False,
          input_as_target=False,
          compute_psnr=False):

  # Create DataLoaders for each set
  train_loader = DataLoader(train_dataset,
                            shuffle=True,
                            batch_size=batch_size,
                            num_workers=2)

  val_loader = DataLoader(val_dataset,
                          shuffle=True,
                          batch_size=batch_size,
                          num_workers=2)

  # Check if the weights file exists, if true, fine tune learning rate
  if 'MODEL_PATH' in globals() and os.path.isfile(MODEL_PATH) and use_saved_weights:

      # Load the model from the file
      checkpoint = torch.load(MODEL_PATH)
      # print(f'Saved model: {MODEL_PATH}')

      model.load_state_dict(checkpoint['model_state_dict'])
      if verbose:
        print(f'Loaded model weights')

      optimizer = model.configure_optimizers()
      optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
      # print(f'Loaded optimizer parameters')

      # Fine tuning the learning rate
      learning_rate = new_lr if new_lr else model.getLearningRate()

      for param_group in optimizer.param_groups:
          param_group['lr'] = learning_rate
      # print(f'Learning rate: {learning_rate} Epochs: {n_epochs}')
  else:
      if verbose:
        print("No weights file found. Using randomly initialized weights.\n")

  # Training the network
  # Creating the trainer class
  trainer = Trainer(writer,
                    n_epochs=n_epochs,
                    verbose=verbose,
                    ignore_accuracy=ignore_accuracy,
                    input_as_target=input_as_target,
                    compute_psnr=compute_psnr)

  # Training the model
  checkpoint = trainer.fit(model, train_loader, val_loader, weight_decay)

  # Plot the results for average training and validation loss

  return model, trainer, checkpoint

# Tuner

In [60]:
def logScale(a,b):
  assert a < b
  lb = np.log10(a)
  ub = np.log10(b)
  return np.arange(lb,ub,0.5)

def RandomLayers(layers,nodes):
  hidden_layers = []
  l = random.choice(layers)
  for layer in range(l):
    n = random.choice(nodes)
    hidden_layers.append(n)
  return tuple(hidden_layers)

def RandomChoice(hyperparams):
  return { key:random.choice(value) for _,(key,value) in enumerate(hyperparams.items()) }

def k_fold(k,
           model,
           num_epochs,
           trainset,
           batch_size,
           weight_decay,
           learning_rate,
           verbose=False,
           ignore_accuracy=True,
           input_as_target=False,
           compute_psnr=False):

    accum_train_loss, accum_valid_loss = 0, 0

    assert k <= len(trainset)
    for i in range(k):

        # Get the ith fold
        train_dataset, val_dataset = get_k_fold_data(k, i, trainset)

        # Train using it num_epochs times
        model, _, checkpoint = train(model,
                                     num_epochs,
                                     train_dataset,
                                     val_dataset,
                                     batch_size,
                                     weight_decay,
                                     writer=None,
                                     new_lr=None,
                                     use_saved_weights=False,
                                     verbose=verbose,
                                     ignore_accuracy=ignore_accuracy,
                                     input_as_target=input_as_target,
                                     compute_psnr=compute_psnr)

        # Retrieve the statistics for the ith fold
        # which contains num_epochs
        stats = checkpoint['statistics']
        avg_train_loss = np.mean(stats['Training'])
        avg_valid_loss = np.mean(stats['Validation'])

        accum_train_loss += avg_train_loss
        accum_valid_loss += avg_valid_loss

        if verbose:
          print('fold %d/%d, epochs: %d, train loss: %f, valid loss: %f\n' % (
              i, k, num_epochs, avg_train_loss, avg_valid_loss))

    return accum_train_loss/k, accum_valid_loss/k

def get_k_fold_data(k, i, combined_dataset):

    fold_size = len(combined_dataset) // k
    indices = list(range(len(combined_dataset)))

    val_indices = indices[i * fold_size: (i+1) * fold_size]
    train_indices = indices[:i * fold_size] + indices[(i+1) * fold_size:]

    train_subset = Subset(combined_dataset, train_indices)
    val_subset = Subset(combined_dataset, val_indices)

    return train_subset, val_subset

def is_list_of_type(lst,type=float):
  return all(isinstance(item, type) for item in lst)

def plot_curves( stats_dict, xlabel, ylabel, scale='log', title=None, exclude=[] ):

  # Plot losses
  plt.figure(figsize=(10,8))
  for (key, data) in stats_dict.items():
    if key not in exclude:
      if torch.is_tensor(data):
          data_np = data.numpy()
      elif is_list_of_type(data,float) or is_list_of_type(data,int):
          data_np = np.array(data)
      else:
        raise TypeError("Unable plot input data type.")

      # Using a log scale on y-axis by default
      if scale == 'log':
        plt.semilogy(data_np, label=key)
      else:
        plt.plot(data_np, label=key)

  plt.xlabel(xlabel)
  plt.ylabel(ylabel)
  plt.legend()
  if title:
    plt.title(title)
  plt.show()

# Load Images

In [61]:
class MultimodalDataset(Dataset):
    def __init__(self, x, image_filepath, y=None, transform=None):
        self.image_paths = image_filepath  # Adjust this to match the column name containing image paths
        self.dataframe = x
        self.transform = transform
        self.y = y

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Extract information for the current row
        image_path = self.image_paths[int(idx)]
        
        # Load the image
        image = Image.open(image_path)  # Ensure RGB mode
        # Apply transformations if specified
        if self.transform:
            image = self.transform(image)
        if(image.shape[0]):
            image = image.expand(3, -1, -1)
        # print(image)
        # Extract other relevant data from the row if needed
        # For example: label = row['label']
        
        # Return image and other data
        if self.y is None:
            return self.dataframe[int(idx)], image
        else:
            return self.dataframe[int(idx)], image, self.y[int(idx)]

### def load_data

In [62]:
# Load torchvision datasets like CIFAR10 and MNIST
def load_data(data,
              img_filepath_index,
              y=None,
              augs=None,
              resize=(128,128),
              channel_mean=None,
              channel_stdd=None):

    # Normalization step in image processing for neural networks is
    # usually performed after an image has been converted into a tensor
    # using transforms.ToTensor(). This converts the image from a PIL Image or
    # numpy array with dimensions (Height, Width, Channels) and pixel
    # range [0, 255] to a float tensor with dimensions (Channels, Height, Width)
    # and pixel values scaled down to the range [0.0, 1.0].
    if channel_mean and channel_stdd:
      base_transform = transforms.Compose([
          torchvision.transforms.Resize(resize),
          # Convert to tensor
          transforms.ToTensor(),  # data scaled to 0 to 1
          # Normalize (example values for normalization, should be adjusted based on dataset)
          transforms.Normalize(channel_mean, channel_stdd) # rescaled to -1 to 1
        ])
    else:
      base_transform = transforms.Compose([
         torchvision.transforms.Resize(resize),
          transforms.ToTensor(),
      ])

    if augs:
      transform = transforms.Compose([
          # Augmentations
          *augs.transforms,
          # Preprocessing
          *base_transform.transforms,
        ])
    else:
      transform = base_transform
    image_path = data[:, 0]
        
    data = torch.tensor(data[:, 1:].astype(np.float32), dtype=torch.float32)

    if y is not None:
      y = torch.tensor(y, dtype=torch.float32)
      trainset = MultimodalDataset(data, image_path, y, transform=transform)
      return trainset
    trainset = trainset = MultimodalDataset(data, image_path, transform=transform)
    return trainset

# For visualizing images
def unnormalize(img):
    img = img / 2 + 0.5     # unnormalize
    return img

### Visualization

In [63]:
## Let's see some images
def imshow(img):
    npimg = img.numpy()  # convert PyTorch tensor to numpy
    if len(npimg.shape) == 3:  # RGB image
        plt.imshow(np.transpose(npimg, (1, 2, 0)))  # transpose (C, H, W) to (H, W, C)
    else:  # Grayscale image
        plt.imshow(npimg[0], cmap='gray')  # display the first channel
    plt.show()

### Check pictures

In [64]:
# Define transformations to preprocess the images

# Randomly select a few images from the training set

# trainset = load_data(train_x.values,
#                     train_y,
#                     resize=(256, 256),
#                     channel_mean=(0.5,0.5,0.5),
#                     channel_stdd=(0.5,0.5,0.5))

# indices = torch.randperm(len(trainset))[:4]

# # Prepare images and labels
# images = torch.stack([trainset[i][1] for i in indices])

# # Show images and labels
# # create grid of images
# img_grid = torchvision.utils.make_grid(images)
# # print(img_grid)
# img_grid.unnorm = unnormalize(img_grid)
# imshow(img_grid.unnorm)

# Main

## Preprocess

In [65]:
random_seed = 42
test_Pid = test_x['Pid']
train_x, label_encoder, embeddings, train_mean, train_std, start_time = feature_engineering(train_x)
test_x = feature_engineering(test_x, label_encoder, embeddings, train_mean, train_std, start_time)
print(test_x)
img_filepath_index = train_x.columns.get_loc('img_filepath')
# Split the data
y_train = train_y['label']
X_train, X_valid, y_train, y_valid = train_test_split(train_x.values,
                                                      y_train.values.reshape(-1),
                                                      test_size=0.2,
                                                      shuffle=True)

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns

                                           img_filepath  tags_count  \
95    /kaggle/input/social-media-popularity-with-ima...   -0.658935   
2210  /kaggle/input/social-media-popularity-with-ima...    2.626342   
2049  /kaggle/input/social-media-popularity-with-ima...    2.447145   
2393  /kaggle/input/social-media-popularity-with-ima...    1.790090   
2392  /kaggle/input/social-media-popularity-with-ima...    1.670625   
...                                                 ...         ...   
2827  /kaggle/input/social-media-popularity-with-ima...   -0.539470   
2815  /kaggle/input/social-media-popularity-with-ima...   -0.539470   
3654  /kaggle/input/social-media-popularity-with-ima...   -0.539470   
2926  /kaggle/input/social-media-popularity-with-ima...   -1.077061   
3823  /kaggle/input/social-media-popularity-with-ima...   -0.599203   

      title_len  text_embedding_0  text_embedding_1  text_embedding_2  \
95    -0.222972          0.328392          1.179316          0.474395   
2

## Tuning

In [66]:
resize=(256,256)

test_dataset = load_data(test_x.values,
              img_filepath_index,
              augs=None,
              resize=resize,
              channel_mean=None,
              channel_stdd=None)

train_dataset = load_data(X_train,
              img_filepath_index,
              y_train,
              augs=None,
              resize=resize,
                  channel_mean=None,
                  channel_stdd=None)

val_dataset = load_data(X_valid,
                img_filepath_index,
                y_valid,
                augs=None,
                resize=resize,
                  channel_mean=None,
                  channel_stdd=None)

full_trainset = load_data(train_x.values,
              img_filepath_index,
              augs=None,
              resize=resize,
              channel_mean=None,
              channel_stdd=None)

# # Choose random layers and nodes per layer
# layers = np.arange(2,10,1)
# nodes = np.arange(20,160,20)

# # Choose random values for hyperparameters
# batch_size = 128
# lr = np.round(10**logScale(0.0001,1),5)
# decay = np.round(10**logScale(0.001,0.2),5)
# image_resize = [(3, i**2, i**2) for i in [28, 64, 128]]
# hyperparams = {
#                'lr':lr,
#                'decay':decay,
#                'resize': image_resize
#                }

# # Choose experiment parameters
# sample_size = 5
# kfolds = 2
# num_epochs = 5

# # Find the best sample
# results = []
# best_result = sys.float_info.max
# best_index = 0
# i = 0

# while i < sample_size:

#   choice = RandomChoice(hyperparams)
#   choice['hidden'] = RandomLayers(layers,nodes)
#   model = MultimodalModel(hidden_layers=choice['hidden'], 
#               image_feature_size=choice['resize'],
#               lr=choice['lr'],
#               lossfn=nn.MSELoss(reduction='sum'))
  
#   train_dataset = load_data(X_train,
#               img_filepath_index,
#               y_train,
#               augs=None,
#               resize=(choice['resize'][1], choice['resize'][2]),
#               channel_mean=None,
#               channel_stdd=None)

#   val_dataset = load_data(X_valid,
#                 img_filepath_index,
#                 y_valid,
#                 augs=None,
#                 resize=(choice['resize'][1], choice['resize'][2]),
#                 channel_mean=None,
#                 channel_stdd=None)
#   k_fold_trainset = ConcatDataset([train_dataset, val_dataset])
#   del train_dataset, val_dataset
#   model = model.to(device)
#   avg_train_loss, avg_valid_loss = k_fold(k=kfolds,
#                                           model=model,
#                                           num_epochs=num_epochs,
#                                           trainset=k_fold_trainset,
#                                           batch_size=batch_size,
#                                           weight_decay=float(choice['decay']),
#                                           learning_rate=int(choice['lr']),
#                                           verbose=True,
#                                           ignore_accuracy=True,
#                                           input_as_target=False,
#                                           compute_psnr=False)
#   variance = abs(avg_train_loss-avg_valid_loss)

#   print(f'-- Sample {i}/{sample_size} Average Training Loss: {avg_train_loss:,.2f}')
#   print(f'-- Sample {i}/{sample_size} Difference with Validation: {variance:,.2f}\n')

#   if avg_train_loss < best_result:
#     best_result = avg_train_loss
#     best_index = i

#   results.append([avg_train_loss,variance,choice])
#   i += 1

# print(f'The best set of hyperparameters from')
# print(f'Sample Size: {sample_size} Folds: {kfolds} Epochs: {num_epochs} is')
# print(f'Sample #{best_index}: \n\tAverage Loss {results[best_index][0]:,.2f} and \n\tVariance {results[best_index][1]:,.2f}.\n')
# print(f'Parameters: {results[best_index][2]}')

# tuned_hparams = results[best_index][2]

## Training

In [67]:
y_train = train_y['label']
if 'tuned_hparams' in globals():
    print("Using tuned hyperparameters:")
    print(f'{tuned_hparams}')
    model = MultimodalModel(hidden_layers=tuned_hparams['hidden'], 
              image_feature_size=tuned_hparams['resize'],
              lr=tuned_hparams['lr'],
              lossfn=nn.MSELoss())
    batch_size = int(tuned_hparams['bsize'])
    weight_decay = float(tuned_hparams['decay'])
    full_trainset = load_data(train_x.values,
              img_filepath_index,
              y_train.values.reshape(-1),
              augs=None,
              resize=(tuned_hparams['resize'][1], tuned_hparams['resize'][2]),
              channel_mean=None,
              channel_stdd=None)
else:
    print("No tuned hyperparameters found.")

    model = MultimodalModel(image_feature_size=(3, resize[0], resize[1]))
    batch_size = 500
    weight_decay = 1e-05
    full_trainset = load_data(train_x.values,
              img_filepath_index,
              y_train.values.reshape(-1),
              augs=None,
              resize=(128,128),
              channel_mean=None,
              channel_stdd=None)

model = model.to(device)

_, _, checkpoint = train(model,
                         n_epochs=10,
#                          train_dataset=full_trainset,
                         train_dataset=train_dataset,
                         val_dataset=val_dataset,
                         batch_size=batch_size,
                         weight_decay=weight_decay,
                         new_lr=None,
                         use_saved_weights=False,
                         # To show epoch loss changes, set to True
                         verbose=True,
                         # For autoencoder, set to True
                         ignore_accuracy=False,
                         # set this to true for autoencoder
                         input_as_target=False,
                         # set this to true for autoencoder
                         compute_psnr=False)

No tuned hyperparameters found.




No weights file found. Using randomly initialized weights.

Starting MultimodalModel model training for 10 epochs:
	Loss at epoch 0: T 24.109 V 8.633
	Accuracy after epoch 0: 0.000 %
	Loss at epoch 1: T 6.653 V 6.700
	Accuracy after epoch 1: 0.000 %
	Loss at epoch 2: T 6.211 V 6.710
	Accuracy after epoch 2: 0.000 %


KeyboardInterrupt: 

# Testing

In [None]:
# print(test_loader)

model.eval()

test_loader = DataLoader(test_dataset,
                          shuffle=True,
                          batch_size=batch_size,
                          num_workers=2)

test_x = pd.read_json('{}data/test_data.json'.format(path))
test_prediction = []

with torch.no_grad():
  # iterate over the DataLoader for training data
  for i, data in enumerate(test_loader,0):
    input_feature = data[0].to(device)
    input_image = data[1].to(device)
    # Now, let's see what the network thinks these examples are
    prediction = model(input_image, input_feature)
    for i in prediction:
        test_prediction.append(i.item())

print(len(test_prediction))
# convert test_ids to a dataframe
submission = pd.DataFrame({'Pid': test_x['Pid']})
submission['label'] = test_prediction
print(submission)
print(submission.shape)
submission.to_csv('/kaggle/working/predictions.csv',index=False)