# Setups
## Import Libraries

In [28]:
!pip install sentence-transformers



In [29]:
import sys
import os
import random

# Deep Learning
import torch
from torch.utils.data import DataLoader, Dataset, Subset
from torch.utils.data import random_split, ConcatDataset
from itertools import chain
from torch import nn
from torch.utils.data import DataLoader
import torch.optim as optim
import gc
# from torchsummary import summary
# from torch.nn import functional as F

# torchvision
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.utils import make_grid

# Image Processing
from PIL import Image

# Encoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

# texual embeddings
from sentence_transformers import SentenceTransformer

# Training
from sklearn.model_selection import train_test_split
# from skopt import BayesSearchCV
from sklearn.metrics import mean_absolute_error
# svd
from sklearn.decomposition import TruncatedSVD

# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# XGBoost / CatBoost
import xgboost as xgb
import catboost as ctb

# DataFrame
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import Word2Vec

## Check GPU

In [30]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU.")

os.environ["TOKENIZERS_PARALLELISM"] = "false"

GPU


# Load Data

In [31]:
path = '/kaggle/input/social-media-popularity-with-images/'
# path = ''
# read json to dataframe
train_x = pd.read_json('{}data/train_data.json'.format(path))
test_x = pd.read_json('{}data/test_data.json'.format(path))

# read labels
train_y = pd.read_csv('{}data/train_label.csv'.format(path))
train_x['img_filepath'] = train_x['img_filepath'].apply(lambda x: '{}data/'.format(path) + x)
test_x['img_filepath'] = test_x['img_filepath'].apply(lambda x: '{}data/'.format(path) + x)
print(train_x.columns)
print(train_x.head()) 

Index(['Pid', 'Uid', 'Title', 'Alltags', 'Category', 'Concept', 'Subcategory',
       'Postdate', 'img_filepath'],
      dtype='object')
      Pid        Uid                                              Title  \
0  149005  22687@N84                                     having a drink   
1  149948  17614@N19  Foto Agne Sterberg, Destination Hga Kusten, AG...   
2  151388  17614@N19  Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...   
3  151389  17614@N19  Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...   
4  151390  17614@N19  Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...   

                                             Alltags              Category  \
0  life county wild bird water animal closeup fau...                  Food   
1  hav mitt hga kusten blsippor nordingr klippor ...  Travel&Active&Sports   
2  is sweden sverige hav soluppgng mitt vr hga ku...  Travel&Active&Sports   
3  is sweden sverige hav soluppgng mitt vr hga ku...  Travel&Active&Sports   
4  is sweden sverige h

## Data Exploration

In [32]:
# EDA_df = train_x
# EDA_test_df = test_x

# EDA_df['Postdate'] = pd.to_datetime(EDA_df['Postdate'], unit='s')
# EDA_test_df['Postdate'] = pd.to_datetime(EDA_test_df['Postdate'], unit='s')

# # find the time interval of EDA & test

# print(EDA_df['Postdate'].min())
# print(EDA_df['Postdate'].max())
# print(EDA_test_df['Postdate'].min())
# print(EDA_test_df['Postdate'].max())

# print(EDA_df['Postdate'])
# print(EDA_test_df['Postdate'])

# print(EDA_df['Category'].value_counts())
# print(EDA_df['Subcategory'].value_counts())

# # plot y distribution with histogram
# plt.figure()
# train_y['label'].hist(bins=300)
# plt.title('Label Distribution')
# plt.show()

# # plot y distribution depends on time
# EDA_df['Postdate'] = pd.to_datetime(EDA_df['Postdate'], unit='s')
# date_distribution = train_y.merge(EDA_df[['Postdate', 'Pid']], on='Pid', how='left')
# date_distribution = date_distribution.set_index('Postdate')
# # group by hour & plot hist
# hour_distribution = date_distribution.resample('H').mean()
# plt.figure()
# hour_distribution['label'].hist(bins=1000)
# plt.title('Label Distribution over Time')
# plt.show()

# # group by weekday
# weekday_distribution = date_distribution.resample('D').mean()
# plt.figure()
# weekday_distribution['label'].hist(bins=300)
# plt.title('Label Distribution over Time')
# plt.show()


# # find unique values of Category, Concept, Subcategory
# print(len(EDA_df['Category'].unique()))
# print(len(EDA_df['Concept'].unique()))
# print(len(EDA_df['Subcategory'].unique()))
# print(len(EDA_df['Uid'].unique()))

# # check if there is any missing value
# print(EDA_df.isnull().sum())

# # check if Uid appears in test but not in train
# train_uid = set(EDA_df['Uid'].unique())
# test_uid = set(test_x['Uid'].unique())
# print(len(test_uid - train_uid))

# # check if Concept appears in test but not in train
# train_concept = set(EDA_df['Concept'].unique())
# test_concept = set(test_x['Concept'].unique())
# print(len(test_concept - train_concept))

# # check if Category appears in test but not in train
# train_category = set(EDA_df['Category'].unique())
# test_category = set(test_x['Category'].unique())
# print(len(test_category - train_category))

# # check if Subcategory appears in test but not in train
# train_subcategory = set(EDA_df['Subcategory'].unique())
# test_subcategory = set(test_x['Subcategory'].unique())
# print(len(test_subcategory - train_subcategory))


# Feature Engineering
## Time Feature

In [33]:
def time_feature_engineering(df, start_t):
    df['Postdate'] = pd.to_datetime(df['Postdate'], unit='s')

    # groupby weekday and encode in 7 dimension
    df['weekday'] = df['Postdate'].dt.weekday
    weekday_df = pd.get_dummies(df['weekday']).astype(int)
    df = pd.concat([df, weekday_df], axis=1)

    combined_categories = [i for i in range(1, 13)]
    encoder = OneHotEncoder(categories=[combined_categories], sparse=False)
    encoder.fit([[cat] for cat in combined_categories]) 
    df['month'] = df['Postdate'].dt.month
    train_encoded = pd.DataFrame(encoder.transform(df[['month']]), columns=encoder.get_feature_names_out(['month'])).astype(int)
    df = pd.concat([df, train_encoded], axis=1)

    # transform hour to morning (06:00 to 11:59), afternoon (12:00 to 17:59), evening (18:00 to 23:59), or night (00:00 to 05:59).
    df['hour'] = df['Postdate'].dt.hour
    df['morning'] = df['hour'].apply(lambda x: 1 if 6 <= x < 12 else 0)
    df['afternoon'] = df['hour'].apply(lambda x: 1 if 12 <= x < 18 else 0)
    df['evening'] = df['hour'].apply(lambda x: 1 if 18 <= x < 24 else 0)
    df['night'] = df['hour'].apply(lambda x: 1 if 0 <= x < 6 else 0)

    # get post duration
    if start_t is None:
        start_t = df['Postdate'].min()
    df['Postduration'] = df['Postdate'] - start_t
    df['Postduration'] = df['Postduration'].dt.total_seconds()
    df = df.drop(columns=['weekday', 'month', 'hour', 'Postdate'])
    
    return df, start_t

## Texual Embedding

In [34]:
def get_textual_embeddings(data):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device = 'cuda')
#     model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings = model.encode(data['all_text'].tolist())

    # reduce embedding dimension
#     svd = TruncatedSVD(n_components=80, n_iter=2, random_state=42)
#     embeddings = svd.fit_transform(embeddings)
    columns = ['text_embedding_' + str(i) for i in range(384)]
    embeddings = pd.DataFrame(embeddings, columns=columns)
    data[columns] = embeddings

    # add tf-idf
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    tfidf = vectorizer.fit_transform(data['all_text'].tolist())
    tfidf = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())
    data = pd.concat([data, tfidf], axis=1)

    data['Concept_words'] = data['Concept'].apply(lambda x: x.split())

    model = Word2Vec(sentences=data['Concept_words'], vector_size=100, window=5, min_count=1)
    model.train(data['Concept_words'], total_examples=len(data['Concept_words']), epochs=10)
    embeddings = model.wv.vectors
    c_columns = ['concept_embedding_' + str(i) for i in range(100)]
    embeddings = pd.DataFrame(embeddings, columns=c_columns)
    data[c_columns] = embeddings
    data.drop(['Concept_words'], axis=1, inplace=True)

    c_columns.extend(columns)
    return data, c_columns

## Categorial Embedding

In [35]:
# get categorical embedding by PCA

def PCA_embedding(data, columns, n_components = 10, pca_model=None, encoder=None):
    # One-hot encode the categories that appear in both training and test datasets
    if encoder:
        encoded_data = encoder.transform(data[[columns]])
    else:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        encoded_data = encoder.fit_transform(data[[columns]])
    
    if pca_model == None:
        pca_model = PCA(n_components=n_components)
        pca_result = pca_model.fit_transform(encoded_data)
    else:
        # Apply PCA model trained on training data to test data
        pca_result = pca_model.transform(encoded_data)
    
    columns = ['{}_Embedding_'.format(columns) + str(i) for i in range(n_components)]
    pca_result = pd.DataFrame(pca_result, columns=columns)
    data[columns] = pca_result

    return data, pca_model, encoder

# Normalization

In [36]:
def numerical_feature_normalization(df, columns, mean=None, std=None):
    if mean is None:
        mean = df[columns].mean()
        std = df[columns].std()
        df[columns] = (df[columns] - mean) / std
        return df, mean, std
    df[columns] = (df[columns] - mean) / std
    return df

# Preprocessing

In [37]:
def feature_engineering(df, label_encoder=None, embeddings=None, train_cat_df=None, start_time=None):
    
    df['tags_count'] = df['Alltags'].apply(lambda x: len(x.split(' ')))
    df['title_len'] = df['Title'].apply(lambda x: len(x))
    df['all_text'] = df['Title'] + ' ' + df['Alltags']

#     get textual embeddings
    df, text_embedding_columns = get_textual_embeddings(df)
    # time feature engineering
    df, start_time = time_feature_engineering(df, start_time)

    n_comp = {'Concept':30, 'Subcategory':30, 'Uid':20}
    embeddings = []
    label_encoder = {}
    # encoding_col = ['Concept']
    encoding_col = []
#     get Category embeddings
    if len(embeddings) > 0:
        for i, col in enumerate(encoding_col):
            df, _, _ = PCA_embedding(df, col, n_components=n_comp[col], pca_model=embeddings[i], encoder=label_encoder[col])
    else:
        for i, col in enumerate(encoding_col):
            df, concept_embeddings, concept_encoder = PCA_embedding(df, col, n_components=n_comp[col])
            embeddings.append(concept_embeddings)
            label_encoder[col] = concept_encoder

    # split dataset by category into different dataframes
    category_df = {}
    for cat in df['Category'].unique():
        category_df[cat] = {'df': df[df['Category'] == cat]}
        category_df[cat]['df'].drop('Category', axis=1, inplace=True)
        # one hot encoder ignore unknown categories
        if train_cat_df is not None:
            encoder = train_cat_df[cat]['encoder']
        else:
            encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
            encoder.fit([[subcat] for subcat in category_df[cat]['df']['Subcategory']])
        encoded_data = encoder.transform(category_df[cat]['df'][['Subcategory']])
        category_df[cat]['encoder'] = encoder
        # concat the encoded data with the original data
        columns = ['Subcategory_' + str(i) for i in range(encoded_data.shape[1])]
        encoded_data = pd.DataFrame(encoded_data, columns=columns)
        encoded_data.set_index(category_df[cat]['df'].index, inplace=True)
        category_df[cat]['df'] = pd.concat([category_df[cat]['df'], encoded_data], axis=1)
        category_df[cat]['df'].drop('Subcategory', axis=1, inplace=True)
        category_df[cat]['df'].drop([ 'Pid', 'Title', 'Alltags', 'Uid', 'Concept', 'all_text'], axis=1, inplace=True)
        

    concept_embedding_columns = ['Concept_Embedding_{}'.format(i) for i in range(n_comp['Concept'])]

    numeric_feature = ['tags_count', 'title_len', 'Uid_freq', 'Postduration']
#     numeric_feature.extend(text_embedding_columns)
    for i in [text_embedding_columns]:
#     for i in [concept_embedding_columns]:
        numeric_feature.extend(i)
    # categorial_feature = ['Category_Animal', 'Category_Electronics', 'Category_Entertainment', 'Category_Family', 'Category_Fashion', 'Category_Food', 'Category_Holiday&Celebrations', 'Category_Social&People', 'Category_Travel&Active&Sports', 'Category_Urban', 'Category_Whether&Season', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'morning', 'afternoon', 'evening', 'night']

    for cat in category_df.keys():
        if train_cat_df is None:
            category_df[cat]['df'], train_mean, train_std = numerical_feature_normalization(category_df[cat]['df'], numeric_feature)
            category_df[cat]['mean'] = train_mean
            category_df[cat]['std'] = train_std
        else:
            category_df[cat]['df'] = numerical_feature_normalization(category_df[cat]['df'], numeric_feature, train_cat_df[cat]['mean'], train_cat_df[cat]['std'])
    
    if train_cat_df is None:
        return category_df, label_encoder, embeddings, start_time
    return category_df

# Models
## MultiModal (CNN + Feature)

# Load Images

In [38]:
class MultimodalDataset(Dataset):
    def __init__(self, x, image_filepath, y=None, transform=None):
        self.image_paths = image_filepath  # Adjust this to match the column name containing image paths
        self.dataframe = x
        self.transform = transform
        self.y = y

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Extract information for the current row
        image_path = self.image_paths[int(idx)]
        
        # Load the image
        image = Image.open(image_path)  # Ensure RGB mode
        # Apply transformations if specified
        if self.transform:
            image = self.transform(image)
        if(image.shape[0]):
            image = image.expand(3, -1, -1)
        # print(image)
        # Extract other relevant data from the row if needed
        # For example: label = row['label']
        
        # Return image and other data
        if self.y is None:
            return self.dataframe[int(idx)], image
        else:
            return self.dataframe[int(idx)], image, self.y[int(idx)]

### def load_data

In [39]:
# Load torchvision datasets like CIFAR10 and MNIST
def load_data(data,
              img_filepath_index,
              y=None,
              augs=None,
              resize=(128,128),
              channel_mean=None,
              channel_stdd=None):
    # Normalization step in image processing for neural networks is
    # usually performed after an image has been converted into a tensor
    # using transforms.ToTensor(). This converts the image from a PIL Image or
    # numpy array with dimensions (Height, Width, Channels) and pixel
    # range [0, 255] to a float tensor with dimensions (Channels, Height, Width)
    # and pixel values scaled down to the range [0.0, 1.0].
    if channel_mean and channel_stdd:
      base_transform = transforms.Compose([
          torchvision.transforms.Resize(resize),
          # Convert to tensor
          transforms.ToTensor(),  # data scaled to 0 to 1
          # Normalize (example values for normalization, should be adjusted based on dataset)
          transforms.Normalize(channel_mean, channel_stdd) # rescaled to -1 to 1
        ])
    else:
      base_transform = transforms.Compose([
         torchvision.transforms.Resize(resize),
          transforms.ToTensor(),
      ])

    if augs:
      transform = transforms.Compose([
          # Augmentations
          *augs.transforms,
          # Preprocessing
          *base_transform.transforms,
        ])
    else:
      transform = base_transform
    image_path = data[:, 0]
        
    data = torch.tensor(data[:, 1:].astype(np.float32), dtype=torch.float32)

    if y is not None:
      y = torch.tensor(y, dtype=torch.float32)
      trainset = MultimodalDataset(data, image_path, y, transform=transform)
      return trainset
    trainset = trainset = MultimodalDataset(data, image_path, transform=transform)
    return trainset

# For visualizing images
def unnormalize(img):
    img = img / 2 + 0.5     # unnormalize
    return img

### Visualization

In [40]:
## Let's see some images
def imshow(img):
    npimg = img.numpy()  # convert PyTorch tensor to numpy
    if len(npimg.shape) == 3:  # RGB image
        plt.imshow(np.transpose(npimg, (1, 2, 0)))  # transpose (C, H, W) to (H, W, C)
    else:  # Grayscale image
        plt.imshow(npimg[0], cmap='gray')  # display the first channel
    plt.show()

### Check pictures

In [41]:
# Define transformations to preprocess the images

# Randomly select a few images from the training set

# trainset = load_data(train_x.values,
#                     train_y,
#                     resize=(256, 256),
#                     channel_mean=(0.5,0.5,0.5),
#                     channel_stdd=(0.5,0.5,0.5))

# indices = torch.randperm(len(trainset))[:4]

# # Prepare images and labels
# images = torch.stack([trainset[i][1] for i in indices])

# # Show images and labels
# # create grid of images
# img_grid = torchvision.utils.make_grid(images)
# # print(img_grid)
# img_grid.unnorm = unnormalize(img_grid)
# imshow(img_grid.unnorm)

# Main

## Preprocess

In [42]:
random_seed = 42
test_Pid = test_x['Pid']
# count Uid frequency concatenate with train_x and test_x
concat_df = pd.concat([train_x, test_x])
Uid_freq = concat_df['Uid'].value_counts()
train_x['Uid_freq'] = train_x['Uid'].map(Uid_freq)
test_x['Uid_freq'] = test_x['Uid'].map(Uid_freq)

train_x, label_encoder, embeddings, start_t = feature_engineering(train_x)
test_x = feature_engineering(test_x, label_encoder, embeddings, train_x, start_t)
img_filepath_index = train_x['Urban']['df'].columns.get_loc('img_filepath')

y_train = train_y['label']
y_train_cat = {}
for cat in train_x.keys():
    # Split y_train according to the category
    y_train_cat[cat] = y_train[train_x[cat]['df'].index].values.reshape(-1)
    train_x[cat]['df'] = train_x[cat]['df'].values
    test_x[cat]['df'] = test_x[cat]['df'].values
    
# train_x.to_csv('/kaggle/working/train_x.csv',index=False)
# test_x.to_csv('/kaggle/working/test_x.csv',index=False)


Batches:   0%|          | 0/469 [00:00<?, ?it/s]

  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns] = embeddings
  data[columns

In [43]:
for cat in train_x.keys():
    print(cat)
    print(train_x[cat]['df'].shape)
    print(test_x[cat]['df'].shape)


Food
(901, 1514)
(283, 1514)
Travel&Active&Sports
(3513, 1530)
(1619, 1530)
Whether&Season
(958, 1517)
(317, 1517)
Animal
(914, 1514)
(622, 1514)
Entertainment
(1319, 1513)
(448, 1513)
Family
(196, 1511)
(34, 1511)
Social&People
(1100, 1516)
(477, 1516)
Holiday&Celebrations
(2752, 1517)
(367, 1517)
Electronics
(239, 1510)
(119, 1510)
Fashion
(2262, 1523)
(348, 1523)
Urban
(846, 1511)
(366, 1511)


In [44]:
# torch.cuda.empty_cache()
# torch.cuda.memory_summary(device=None, abbreviated=False)

## Tuning

In [45]:
resize=(128,128)

for cat in train_x.keys():
    train_x[cat]['dataset'] = load_data(train_x[cat]['df'],
              img_filepath_index,
              y_train_cat[cat],
              augs=None,
              resize=resize,
              channel_mean=None,
              channel_stdd=None)
    
    test_x[cat]['dataset'] = load_data(test_x[cat]['df'],
                img_filepath_index,
                augs=None,
                resize=resize,
                channel_mean=None,
                channel_stdd=None)

def data_loader(train_dataset, image_model, sk_svd=TruncatedSVD(n_components=30, n_iter=2, random_state=42), test=False):
    total_feature = []
    target = []
    image_model.eval()
    num=train_dataset.__len__()
    factors=[]
    print(num)

    if num>=1000:
        batch_size = num//(4*(num//1000))
    elif num//4 < 30:
        batch_size = 30
    else:
        batch_size = num//4
    while((num % batch_size < 30 and num% batch_size != 0)):
        batch_size += 1
#     if test:
#         batch_size = factors[-1]
#     else:
#         if num>=1000:
#             batch_size = num//(4*(num//1000))
#         else:
#             batch_size = num//4
    

    train_dataset = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

    for i, val in enumerate(train_dataset):
        image_input = val[1].to(device)
        image_features = image_model(image_input)
        # image_features = torch.tensor(sk_svd.fit_transform(image_features.cpu().detach().numpy()))
        # image_features = torch.tensor(sk_svd.fit_transform(image_features.detach().numpy()))
        # Keep only the top 30 singular values and corresponding vectors
        # print(val[0])
        total_feature.append(torch.cat((val[0], torch.tensor(image_features.cpu().detach().numpy())), dim=1))
        del image_features, image_input
        gc.collect()
        if not test:
            target.append(val[2])
    total_feature = torch.cat(total_feature, dim=0)
    print('total.shape:', total_feature.shape)
    if not test:
        target = torch.cat(target, dim=0)
        print('target.shape:', target.shape)
        return total_feature, target  
    return total_feature

resnet_ = models.resnet50(pretrained=True).to(device)
for cat in train_x.keys():
    train_dataset = train_x[cat]['dataset']
    total_feature, target = data_loader(train_dataset, resnet_)
    train_x[cat]['total_feature'] = total_feature
    train_x[cat]['target'] = target.detach().numpy()
    del train_dataset
    print('{} finished'.format(cat))
print('train_finished')

for cat in test_x.keys():
    test_dataset = test_x[cat]['dataset']
    total_feature_test = data_loader(test_dataset, resnet_, test=True)
    test_x[cat]['total_feature'] = total_feature_test
    del test_dataset
    print('{} finished'.format(cat))
print('test_finished')



901
total.shape: torch.Size([901, 2513])
target.shape: torch.Size([901])
Food finished
3513
total.shape: torch.Size([3513, 2529])
target.shape: torch.Size([3513])
Travel&Active&Sports finished
958
total.shape: torch.Size([958, 2516])
target.shape: torch.Size([958])
Whether&Season finished
914
total.shape: torch.Size([914, 2513])
target.shape: torch.Size([914])
Animal finished
1319
total.shape: torch.Size([1319, 2512])
target.shape: torch.Size([1319])
Entertainment finished
196
total.shape: torch.Size([196, 2510])
target.shape: torch.Size([196])
Family finished
1100
total.shape: torch.Size([1100, 2515])
target.shape: torch.Size([1100])
Social&People finished
2752
total.shape: torch.Size([2752, 2516])
target.shape: torch.Size([2752])
Holiday&Celebrations finished
239
total.shape: torch.Size([239, 2509])
target.shape: torch.Size([239])
Electronics finished
2262
total.shape: torch.Size([2262, 2522])
target.shape: torch.Size([2262])
Fashion finished
846
total.shape: torch.Size([846, 2510])


In [46]:
# pd.DataFrame(total_feature).to_csv('/kaggle/working/total_feature.csv',index=False)
# pd.DataFrame(target).to_csv('/kaggle/working/target.csv',index=False)
# pd.DataFrame(total_feature_val).to_csv('/kaggle/working/total_feature_val.csv',index=False)
# pd.DataFrame(target_val).to_csv('/kaggle/working/target_val.csv',index=False)
# pd.DataFrame(total_feature_test).to_csv('/kaggle/working/total_feature_test.csv',index=False)

## Training

In [47]:
# for cat in train_x.keys():
#     model_RF = RandomForestRegressor(n_estimators=400, max_depth=5, random_state=42)
#     print('model_ready')
#     model_RF.fit(train_x[cat]['total_feature'].cpu(), train_x[cat]['target'])
#     print('RF_fit_done')
#     train_x[cat]['model'] = model_RF

In [48]:
# for cat in train_x.keys():
#     model_xgb = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1)
#     print('model_ready')
#     model_xgb.fit(train_x[cat]['total_feature'].cpu(), train_x[cat]['target'])
#     print('RF_fit_done')
#     train_x[cat]['model'] = model_xgb

In [49]:
torch.cuda.empty_cache()
gc.collect()

0

In [51]:

for cat in train_x.keys():
    model_ctb = ctb.CatBoostRegressor(iterations=600,
                          learning_rate=0.1,
                          depth=10,
                        task_type="GPU",
                          loss_function='RMSE',
                        l2_leaf_reg = 3, 
                          verbose=True)
    print('model_ready')
    model_ctb.fit(train_x[cat]['total_feature'].cpu().detach().numpy(), train_x[cat]['target'])
    print('cat_fit_done')
    train_x[cat]['model'] = model_ctb

model_ready
0:	learn: 2.2284332	total: 443ms	remaining: 4m 25s
1:	learn: 2.1719372	total: 781ms	remaining: 3m 53s
2:	learn: 2.1121955	total: 1.11s	remaining: 3m 41s
3:	learn: 2.0589225	total: 1.45s	remaining: 3m 36s
4:	learn: 2.0192958	total: 1.79s	remaining: 3m 33s
5:	learn: 1.9705164	total: 2.13s	remaining: 3m 30s
6:	learn: 1.9301658	total: 2.47s	remaining: 3m 29s
7:	learn: 1.9008756	total: 2.81s	remaining: 3m 27s
8:	learn: 1.8520995	total: 3.15s	remaining: 3m 26s
9:	learn: 1.8151515	total: 3.49s	remaining: 3m 26s
10:	learn: 1.7960367	total: 3.83s	remaining: 3m 24s
11:	learn: 1.7654231	total: 4.16s	remaining: 3m 24s
12:	learn: 1.7405393	total: 4.5s	remaining: 3m 23s
13:	learn: 1.7168810	total: 4.83s	remaining: 3m 22s
14:	learn: 1.7016071	total: 5.16s	remaining: 3m 21s
15:	learn: 1.6829603	total: 5.49s	remaining: 3m 20s
16:	learn: 1.6611273	total: 5.82s	remaining: 3m 19s
17:	learn: 1.6417123	total: 6.16s	remaining: 3m 19s
18:	learn: 1.6180612	total: 6.49s	remaining: 3m 18s
19:	learn: 

In [None]:
# concat train and val

# total_feature_full = np.concatenate((total_feature, total_feature_val), axis=0)
# target_full = np.concatenate((target, target_val), axis=0)
# model_xgb = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1)
# model_xgb.fit(total_feature, target)

# Testing

In [53]:


# print(test_loader)

# model.eval()

# test_loader = DataLoader(test_dataset,
#                           shuffle=True,
#                           batch_size=batch_size,
#                           num_workers=2)

test_x_origin = pd.read_json('{}data/test_data.json'.format(path))
test_x_origin['img_filepath'] = test_x_origin['img_filepath'].apply(lambda x: '{}data/'.format(path) + x)
# test_prediction = []

# with torch.no_grad():
#   # iterate over the DataLoader for training data
#   for i, data in enumerate(test_loader,0):
#     input_feature = data[0].to(device)
#     input_image = data[1].to(device)
#     # Now, let's see what the network thinks these examples are
#     prediction = model(input_image, input_feature)
#     for i in prediction:
#         test_prediction.append(i.item())
for cat in test_x.keys():
    test_x[cat]['prediction'] = train_x[cat]['model'].predict(test_x[cat]['total_feature'].cpu().detach().numpy())
# test_prediction = model_xgb.predict(total_feature_test)

# reconstruct the test_prediction according to order
test_prediction = []
for i in test_x_origin['img_filepath']:
    for cat in test_x.keys():
        if i in test_x[cat]['df'][:, 0]:     
            test_prediction.append(test_x[cat]['prediction'][np.where(test_x[cat]['df'][:, 0] == i)][0])
            break

# print(len(test_prediction))
# convert test_ids to a dataframe
submission = pd.DataFrame({'Pid': test_x_origin['Pid']})
submission['label'] = test_prediction
print(submission)
print(submission.shape)
submission.to_csv('/kaggle/working/predictions.csv',index=False)

          Pid     label
0       56783  8.022300
1       75638  7.635104
2       82051  6.768583
3      381878  7.627807
4      382850  5.985416
...       ...       ...
4995  1066395  6.796783
4996   494492  7.364872
4997   494495  4.815458
4998   737606  6.391252
4999   494555  6.069927

[5000 rows x 2 columns]
(5000, 2)
