In [None]:
#default_exp process_duplicates

# Process Duplicates

> Process duplicate images: use them to label test products where possible, and remove the ones that carry no useful signal.

In [None]:
#hide 
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
from cdiscount.find_duplicates import set_index_and_sort
import pandas as pd

Making product-level predictions complicates the decision of whether to keep or remove duplicate images. Since I plan to make product-level predictions, I will do the following:
1. For images that appear in a single category in train and in test:
    - Label those products in test, and remove those products from test set
    - Check how many products of this category in train have this image
        - If all products of this category contain this image, remove all those products from train set too
2. For images that appear in multiple categories in train:
    - And don't appear in test:
        - Remove all instances of that image from train (because it doesn't help us label anything in test)
    - And do appear in test:
        - Leave them and do nothing -> the network should learn the class distribution of those images and, combined with the product's other images, make a good prediction

## Load Duplicate CSVs

In [None]:
# Folder holding the duplicate-image CSVs used throughout this notebook.
path = Path("../data/TESTING")
path.ls()

(#9) [Path('../data/TESTING/multiple_categories.csv'),Path('../data/TESTING/train_hashes.csv'),Path('../data/TESTING/multiple_categories_and_in_train_and_test.csv'),Path('../data/TESTING/test.csv'),Path('../data/TESTING/test_hashes.csv'),Path('../data/TESTING/train.csv'),Path('../data/TESTING/train_example.csv'),Path('../data/TESTING/images'),Path('../data/TESTING/in_train_and_test.csv')]

In [None]:
# Load `multiple_categories.csv`, indexed by (image_hash, image_name).
multiple_categories_df = pd.read_csv(
    path / "multiple_categories.csv",
    index_col=["image_hash", "image_name"],
)
multiple_categories_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
03b1567f754209ebb3fcd99686e811bf,114_0.jpg,114,1000004079,False
03b1567f754209ebb3fcd99686e811bf,212_0.jpg,212,1000004079,False
03b1567f754209ebb3fcd99686e811bf,43_0.jpg,43,1000004079,False
03b1567f754209ebb3fcd99686e811bf,552_0.jpg,552,-1,True
5f4733d584087f7f8ecd89456d32ac58,114_2.jpg,114,1000004079,False


In [None]:
# Load `in_train_and_test.csv`, indexed by (image_hash, image_name).
in_train_and_test_df = pd.read_csv(
    path / "in_train_and_test.csv",
    index_col=["image_hash", "image_name"],
)
in_train_and_test_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
03b1567f754209ebb3fcd99686e811bf,114_0.jpg,114,1000004079,False
03b1567f754209ebb3fcd99686e811bf,212_0.jpg,212,1000004079,False
03b1567f754209ebb3fcd99686e811bf,43_0.jpg,43,1000004079,False
03b1567f754209ebb3fcd99686e811bf,552_0.jpg,552,-1,True
5f4733d584087f7f8ecd89456d32ac58,114_2.jpg,114,1000004079,False


In [None]:
# Load `multiple_categories_and_in_train_and_test.csv`, indexed by (image_hash, image_name).
both_df = pd.read_csv(
    path / "multiple_categories_and_in_train_and_test.csv",
    index_col=["image_hash", "image_name"],
)
both_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
03b1567f754209ebb3fcd99686e811bf,114_0.jpg,114,1000004079,False
03b1567f754209ebb3fcd99686e811bf,212_0.jpg,212,1000004079,False
03b1567f754209ebb3fcd99686e811bf,43_0.jpg,43,1000004079,False
03b1567f754209ebb3fcd99686e811bf,552_0.jpg,552,-1,True
5f4733d584087f7f8ecd89456d32ac58,114_2.jpg,114,1000004079,False


## Images that appear in a single category in train and in test

In [None]:
# Drop every (image_hash, image_name) entry that is also present in `both_df`.
in_train_and_test_single_category_df = in_train_and_test_df[
    ~in_train_and_test_df.index.isin(both_df.index)
]
in_train_and_test_single_category_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


This gives us a way to label these test products directly and then remove them from the test set.

### Checking proportion of train products in category that contain this image 

This will tell us whether these products are still useful to learn from. If all of the train products in this category contain this image, then all products of this category in the test set will probably contain it too. If that is true, we will already have labeled every product of this category in the test set. Hence, we no longer need to predict this category and can remove all instances of it from the train set.

For each `image_hash` in the previous table, we want to compute

$$\frac{\text{number of products in train with this image}}{\text{total number of products in train}}$$

where both counts are restricted to products of the same category.

In [None]:
both_df  # display the whole table rather than only the `.head()` preview

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
03b1567f754209ebb3fcd99686e811bf,114_0.jpg,114,1000004079,False
03b1567f754209ebb3fcd99686e811bf,212_0.jpg,212,1000004079,False
03b1567f754209ebb3fcd99686e811bf,43_0.jpg,43,1000004079,False
03b1567f754209ebb3fcd99686e811bf,552_0.jpg,552,-1,True
5f4733d584087f7f8ecd89456d32ac58,114_2.jpg,114,1000004079,False
5f4733d584087f7f8ecd89456d32ac58,212_2.jpg,212,1000004079,False
5f4733d584087f7f8ecd89456d32ac58,43_2.jpg,43,1000004079,False
5f4733d584087f7f8ecd89456d32ac58,552_2.jpg,552,-1,True
9683ff26cb34890105f00303b1b03deb,2_0.jpg,2,1000004079,False
9683ff26cb34890105f00303b1b03deb,433_0.jpg,433,-1,True


In [None]:
# NOTE(review): `path/".."` steps up out of `path`, so this reads the train.csv
# that sits beside the TESTING folder, not the TESTING/train.csv shown by
# `path.ls()` -- confirm that is intended.
train_csv = pd.read_csv(path / ".." / "train.csv")
train_csv.head()

Unnamed: 0,_id,category_id
0,0,1000010653
1,1,1000010653
2,2,1000004079
3,3,1000004141
4,4,1000015539


In [None]:
def num_products_with_category(category_id: int, products_df: pd.DataFrame = None):
    """Count the rows of a products table whose `category_id` column equals `category_id`.

    Args:
        category_id: Category identifier to compare against the `category_id` column.
        products_df: Table with a `category_id` column. When omitted, the
            notebook-level `train_csv` is used, which is the original behaviour.

    Returns:
        The number of matching rows (the sum of the boolean match mask).
    """
    if products_df is None:
        # Looked up at call time, so the default follows whatever `train_csv` currently holds.
        products_df = train_csv
    return (products_df["category_id"] == category_id).sum()