In [None]:
#default_exp process_duplicates_image_level

# Process Duplicates Image-Level

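> Process images duplicated across train and test for image-level predictions: label the duplicated test images from train, and write CSVs of the test images left to predict on and the train images left to train on.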

In [None]:
#hide 
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
from cdiscount.find_duplicates import set_index_and_sort
import pandas as pd
from PIL import Image

For **image-level** predictions, I will do the following:
- Labeled Test CSV: Create CSV mapping test img_name -> Prob. dist. of classes (proportional to appearances in train)
    - For test imgs duplicated in train
- To-Predict Test CSV: test img_name's for test images that don't appear in train
    - This should be disjoint with *Labeled Test CSV* and their union should be all test images
- Train CSV: map train img_name -> category_id (for train images that **don't** appear in test)
    - We won't train on images that appear in train **and** test

## Images That Appear in Train & Test

In [None]:
path = Path("data"); path.ls()

(#17) [Path('data/multiple_categories.csv'),Path('data/sample_submission.csv'),Path('data/train.bson'),Path('data/train_hashes.csv'),Path('data/train_example.bson'),Path('data/cdiscount-image-classification-challenge.zip'),Path('data/TESTING'),Path('data/multiple_categories_and_in_train_and_test.csv'),Path('data/test.bson'),Path('data/test.csv')...]

In [None]:
# Load both hash tables with an (image_hash, image_name) MultiIndex, sorted by index.
# Later cells look rows up by the "image_hash" level, so `image_name` is an index level
# here, not a column.
train_hashes_df,test_hashes_df = L("train_hashes.csv", "test_hashes.csv").map(
                                lambda f: pd.read_csv(path/f, index_col=["image_hash", "image_name"]).sort_index())

In [None]:
train_hashes_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00000233ee2cc95eb7b167886a48036f,13985988_0.jpg,13985988,1000003400,False
00000278a576a7ea895463ab7485dbf1,18018867_0.jpg,18018867,1000010053,False
000003f197ae3ff412661ce81682858c,20284550_3.jpg,20284550,1000002123,False
00000617fcf52b972bc2946482141f3d,20145338_0.jpg,20145338,1000017183,False
000006568eba171364889efc5a2c0e4f,18164099_0.jpg,18164099,1000017959,False


In [None]:
test_hashes_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0000098fc6dcc760436f60a96b3a2a07,20340372_3.jpg,20340372,-1,True
000009c753e8aad5d04cd94a8962d8a5,22403286_0.jpg,22403286,-1,True
00000e3c94a395703cc6c068e9f6bdd6,23204684_2.jpg,23204684,-1,True
0000126e6e04aef6f446b7150175f2a9,4654806_1.jpg,4654806,-1,True
0000145bdf6e46ee11e292eaaf33ebde,16599078_0.jpg,16599078,-1,True


In [None]:
#export
def get_duplicated_image_idxs(df_to_get_idxs_for, other_df):
    """Flag the rows of `df_to_get_idxs_for` whose `image_hash` also occurs in `other_df`.

    Both frames must carry an index level named "image_hash". The result is a boolean
    mask with one entry per row of `df_to_get_idxs_for`, in row order.
    """
    hashes_to_check = df_to_get_idxs_for.index.get_level_values("image_hash")
    reference_hashes = other_df.index.get_level_values("image_hash")
    return hashes_to_check.isin(reference_hashes)

In [None]:
# Boolean mask over test rows whose hash also occurs in train; the sum counts those rows.
test_imgs_in_train_idx = get_duplicated_image_idxs(test_hashes_df, train_hashes_df); test_imgs_in_train_idx.sum()

1312629

In [None]:
# Same check in the other direction: train rows whose hash also occurs in test.
train_imgs_in_test_idx = get_duplicated_image_idxs(train_hashes_df, test_hashes_df); train_imgs_in_test_idx.sum()

4040161

In [None]:
len(test_imgs_in_train_idx)

3095080

In [None]:
#export
def _get_unique_hashes(df): return df.index.get_level_values("image_hash").unique()

In [None]:
# Distinct hashes among the test rows that are duplicated in train.
# NOTE(review): the saved output of this cell is a ValueError (mask of length 3095080 applied
# to a 50-row frame) — presumably `test_hashes_df` had been truncated in the kernel when it was
# last run. Re-run from a fresh kernel to confirm and refresh the output.
unique_hashes = _get_unique_hashes(test_hashes_df[test_imgs_in_train_idx]); len(unique_hashes)

ValueError: Item wrong length 3095080 instead of 50.

In [None]:
# Selecting from the train side should give exactly the same set of distinct hashes.
set(_get_unique_hashes(train_hashes_df[train_imgs_in_test_idx])) == set(unique_hashes)

So the unique hashes can be obtained either way (by checking which test hashes are in train, or which train hashes are in test).
The two selections differ only in their number of rows (non-unique hashes), because:
- Test hashes that also appear in train are a subset of the test hashes
- Train hashes that also appear in test are a subset of the train hashes

Since train is larger, there will probably be (and there are) more duplicated images in train than in test. For example, an image of a cat may appear 10 times in the train set but just twice in the test set.

Additionally, it looks like there are just 475430 unique duplicated images. These images are duplicated both in and across each set.

## Create *Labeled Test CSV*

To create *Labeled Test CSV*:
```
For each test_img_in_train
    Look up that img_hash in train
    Map test img_name to list of duplicated imgs in train
```

In [None]:
# Keep only the first 100 duplicated test rows; the exploratory cells below work on this sample.
test_imgs_in_train_df = test_hashes_df[test_imgs_in_train_idx].iloc[:100]; test_imgs_in_train_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0000145bdf6e46ee11e292eaaf33ebde,16599078_0.jpg,16599078,-1,True
0000662f4f30d0db5810955b46a5a999,4214251_0.jpg,4214251,-1,True
000085ad497bcf27b17fed25ae8c247d,1093986_0.jpg,1093986,-1,True
0000a6a16d98c4966fa401d3f891aa0d,4627059_1.jpg,4627059,-1,True
0000b253ceda4bac003c98be2d5c04bb,3296897_3.jpg,3296897,-1,True


This method below is better because I first group `test_imgs_in_train_df` by `image_hash`. This reduces computation because now I only have to look up each test `image_hash` in `train_hashes_df` once, rather than once for each test image sharing that `image_hash`.

In [None]:
# Two rows from the end of the sample; per the output below they share a single image_hash,
# so they stand in for one "group" of test images.
df = test_imgs_in_train_df.iloc[-3:-1]; df

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,category_id,in_test
image_hash,image_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0008fa57720636a90d53314514928cef,11181570_1.jpg,11181570,-1,True
0008fa57720636a90d53314514928cef,18703793_1.jpg,18703793,-1,True


In [None]:
h = df.index[0][0]; h

'0008fa57720636a90d53314514928cef'

In [None]:
# Look the hash up in train and collect the category_id of every train row that has it.
train_categories = train_hashes_df.loc[h]["category_id"].to_list(); train_categories

[1000012965]

In [None]:
pd.DataFrame([[[0,1]] for _ in range(len(df))], index=df.index.get_level_values("image_name"))

Unnamed: 0_level_0,0
image_name,Unnamed: 1_level_1
11181570_1.jpg,"[0, 1]"
18703793_1.jpg,"[0, 1]"


In [None]:
# NOTE(review): `_get_test_img_to_train_categories_helper` is defined in a later cell, so on a
# fresh kernel (Restart & Run All) this call raises NameError unless that cell is run first.
_get_test_img_to_train_categories_helper(df, train_hashes_df)

Unnamed: 0_level_0,0
image_name,Unnamed: 1_level_1
11181570_1.jpg,[1000012965]
18703793_1.jpg,[1000012965]


In [None]:
#export
def _get_test_img_to_train_categories_helper(test_imgs_group_df, train_hashes_df) -> pd.DataFrame:
    """Takes a DataFrame of test images all with the same hash and returns image names mapped to train categories."""
    test_img_hash = test_imgs_group_df.index[0][0]  # All images in group share same image_hash
    train_categories = train_hashes_df.loc[test_img_hash]["category_id"].to_list()
    data = [[train_categories] for _ in range(len(test_imgs_group_df))]
    return pd.DataFrame(data, index=test_imgs_group_df.index.get_level_values("image_name"), columns=["category_ids"])

In [None]:
#export
def get_test_imgs_to_train_categories(test_imgs_in_train_df, train_hashes_df):
    """Map every test image name to the list of train `category_id`s that share its `image_hash`.

    Both frames must be indexed by (`image_hash`, `image_name`). The train categories are
    collected once per hash with a single groupby and then broadcast to the test rows,
    instead of doing one `.loc` lookup and building one small DataFrame per hash.

    Returns a DataFrame indexed by test `image_name` with a single column `category_ids`.
    Rows are ordered by `image_hash`, and rows sharing a hash keep their input order
    (the order a groupby over hashes produces).

    Raises `KeyError` (carrying the hash) if a test hash does not occur in `train_hashes_df`.
    """
    if len(test_imgs_in_train_df) == 0:
        return pd.DataFrame(columns=["category_ids"], index=pd.Index([], name="image_name"))

    test_index = test_imgs_in_train_df.index
    # Test image names keyed by their hash. The stable sort groups equal hashes together
    # without reordering the images inside a group.
    names_by_hash = pd.Series(
        test_index.get_level_values("image_name").to_numpy(),
        index=test_index.get_level_values("image_hash"),
    ).sort_index(kind="stable")
    test_hashes = names_by_hash.index

    train_hash_level = train_hashes_df.index.get_level_values("image_hash")
    found_in_train = test_hashes.isin(train_hash_level)
    if not found_in_train.all():
        # Fail like a direct `train_hashes_df.loc[hash]` lookup would.
        raise KeyError(test_hashes[~found_in_train][0])

    # One pass over train: keep only the rows whose hash is needed, then gather the
    # categories of each hash into a list.
    needed_rows = train_hash_level.isin(test_hashes)
    categories_by_hash = (
        train_hashes_df.loc[needed_rows, "category_id"]
        .groupby(level="image_hash")
        .agg(list)
    )

    # Broadcast the per-hash lists back onto the (hash-sorted) test rows.
    categories = categories_by_hash.reindex(test_hashes)
    categories.index = pd.Index(names_by_hash.to_numpy(), name="image_name")
    return categories.to_frame(name="category_ids")

# TODO
- Find way to make this faster by maybe indexing into `train_hashes_df` better

In [None]:
%time get_test_imgs_to_train_categories(test_imgs_in_train_df, train_hashes_df)

KeyError: '0000145bdf6e46ee11e292eaaf33ebde'

## Create *To-Predict CSV*

This is just test_imgs in test_hashes that are not in test_imgs_in_train_df.
Similar process for *Train CSV*: This is train_imgs not in train_imgs_in_test_df.

In [None]:
# Negate the duplicate mask to keep the test rows whose hash never occurs in train,
# then keep just their image names.
to_predict_df = test_hashes_df.reset_index()[~test_imgs_in_train_idx].image_name
to_predict_df.head(), len(to_predict_df)

(0    20340372_3.jpg
 1    22403286_0.jpg
 2    23204684_2.jpg
 3     4654806_1.jpg
 5    13183690_2.jpg
 Name: image_name, dtype: object,
 1782451)

## Train CSV

This contains the train images that aren't in the test set.

In [None]:
# Negate the duplicate mask to keep the train rows whose hash never occurs in test,
# reduced to the two columns needed for training.
train_non_duplicated_df = train_hashes_df.reset_index()[~train_imgs_in_test_idx][["image_name", "category_id"]] 
train_non_duplicated_df.head(), len(train_non_duplicated_df)

(       image_name  category_id
 0  13985988_0.jpg   1000003400
 1  18018867_0.jpg   1000010053
 2  20284550_3.jpg   1000002123
 3  20145338_0.jpg   1000017183
 4  18164099_0.jpg   1000017959,
 8331132)

## Script

In [None]:
#export
@call_parse
def get_image_level_csvs(path: Param("Path to dir containing train_hashes.csv and test_hashes.csv", Path)="."):
    """Process duplicated images for image-level predictions.

    Reads train_hashes.csv and test_hashes.csv from `path` and saves the following CSVs
    in a folder called "image_level_csvs" inside `path`:
    - test_labeled.csv:         Test images duplicated in train, mapped to list of labels in train.
    - test_to_predict.csv:      Test images not duplicated in train, to predict on.
    - train_non_duplicated.csv: Train images not duplicated in test, to train on.

    Returns `(test_labeled_df, test_to_predict_df, train_non_duplicated_df)`, i.e. the data
    written to the three files. `test_labeled_df` has the columns `image_name` and
    `category_ids`.
    """
    # The declared default is the string "." and Python callers may pass a str as well;
    # the `/` joins below need a Path.
    path = Path(path)

    train_hashes_df,test_hashes_df = L("train_hashes.csv", "test_hashes.csv").map(
                                lambda f: pd.read_csv(path/f, index_col=["image_hash", "image_name"]).sort_index())
    test_imgs_in_train_idxs = get_duplicated_image_idxs(test_hashes_df, train_hashes_df)
    train_imgs_in_test_idxs = get_duplicated_image_idxs(train_hashes_df, test_hashes_df)
    # The set of duplicated hashes must be the same whichever side it is selected from.
    assert (set(_get_unique_hashes(test_hashes_df[test_imgs_in_train_idxs])) == 
            set(_get_unique_hashes(train_hashes_df[train_imgs_in_test_idxs])))
    
    # Create save dir
    save_path = path/"image_level_csvs"
    save_path.mkdir(exist_ok=True)
    
    print("Creating test_labeled.csv")
    test_labeled_df = get_test_imgs_to_train_categories(test_hashes_df[test_imgs_in_train_idxs], train_hashes_df)
    # The image names come back as the index. Move them into a column so that they are
    # actually written to the CSV (index=False) and can be used in the overlap check below.
    test_labeled_df = test_labeled_df.reset_index()
    test_labeled_df.to_csv(save_path/"test_labeled.csv", index=False)
    print("Done")
    
    print("Creating test_to_predict.csv")
    test_to_predict_df = test_hashes_df.reset_index()[~test_imgs_in_train_idxs]["image_name"]
    test_to_predict_df.to_csv(save_path/"test_to_predict.csv", index=False)
    print("Done")
    
    # Check that test_labeled and test_to_predict are disjoint on image_name
    assert set(test_labeled_df["image_name"]).isdisjoint(test_to_predict_df), "Overlap b/t test_labeled and test_to_predict"
    
    print("Creating train_non_duplicated.csv")
    train_non_duplicated_df = train_hashes_df.reset_index()[~train_imgs_in_test_idxs][["image_name", "category_id"]]
    train_non_duplicated_df.to_csv(save_path/"train_non_duplicated.csv", index=False)
    print("Done")
    
    print("Script completed.")
    return test_labeled_df, test_to_predict_df, train_non_duplicated_df

In [None]:
#slow
%time test_labeled_df, test_to_predict_df, train_non_duplicated_df = get_image_level_csvs(path)

Creating test_labeled.csv


KeyboardInterrupt: 

In [None]:
# Stack the labeled and the to-predict image names into one Series; together they are
# meant to account for every test image.
all_test_imgs_df = pd.concat([test_labeled_df["image_name"], test_to_predict_df], ignore_index=True)

In [None]:
len(all_test_imgs_df)

In [None]:
# `image_name` is an index level of test_hashes_df (not a column), and the concatenated names
# are in a different order than test_hashes_df, so compare the two collections as sets.
set(test_hashes_df.index.get_level_values("image_name")) == set(all_test_imgs_df)

In [None]:
test_labeled_df

In [None]:
#hide
from nbdev.export import notebook2script; notebook2script()