In [1]:
import os
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import cv2

In [2]:
# Directory (relative to the notebook's working directory) that holds the raw
# CSV files loaded by the cells below.
data_path = "../data/raw"

In [3]:
# Load the training pairs, keeping only the title, image and label columns.
train_csv = os.path.join(data_path, "new_training_set.csv")
train_columns = ["title_1", "image_1", "title_2", "image_2", "Label"]
d_train = pd.read_csv(train_csv, usecols=train_columns)

In [4]:
# Load the test pairs: the first line of the file is skipped and the column
# names are supplied explicitly.
test_csv = os.path.join(data_path, "new_test_sample.csv")
test_columns = ["pair_index", "title_1", "image_1", "title_2", "image_2"]
d_test = pd.read_csv(test_csv, names=test_columns, skiprows=1)

In [5]:
# (rows, columns) of the training frame.
d_train.shape

(10181, 5)

In [6]:
# (rows, columns) of the test frame.
d_test.shape

(207, 5)

In [7]:
# Preview the first rows of the training pairs.
d_train.head()

Unnamed: 0,title_1,image_1,title_2,image_2,Label
0,Johnson’s ® Top to Toe Hair & Body Bath 500ml,fdff8b9b8229da091dd7d070aae05f81.jpg,Johnson's cottontouch top to toe hair & body b...,41e191742760932598c7bd201e5dad47.jpg,0
1,Sandal Humble,906cc44f0be72d4e767669b5b63e3a17.jpg,Sandal Humble Glass - Glanzton,7a556b836bfdd08ea592216440524a34.jpg,0
2,PROMO LIKUID LIKUIT LIQUIT BABY POD LIQUID SAL...,475c26635de18b9f93032400732ff336.jpg,Voporizer Liquit - Likuit - Likuid - Liquid Pr...,ace93bec689f3f1565800c500a8341fa.jpg,0
3,6 Pasang / Set Anting Tusuk Bentuk Lingkaran A...,e630997f6217555d6026547ad1c15f0b.jpg,Subei 6 Pasang / Set Anting Tusuk Boho Bohemia...,31abbc176b09f5bd1728cfc3ecbbfb9c.jpg,0
4,ROREC NATURAL SKIN CARE MASK ROREC SHEET MASK ...,a27d11700a7902febd039dc3a96f10f2.jpg,Rorec 86 Natural Skin Care Shert Mask All Variant,813ad9dd638c10f1765db9dde20c9e42.jpg,1


In [8]:
# Preview the first rows of the test pairs.
d_test.head()

Unnamed: 0,pair_index,title_1,image_1,title_2,image_2
0,0,12.12 SUPER PROMO !! Sandal Jepit Pantai Fashion,83d1798fee1c90c2845204d9261169bb.jpg,Clarisse CRAZY OFFER Beli 5 Dapet 12 POLKA SUM...,caba83a8a7f9def9c4d268b6c34da7f4.jpg
1,1,Damai fashion jakarta - long dress JUMBO wanit...,126868769ca4a4694d36d28960f9de8a.jpg,[VIP] kasih fashion jakarta - long dress JUMBO...,7fdfe855a7be9c87238757c43b712b81.jpg
2,2,My Baby Minyak Telon 145 ML,86aee3dc281911f5f9d50fea17b978f0.jpg,My Baby Minyak Telon Plus 145 Ml 4btl kemasan ...,0ec544d3d4169df76ae156e76c724f0c.jpg
3,3,Creative Waterborne Marker Very Fine Double - ...,40ef98354335cf4780937da703ed6d65.jpg,SOMETHINC BROW WIZ Retractable Eyebrow,9cf798e5f940429f14b4af0fd48992a4.jpg
4,4,Goblin♛ COD Tas Ransel Anak Sekolah Karakter K...,c28512df97d0fc1d61fd30de966e01c3.jpg,TAS KARAKTER ANAK LUCU,a4c09a46d8b1adda43a2433d40bba583.jpg


In [9]:
# Number of missing values in each training column.
d_train.isna().sum()

title_1    0
image_1    0
title_2    2
image_2    0
Label      0
dtype: int64

In [None]:
# Pie chart of the label distribution, annotated with percentages, saved to disk.
label_counts = d_train["Label"].value_counts()
label_counts.plot.pie(autopct='%1.1f%%')
plt.savefig('../figures/label_dist.png')

## Observation

In [None]:
# Print the first 20 title pairs labelled 1, each preceded by its running index
# and followed by blank lines.
positive_titles = d_train.loc[d_train["Label"] == 1, ["title_1", "title_2", "Label"]].head(20)
for position, pair in enumerate(positive_titles.to_dict(orient='records')):
    print(position, pair['title_1'], pair['title_2'], "\n", sep="\n")

In [None]:
# Show the image pairs of the 13th-15th rows labelled 1 (positions 12..14 after
# re-indexing) side by side: left column image_1, right column image_2.
fig, ax = plt.subplots(3, 2, figsize=(8, 12))
train_img_dir = "../data/raw/training_img/training_img"
positive_pairs = d_train.loc[d_train["Label"] == 1].reset_index(drop=True)
sample_pairs = positive_pairs.loc[12:14, ["image_1", "image_2"]]
for row_no, pair in enumerate(sample_pairs.to_dict(orient="records")):
    for col_no, key in enumerate(("image_1", "image_2")):
        ax[row_no, col_no].imshow(plt.imread(os.path.join(train_img_dir, pair[key])))

In [None]:
# Save the figure currently bound to `fig` (created in the preceding plotting cell).
fig.savefig("../figures/title_diff_label_1.png")

In [None]:
# Print the first 20 title pairs labelled 0, each followed by blank lines.
negative_titles = d_train.loc[d_train["Label"] == 0, ["title_1", "title_2", "Label"]].head(20)
for pair in negative_titles.to_dict(orient='records'):
    print(pair['title_1'], pair['title_2'], "\n", sep="\n")

In [None]:
# Show eight hand-picked image pairs labelled 0 side by side with the axes
# hidden: left column image_1, right column image_2.
fig, ax = plt.subplots(8, 2, figsize=(8, 32))
train_img_dir = "../data/raw/training_img/training_img"
negative_pairs = d_train.loc[d_train["Label"] == 0].reset_index(drop=True)
picked_rows = [0, 2, 3, 5, 6, 7, 9, 19]
for row_no, pair in enumerate(negative_pairs.loc[picked_rows, :].to_dict(orient='records')):
    for col_no, key in enumerate(('image_1', 'image_2')):
        panel = ax[row_no, col_no]
        panel.imshow(plt.imread(os.path.join(train_img_dir, pair[key])))
        panel.axis('off')

In [None]:
# Save the figure currently bound to `fig` (created in the preceding plotting cell).
fig.savefig("../figures/title_same_label_0.png")

## Text cleansing needed

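- convert titles to lower case
- add a space before and after bracket symbols such as `[` and `]`
- add a space between a number and the letters that directly follow it (e.g. `500ml` → `500 ml`)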

In [10]:
# Lower-case both title columns of the training frame.
for title_col in ("title_1", "title_2"):
    d_train[title_col] = d_train[title_col].str.lower()

In [11]:
# Drop every training row that has a missing value in any column, modifying
# d_train in place (the earlier isna() summary reported 2 missing title_2 values).
d_train.dropna(inplace=True)

In [47]:
# Patterns are compiled once when this cell runs instead of on every call.
# Raw strings avoid the invalid-escape-sequence warnings that '\d', '\]' and
# '\[' trigger in ordinary string literals.
_DIGITS_THEN_LETTERS = re.compile(r'(\d+)([a-zA-Z]+)')
# A plain negated class. The previous '[^(a-z|A-Z|0-9)]' also listed '(', '|'
# and ')' as allowed characters, which is not how alternation works in a class.
_NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9]')


def text_cleansing(title):
    """Normalise a product title to space-separated ASCII alphanumeric tokens.

    Steps:
      1. Insert a space between a run of digits and the letters that directly
         follow it ("500ml" -> "500 ml").
      2. Replace every character that is not an ASCII letter or digit
         (punctuation, brackets, symbols, non-ASCII text) with a space.
      3. Collapse runs of whitespace and strip both ends.

    Letter case is left untouched; lower-case the input beforehand if needed.

    Parameters
    ----------
    title : str or None or float
        The raw title. ``None`` and NaN (how pandas represents a missing
        string) are treated as an empty title; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned title, possibly empty.
    """
    if not isinstance(title, str):
        # NaN is the only float that is not equal to itself.
        if title is None or (isinstance(title, float) and title != title):
            return ""
        title = str(title)

    title = _DIGITS_THEN_LETTERS.sub(r'\1 \2', title)
    # Covers all ASCII punctuation, including '[' and ']', so no separate
    # bracket padding or punctuation translation table is required.
    title = _NON_ALPHANUMERIC.sub(' ', title)
    return " ".join(title.split())

In [48]:
# Add cleaned versions of both training titles as new columns.
train_title_1 = d_train['title_1']
train_title_2 = d_train['title_2']
d_train['title_1_pre'] = train_title_1.apply(text_cleansing)
d_train['title_2_pre'] = train_title_2.apply(text_cleansing)

In [49]:
# The training titles were lower-cased before cleansing, but the test titles
# were not, so the cleaned test columns kept their original casing and did not
# match the training preprocessing. Lower-case here so both splits agree.
# Missing titles become empty strings: test rows cannot be dropped, and a NaN
# would otherwise reach the regex calls in text_cleansing.
d_test['title_1_pre'] = d_test.title_1.fillna("").str.lower().apply(text_cleansing)
d_test['title_2_pre'] = d_test.title_2.fillna("").str.lower().apply(text_cleansing)

In [50]:
# Preview the training frame, now including the cleaned title columns.
d_train.head()

Unnamed: 0,title_1,image_1,title_2,image_2,Label,title_1_pre,title_2_pre
0,johnson’s ® top to toe hair & body bath 500ml,fdff8b9b8229da091dd7d070aae05f81.jpg,johnson's cottontouch top to toe hair & body b...,41e191742760932598c7bd201e5dad47.jpg,0,johnson s top to toe hair body bath 500 ml,johnson s cottontouch top to toe hair body bab...
1,sandal humble,906cc44f0be72d4e767669b5b63e3a17.jpg,sandal humble glass - glanzton,7a556b836bfdd08ea592216440524a34.jpg,0,sandal humble,sandal humble glass glanzton
2,promo likuid likuit liquit baby pod liquid sal...,475c26635de18b9f93032400732ff336.jpg,voporizer liquit - likuit - likuid - liquid pr...,ace93bec689f3f1565800c500a8341fa.jpg,0,promo likuid likuit liquit baby pod liquid sal...,voporizer liquit likuit likuid liquid premium ...
3,6 pasang / set anting tusuk bentuk lingkaran a...,e630997f6217555d6026547ad1c15f0b.jpg,subei 6 pasang / set anting tusuk boho bohemia...,31abbc176b09f5bd1728cfc3ecbbfb9c.jpg,0,6 pasang set anting tusuk bentuk lingkaran aks...,subei 6 pasang set anting tusuk boho bohemia d...
4,rorec natural skin care mask rorec sheet mask ...,a27d11700a7902febd039dc3a96f10f2.jpg,rorec 86 natural skin care shert mask all variant,813ad9dd638c10f1765db9dde20c9e42.jpg,1,rorec natural skin care mask rorec sheet mask ...,rorec 86 natural skin care shert mask all variant


In [None]:
# Preview the test frame, now including the cleaned title columns.
d_test.head()

In [None]:
# Persist the cleaned frames. The output directory is created first so this
# cell also works on a fresh checkout where it does not exist yet.
clean_dir = "../data/text_clean"
os.makedirs(clean_dir, exist_ok=True)
d_train.to_csv(os.path.join(clean_dir, "train.csv"), index=False)
d_test.to_csv(os.path.join(clean_dir, "test.csv"), index=False)

## Word overlap between titles

In [None]:
def _unique_tokens(text):
    """Return the distinct whitespace-separated tokens of `text` as a list (order not guaranteed)."""
    return list(set(text.split()))


# Distinct tokens of each cleaned title.
d_train['title_1_unique'] = d_train['title_1_pre'].apply(_unique_tokens)
d_train['title_2_unique'] = d_train['title_2_pre'].apply(_unique_tokens)

In [None]:
# Both operands are columns of Python lists, so `+` joins the two token lists
# row by row. Tokens present in both titles appear twice at this point; the
# next cell removes the duplicates.
d_train['vocab'] = d_train.title_1_unique + d_train.title_2_unique

In [None]:
# Collapse each row's combined token list to its distinct tokens.
d_train['vocab'] = d_train['vocab'].apply(lambda tokens: list(set(tokens)))

In [None]:
# Number of distinct tokens across both titles of each pair.
d_train['num_vocab'] = d_train['vocab'].apply(len)

In [None]:
def word_overlap(row):
    """Count the tokens shared by both titles of one pair.

    `row` must support item access for 'title_1_unique' and 'title_2_unique',
    each holding an iterable of tokens. Returns the size of their intersection.
    """
    first_tokens, second_tokens = (
        set(row[column]) for column in ('title_1_unique', 'title_2_unique')
    )
    return len(first_tokens & second_tokens)

In [None]:
# Shared-token count for every pair (word_overlap is applied row by row).
d_train['num_overlap'] = d_train.apply(word_overlap, axis='columns')

In [None]:
# Shared tokens as a percentage of the pair's combined distinct tokens.
d_train['percent_overlap'] = d_train['num_overlap'].div(d_train['num_vocab']).mul(100)

In [None]:
# Box plot of the overlap percentage for each label, saved to disk.
plt.figure(figsize=(3, 5))
sns.boxplot(data=d_train, x='Label', y='percent_overlap')
plt.savefig("../figures/word_overlap_boxplot.png")

In [None]:
# Mean overlap percentage among pairs labelled 1.
d_train.loc[d_train['Label'] == 1, 'percent_overlap'].mean()

In [None]:
# Mean overlap percentage among pairs labelled 0.
d_train.loc[d_train['Label'] == 0, 'percent_overlap'].mean()