# Market-1501 Train dataset

This notebook splits the Market-1501 training dataset into training and validation sets.

### Set up

#### 1. Set up accounts and role

In [None]:
import sagemaker
import boto3
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
sys.path.append('./src')


# SageMaker session used below to look up the default bucket.
sagemaker_session = sagemaker.Session()

# Look up the AWS account of the current credentials and the configured region.
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# The execution role ARN is assembled from the account id and a fixed role name
# instead of being looked up through the SageMaker SDK (alternative kept below).
#role = sagemaker.get_execution_role()
role_arn_template = "arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
role = role_arn_template.format(account_id)


#### 2. Configure train and validation datasets

In [None]:
# Default bucket of the SageMaker session; the split datasets are written under it.
bucket = sagemaker_session.default_bucket()
# Bucket that holds the raw Market-1501 images; replace the placeholder before running.
raw_bucket="<<bucketname>" # e.g. mybucket

#### S3 input data source

In [None]:
# S3 prefix the raw training images are synced from.
# NOTE(review): the prefix is spelled "merket1501" here but "market1501" in the
# destination paths of this notebook — confirm it matches the key in the raw bucket.
s3_train_raw = "s3://{}/merket1501/bounding_box_train/".format(raw_bucket)

#### S3 destination
This is where the train and validation splits of the training dataset are uploaded.

In [None]:
# Training split: prefix for the images and a companion "_lst" prefix.
s3_train="s3://{}/market1501/train3/".format(bucket)
s3_train_lst="s3://{}/market1501/train3_lst/".format(bucket)


# Validation split: prefix for the images and a companion "_lst" prefix.
s3_val="s3://{}/market1501/val3/".format(bucket)
s3_val_lst="s3://{}/market1501/val3_lst/".format(bucket)


# NOTE(review): the "_lst" prefixes and s3_output_path are not used in the cells
# visible in this notebook — presumably consumed elsewhere; confirm.
s3_output_path= "s3://{}/market1501_output/".format(bucket)

## Split train and validation

In [None]:
# Local scratch directory that everything below lives under.
temp_dir = "/tmp/imageebedding"

# Folder for the raw training images, and the path of a train.lst file inside it.
train_raw_dir = os.path.join(temp_dir, "train_raw")
train_lst = os.path.join(train_raw_dir, "train.lst")

# Folder for the validation side, and the path of a val.lst file inside it.
val_raw_dir = os.path.join(temp_dir, "val_raw")
val_lst = os.path.join(val_raw_dir, "val.lst")




In [None]:
# Start from an empty scratch directory, then recreate the train/val folders.
!rm -rf $temp_dir 
!mkdir  -p $temp_dir 
!mkdir -p  $train_raw_dir
!mkdir -p  $val_raw_dir
# Download the raw training images from S3 into the local train folder.
!aws s3 sync $s3_train_raw $train_raw_dir --quiet

In [None]:
from datasets.market1501_dataset import Market1501Dataset

# Instantiate Market1501Dataset on the local raw training directory.
# NOTE(review): `dataset` is not referenced again in the cells visible here —
# confirm whether this cell is still needed.
dataset = Market1501Dataset(train_raw_dir)

In [None]:
# os.listdir returns entries in arbitrary order. Sorting makes the file order,
# and therefore which identity receives which zero-based index (and which
# identities the seeded split later selects), reproducible across runs.
files = [
    os.path.join(train_raw_dir, f)
    for f in sorted(os.listdir(train_raw_dir))
    if f.endswith(".jpg")
]

# The Market-1501 file names follow the convention target_camerasite_...,
# e.g. 1038_c2s2_131202_03.jpg; the part before the first "_" is the raw label.
target_raw_labels = [os.path.basename(f).split("_")[0] for f in files]

# Map every raw label to a contiguous zero-based index, numbered in order of
# first appearance in the (sorted) file list.
zero_indexed_labels_dict = {}
for raw_label in target_raw_labels:
    zero_indexed_labels_dict.setdefault(raw_label, len(zero_indexed_labels_dict))

# Zero-based class index of every file, aligned with `files`.
target_zero_indexed_labels = [zero_indexed_labels_dict[raw_label] for raw_label in target_raw_labels]

In [None]:
# Number of distinct class labels found in the raw training images.
len(zero_indexed_labels_dict)

In [None]:
from sklearn.model_selection import train_test_split

# Split the class indices (not the individual images) 80/20 with a fixed seed,
# so every class ends up entirely in either the training or the validation set.
all_class_ids = list(zero_indexed_labels_dict.values())
class_train, class_val = train_test_split(all_class_ids, test_size=0.20, random_state=42)

In [None]:
# Sets give constant-time membership checks; testing against the lists would
# scan every class id for every image.
train_classes = set(class_train)
val_classes = set(class_val)

# Walk the files once and route each (file, class index) pair to the split
# its class belongs to. The *_x lists hold file paths, the *_y lists hold the
# matching class indices, in the same order as `files`.
datatrain_x, datatrain_y = [], []
dataval_x, dataval_y = [], []
for path, label in zip(files, target_zero_indexed_labels):
    if label in train_classes:
        datatrain_x.append(path)
        datatrain_y.append(label)
    if label in val_classes:
        dataval_x.append(path)
        dataval_y.append(label)

In [None]:
# One-column frames ("target" = class index, one row per image) used by the
# cells below to inspect the split. Building them from a dict names the column
# at construction time and also works when a split is empty, where assigning
# `.columns` on an empty from_records() frame raises a length-mismatch error.
df_val = pd.DataFrame({"target": dataval_y})
df_train = pd.DataFrame({"target": datatrain_y})

In [None]:
# Bar chart of the number of validation images per class index.
df_val["target"].value_counts().plot.bar(figsize=(20,5))

In [None]:
# Number of distinct classes in the validation split.
df_val["target"].nunique()

In [None]:
# Number of distinct classes in the training split.
df_train["target"].nunique()

In [None]:
# (rows, columns) of the training frame; there is one row per training image.
df_train.shape

In [None]:
# Summary statistics of the class indices in the training split.
df_train["target"].describe()

In [None]:
import boto3
import os
from multiprocessing.dummy import Pool as ThreadPool

# Module-level S3 client that upload_file uses to send files.
s3_client = boto3.client('s3')

def upload_files(files, s3_dest, num_threads=10):
    """Upload every path in ``files`` to ``s3_dest`` using a pool of threads.

    Args:
        files: Iterable of local file paths.
        s3_dest: Destination passed unchanged to ``upload_file`` for each path.
        num_threads: Number of worker threads in the pool.
    """
    # One (path, destination) argument pair per upload_file call.
    upload_jobs = [(local_path, s3_dest) for local_path in files]

    with ThreadPool(num_threads) as workers:
        workers.starmap(upload_file, upload_jobs)
   
    

def upload_file(f, s3_dest, client=None):
    """Upload one local file to an S3 destination, keeping its base name.

    Args:
        f: Path of the local file to upload.
        s3_dest: Destination URI of the form ``s3://bucket/optional/prefix/``.
        client: Object exposing ``upload_file(filename, bucket, key)``.
            Defaults to the module-level ``s3_client``.

    Raises:
        ValueError: If ``s3_dest`` has no ``//`` separator or no bucket name.
    """
    # Split on the FIRST "//" only, so a doubled slash inside the prefix
    # cannot truncate the destination.
    _, separator, location = s3_dest.partition("//")
    bucket_d, _, prefix = location.partition("/")
    if not separator or not bucket_d:
        raise ValueError(
            "Expected an S3 URI like s3://bucket/prefix/, got {!r}".format(s3_dest)
        )

    fname = os.path.basename(f)
    prefix = prefix.strip("/")
    # With no prefix the object goes to the bucket root, without a leading "/".
    key = "{}/{}".format(prefix, fname) if prefix else fname

    if client is None:
        client = s3_client
    client.upload_file(f, bucket_d, key)

In [None]:
%%time

# Upload the validation images to the validation S3 prefix.
upload_files(dataval_x, s3_val)

In [None]:
%%time

# Upload the training images to the training S3 prefix.
upload_files(datatrain_x, s3_train)

In [None]:
# Show the S3 prefixes the training and validation images were uploaded to.
s3_train, s3_val