# Development set split
In principle, we can use our augmented dataset of 10,000+ images for training. If we want to refine any hyperparameters, we can split our training set into:
* training: 80% of each class
* dev: 20% of each class

In [9]:
# Requirements
import pandas as pd
import numpy as np

In [10]:
# Read the augmented training labels (the preview shows an image file name
# and a class label per row).
augmented_csv_path = 'D:/mapmyindia-master/mapmyindia-master/data/train/augmented/train_augmented.csv'
augmented_df = pd.read_csv(augmented_csv_path)

# Preview the first rows; left as the last expression so the notebook renders it.
augmented_df.head()

Unnamed: 0,Img_Name,Label
0,01-05 10.15.27_2_0000.jpg,Speed Limit 60
1,01-05 10.15.27_2_1621.jpg,Speed Limit 60
2,01-05 10.15.27_2_1729.jpg,Speed Limit 60
3,01-05 10.15.27_2_1809.jpg,Speed Limit 60
4,01-05 10.15.27_2_1981.jpg,Speed Limit 60


### Define 80/20 split

In [11]:
# define split function
def split_to_train_test(df, label_column, train_frac=0.8, random_state=None):
    """Split ``df`` into train and test frames, class by class.

    For every distinct value in ``df[label_column]``, a fraction ``train_frac``
    of that class's rows is sampled at random into the train frame; the
    remaining rows of the class go to the test frame. Both frames keep the
    original index labels of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. The index is assumed to be unique: leftover rows are
        found by dropping the sampled index labels, so duplicated labels would
        remove more rows than were sampled.
    label_column : str
        Name of the column holding the class label.
    train_frac : float, default 0.8
        Fraction of each class to put in the train frame.
    random_state : optional
        Forwarded unchanged to ``DataFrame.sample``. Pass an int for a
        reproducible split; the default ``None`` draws a new split on
        every call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, each built from the per-class pieces in the
        order the labels first appear in ``df``.
    """
    train_parts, test_parts = [], []
    for lbl in df[label_column].unique():
        lbl_df = df[df[label_column] == lbl]
        lbl_train_df = lbl_df.sample(frac=train_frac, random_state=random_state)
        train_parts.append(lbl_train_df)
        # Whatever was not sampled for training becomes the test share.
        test_parts.append(lbl_df.drop(lbl_train_df.index))

    # pd.concat raises on an empty list, so handle "no labels at all" explicitly
    # and hand back empty frames that still carry the input's columns.
    if not train_parts:
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    # Concatenate once instead of growing a frame per iteration:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    return pd.concat(train_parts), pd.concat(test_parts)

### Apply split

In [12]:
# Stratified 80/20 split on the class label: `train` gets 80% of every class,
# `test` (used as the dev set) gets the remaining 20%.
train, test = split_to_train_test(
    augmented_df,
    label_column='Label',
    train_frac=0.8,
)

### Double-check split

In [13]:
# Print a summary of the training split (row count, index range, dtypes, nulls).
train.info()
# Count rows per class in the training split; left as the last expression so
# the notebook displays the resulting table.
train.groupby('Label').count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19023 entries, 22947 to 20674
Data columns (total 2 columns):
Img_Name    19023 non-null object
Label       19023 non-null object
dtypes: object(2)
memory usage: 445.9+ KB


Unnamed: 0_level_0,Img_Name
Label,Unnamed: 1_level_1
Speed Limit 20,2641
Speed Limit 30,3256
Speed Limit 40,4438
Speed Limit 50,4900
Speed Limit 60,1791
Speed Limit 80,1997


In [14]:
# Print a summary of the held-out (dev) split (row count, index range, dtypes, nulls).
test.info()
# Count rows per class in the dev split; left as the last expression so the
# notebook displays the resulting table.
test.groupby('Label').count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4755 entries, 11 to 20975
Data columns (total 2 columns):
Img_Name    4755 non-null object
Label       4755 non-null object
dtypes: object(2)
memory usage: 111.4+ KB


Unnamed: 0_level_0,Img_Name
Label,Unnamed: 1_level_1
Speed Limit 20,660
Speed Limit 30,814
Speed Limit 40,1109
Speed Limit 50,1225
Speed Limit 60,448
Speed Limit 80,499


### Save CSV files

In [15]:
# Destination files for the two splits.
dev_csv_path = "D:/mapmyindia-master/mapmyindia-master/data/train/augmented/split_20_dev.csv"
train_csv_path = "D:/mapmyindia-master/mapmyindia-master/data/train/augmented/split_80_train.csv"

# index=False: the row index is not written to the CSV files.
test.to_csv(dev_csv_path, index=False)
train.to_csv(train_csv_path, index=False)