### This notebook creates the Training Data Set: a sample of the NEW dataset that prioritises records whose labels are not present in the PREVIOUS dataset
#### Note:  Everything below can be run locally (mostly using pandas)

In [1]:
import pandas as pd

In [2]:
# Directory prefix (with trailing slash) for every CSV this notebook reads or writes.
PATH = 'storage/sample/'

### Import the PREVIOUS corpus data file

In [3]:
# Load the PREVIOUS corpus and display its (rows, columns) as a quick sanity check.
prev_csv_path = f'{PATH}data_PREV.csv'
df_PREV = pd.read_csv(prev_csv_path)
df_PREV.shape

(810165, 3)

### Import the NEW corpus data file
#### It is compared against the PREVIOUS data file so that records with labels appearing only in NEW can be included in the new training set later on.

In [4]:
# Load the NEW corpus and display its (rows, columns) as a quick sanity check.
new_csv_path = f'{PATH}data_NEW.csv'
df_NEW = pd.read_csv(new_csv_path)
df_NEW.shape

(865516, 3)

### Compare the Label Frequency counts for each of the datasets.

In [5]:
# Per-label (brand_id) non-null counts of every other column, ordered so the
# labels with the most 'text' records come first.
df_lbl_counts_P = (
    df_PREV
    .groupby(['brand_id'])
    .count()
    .sort_values('text', ascending=False)
)
df_lbl_counts_N = (
    df_NEW
    .groupby(['brand_id'])
    .count()
    .sort_values('text', ascending=False)
)

In [6]:
# Number of distinct labels in each dataset (one row per label in the count frames).
p = len(df_lbl_counts_P)
n = len(df_lbl_counts_N)

for caption, value in (
    ('PREVIOUS Unique Label counts: ', p),
    ('NEW Unique Label counts: ', n),
    ('Difference: ', n - p),
):
    print(caption + str(value))

PREVIOUS Unique Label counts: 26006
NEW Unique Label counts: 27328
Difference: 1322


### Get the PREVIOUS Record counts and remove records with low label frequency counts (< FREQ)

In [7]:
# Keep only labels that occur at least FREQ times in the PREVIOUS dataset.
FREQ = 2

# df_lbl_counts_P is indexed by brand_id and its 'text' column holds the number of
# non-null text records per label, so this selects the labels that are too rare.
items_to_drop = df_lbl_counts_P[df_lbl_counts_P.text < FREQ].index

# Return rows whose label is NOT (~) in items_to_drop.
df_P2 = df_PREV[~df_PREV['brand_id'].isin(items_to_drop)]

# Row counts via len(): DataFrame.count()[0] only counts the non-null values of the
# first column and relies on positional Series indexing, which pandas has deprecated.
p, pa = len(df_PREV), len(df_P2)

print('PREVIOUS Record counts: ' + str(p))
print('PREVIOUS Record counts (Adjusted): ' + str(pa))
print('Difference after Removing: ' + str(pa-p))

PREVIOUS Record counts: 810165
PREVIOUS Record counts (Adjusted): 802557
Difference after Removing: -7608


### Get the NEW Record counts and remove records with low label frequency counts (< FREQ)

In [8]:
# Same filtering for the NEW dataset: drop labels with fewer than FREQ text records.
# df_lbl_counts_N is indexed by brand_id; 'text' is the per-label non-null record count.
items_to_drop = df_lbl_counts_N[df_lbl_counts_N.text < FREQ].index

# Return rows whose label is NOT (~) in items_to_drop.
df_N2 = df_NEW[~df_NEW['brand_id'].isin(items_to_drop)]

# Row counts via len(): DataFrame.count()[0] only counts the non-null values of the
# first column and relies on positional Series indexing, which pandas has deprecated.
n, na = len(df_NEW), len(df_N2)

print('NEW Record counts: ' + str(n))
print('NEW Record counts (Adjusted): ' + str(na))
print('Difference after Removing: ' + str(na-n))

NEW Record counts: 865516
NEW Record counts (Adjusted): 857559
Difference after Removing: -7957


### Compare the (Unique) Label Frequency counts for each of the ADJUSTED datasets.

In [9]:
# Recompute the per-label counts, this time on the frequency-filtered frames,
# again ordered by descending number of 'text' records.
df_lbl_counts_P = (
    df_P2
    .groupby(['brand_id'])
    .count()
    .sort_values('text', ascending=False)
)
df_lbl_counts_N = (
    df_N2
    .groupby(['brand_id'])
    .count()
    .sort_values('text', ascending=False)
)

In [10]:
# Number of distinct labels left in each dataset after the frequency filter.
pp = len(df_lbl_counts_P)
nn = len(df_lbl_counts_N)

for caption, value in (
    ('PREVIOUS Unique Label counts: ', pp),
    ('NEW Unique Label counts: ', nn),
    ('Difference: ', nn - pp),
):
    print(caption + str(value))

PREVIOUS Unique Label counts: 18398
NEW Unique Label counts: 19371
Difference: 973


### Show the count of NEW Unique Labels that weren't there in the PREVIOUS dataset
#### We will make sure to include these new labels in our new training file later

In [11]:
# Compare labels in PREV dataset to labels in NEW dataset
p = set(df_lbl_counts_P.index)
n = set(df_lbl_counts_N.index)

# Always define labels_unique (possibly empty): later cells filter on it, and they
# would hit a NameError if it were only assigned when new labels exist.
labels_unique = sorted(n - p)

if labels_unique:
    print("There are " + str(len(labels_unique)) + " unique labels in NEW Dataset that are NOT found in PREV dataset.")
else:
    print("No unique labels in New Dataset")

There are 1056 unique labels in NEW Dataset that are NOT found in PREV dataset.


### Set aside all records with NEW labels for the new training set
#### These we want as Must-Take in our new training dataset

In [21]:
# Select every filtered NEW record whose label is absent from the PREVIOUS dataset.
has_new_label = df_N2['brand_id'].isin(labels_unique)
df_must_take_TEMP = df_N2[has_new_label]
df_must_take_TEMP.shape

(9079, 3)

### Take a sample of the Must-Take (New Labels) data and put it aside for the Holdout (Test) Dataset
#### (Since most of the classification predictions will probably come from this new label set in the future)

In [22]:
# Number of must-take records reserved for the holdout (test) set.
HOLDOUT_SIZE = 500

# Fixed random_state keeps the holdout selection reproducible across runs.
df_tst = df_must_take_TEMP.sample(
    n=HOLDOUT_SIZE,
    random_state=42,
)
df_tst.shape

(500, 3)

In [23]:
# Persist the holdout test records (without the index column) for later use.
holdout_csv_path = f'{PATH}holdout_for_Sampling_test.csv'
df_tst.to_csv(holdout_csv_path, index=False)

In [25]:
# Remove the rows that went into the holdout set, matching them by index label,
# so the training must-take records and the holdout records do not overlap.
holdout_rows = df_tst.index
df_must_take = df_must_take_TEMP.drop(index=holdout_rows)
df_must_take.shape

(8579, 3)

### Determine which records are remaining (labels found in BOTH New and Previous datasets)

In [26]:
# Everything in the filtered NEW dataset whose label is NOT one of the new labels.
is_new_label = df_N2['brand_id'].isin(labels_unique)
df_remaining = df_N2[~is_new_label]
df_remaining.shape

(848480, 3)

#### TR_SIZE is desired size of Training dataset

In [27]:
TR_SIZE = 25000

# The must-take records (labels new to this dataset) are always included; the rest of
# the training set is filled with a random sample of the remaining records.
# len() gives the row count; DataFrame.count()[0] would only count non-null values of
# the first column and relies on deprecated positional Series indexing.
n_fill = TR_SIZE - len(df_must_take)

# Fail early with a clear message instead of an opaque error from DataFrame.sample.
if not 0 <= n_fill <= len(df_remaining):
    raise ValueError(
        "Cannot draw " + str(n_fill) + " records from " + str(len(df_remaining))
        + " remaining records; check TR_SIZE against the must-take and remaining counts."
    )

# NOTE: this overwrites df_remaining with its own sample, so after changing TR_SIZE the
# cell that builds df_remaining has to be re-run before this one.
df_remaining = df_remaining.sample(n_fill, random_state = 42)
len(df_remaining)

16421

### This is the new Sample dataset we will use for training
#### Append the Remaining sample to the Must-Take records (new unique labels)

In [28]:
# Stack the must-take records on top of the sampled remaining records.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps the original index labels by default.
df_ts = pd.concat([df_must_take, df_remaining])
df_ts.shape

(25000, 3)

### Write the training data out to a file

In [29]:
# Persist the final training sample (without the index column).
training_csv_path = f'{PATH}training_sample.csv'
df_ts.to_csv(training_csv_path, index=False)