### This is using FastAI v1 codebase

In [1]:
from fastai.text import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Begin reading in and prepping data

In [3]:
PATH = "data/sample/"

df_trn = pd.read_csv(f'{PATH}training_data.csv')

df_tst = pd.read_csv(f'{PATH}holdout_for_test.csv')
df_trn.shape, df_tst.shape

((25000, 3), (500, 3))

### Get a list of unique labels from both Train/Valid and Test datasets

In [4]:
# keep a list of unique lables that we can use later when testing
labels = df_trn['label_id'].unique()
labels.sort()

# Do the same for the Test (holdout) data
labels_tst = df_tst['label_id'].unique()
labels_tst.sort()

labels.shape, labels_tst.shape

((5316,), (382,))

### See if any Test labels are unique to Train/Valid labels

In [5]:
s = set(labels)
t = set(labels_tst)
if (t.issubset(s) == False) or (s.issuperset(t) == False):
    labels_tst_unique = sorted(t.difference(s))
    print("There are " + str(len(labels_tst_unique)) + " unique labels in Test that are NOT found in Train/Valid")
else:
    print("No unique Test labels")

There are 66 unique labels in Test that are NOT found in Train/Valid


### Display some of the Test Data which has Unique labels (not found in TRN Dataset)

In [None]:
pd.options.display.max_colwidth = 0
pd.options.display.html.use_mathjax = False
df_tst[df_tst['label_id'].isin(labels_tst_unique)]

## Begin Classification Predicition against Holdout (Test) Dataset

### Load the Fully-trained TC Learner that was originally saved with the last retraining with learner.export()

In [7]:
learn = load_learner(PATH, 'TC_export')

### Select range of rows to include from Test Dataset

In [12]:
R_START = 250
R_END = 500

### Iterate thru Fastai's learn.predict() method for each row in selected Test Dataset

In [13]:
pred_lst=[]
trg_lst=df_tst.iloc[R_START:R_END]['label_id']
for t in df_tst.iloc[R_START:R_END]['text']:
    pred = learn.predict(t)
    pred_lst.append(int(str(pred[0])))

### Determine which classification predictions were correct

In [14]:
b = [x==y for (x,y) in zip(list(trg_lst), pred_lst)]
res_lst = ['ok' if i==True else 'XXX' for i in b]
pct = res_lst.count('ok')/len(res_lst)

### Display results

In [11]:
d={'Target': trg_lst, 'Prediction': pred_lst, 'Result':res_lst}
pd.DataFrame(data=d)

Unnamed: 0,Target,Prediction,Result
0,6876,9036,XXX
1,25031,1783,XXX
2,4481,15348,XXX
3,42481,42481,ok
4,5144,5144,ok
5,44,44,ok
6,42757,42757,ok
7,13650,13650,ok
8,42529,42529,ok
9,43986,43986,ok


In [11]:
print('Accuracy: ' + str(pct))

Accuracy: 0.78


# Section Below can be used for Manual Testing

### Use below if you want to RANDOMLY choose text from the Test Dataset to classify

In [None]:
pd.options.display.max_colwidth = 0
pd.options.display.html.use_mathjax = False

LBL_ID = np.random.choice(np.array(labels_tst),1,False)[0]
    
t = str(df_tst[df_tst['label_id']==LBL_ID].iloc[0].text)

print(LBL_ID, t)

In [45]:
learn.predict(t)

(Category 831,
 tensor(264),
 tensor([8.8164e-09, 1.1244e-07, 3.7135e-06,  ..., 5.3147e-07, 9.4112e-06,
         5.2382e-06]))

### Use below if you want to choose a SPECIFIC label of the Test Dataset to classify

In [None]:
LBL_ID = 28370
t = str(df_trn[df_trn['label_id']==LBL_ID].text)
print(LBL_ID, t)

In [48]:
learn.predict(t)

(Category 28370,
 tensor(1665),
 tensor([2.0339e-08, 2.4228e-06, 3.1566e-05,  ..., 4.2645e-06, 9.2083e-06,
         2.3035e-05]))

### Use below if you want to supply YOUR OWN TEST STRING to run thru the classifier:

In [40]:
t="We canceled all the fraudulent orders that were made on your computer and refunded the money where possible.""

In [41]:
learn.predict(t)

(Category 2331,
 tensor(512),
 tensor([7.3616e-06, 3.4632e-09, 3.3857e-05,  ..., 3.1489e-09, 4.5093e-08,
         6.3345e-10]))