# Text Processing for Accelerator project

A simplified pipeline processing text with FastText.

* Load CPA data
* Basic text cleaning
* Vectorize (with FastText)
* Reduce dimension using UMAP, both supervised and unsupervised
* Predict unclassified data

In [117]:
# this bit shouldn't be necessary if we pip install -e .   in the parent directory
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [102]:
import functools
from pprint import pprint
from time import time
from IPython.display import display, HTML
import logging
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
import umap

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express

pd.set_option('display.max_colwidth', None)

In [103]:
import text_processing


#### Load in the CPA data

In [4]:
CPA = text_processing.fetch_files()

2020-11-26 13:02:00,055 - text_processing - INFO - cleanded CPA File imported


In [53]:
CPA[CPA.Level.isin({2,3,4,5,6})].head(6)

Unnamed: 0,Order,Level,Code,Parent,Descr_old,Descr,Includes,Category_2,Category_1,Category_0
1,1208793,2,01,A,"Products of agriculture, hunting and related services","Products of agriculture, hunting and related services",,1,A,1
2,1208794,3,01.1,01,Non-perennial crops,Non-perennial crops,,1,A,1
3,1208795,4,01.11,01.1,"Cereals (except rice), leguminous crops and oil seeds","Cereals , leguminous crops and oil seeds","cereals, leguminous crops and oil seeds grown for food and other purposes",1,A,1
4,1208796,5,01.11.1,01.11,Wheat,Wheat,,1,A,1
5,1208797,6,01.11.11,01.11.1,Durum wheat,Durum wheat,,1,A,1
6,1208798,6,01.11.12,01.11.1,"Wheat, except durum wheat",Wheat,,1,A,1


In [104]:
CPA1 = CPA[CPA.Level==6][['Code','Descr_old','Descr','Category_0','Category_1','Category_2']].copy()
df = text_processing.clean_col(CPA1, "Descr")
df.drop('Descr',axis=1,inplace=True)
df.sample(5)

2020-11-26 17:03:43,339 - text_processing - INFO - Cleaning column: Descr 
2020-11-26 17:03:43,339 - text_processing - INFO - Cleaning column: Descr 
2020-11-26 17:03:43,339 - text_processing - INFO - Cleaning column: Descr 
2020-11-26 17:03:43,339 - text_processing - INFO - Cleaning column: Descr 
2020-11-26 17:03:43,339 - text_processing - INFO - Cleaning column: Descr 
2020-11-26 17:03:43,339 - text_processing - INFO - Cleaning column: Descr 
2020-11-26 17:03:43,339 - text_processing - INFO - Cleaning column: Descr 


Unnamed: 0,Code,Descr_old,Category_0,Category_1,Category_2,Descr_cleaned
3980,47.00.67,Retail trade services of games and toys,4,G,47,retail trade services games toys
240,01.49.11,"Domestic rabbits, live",1,A,1,domestic rabbits live
3666,45.31.12,Wholesale trade services of other motor vehicle parts and accessories,4,G,45,wholesale trade services motor vehicle parts accessories
1046,15.11.22,Patent leather and patent laminated leather; metallised leather,2,C,15,patent leather patent laminated leather metallised leather
2618,28.14.13,"Process control valves, gate valves, globe valves and other valves",2,C,28,process control valves gate valves globe valves valves


## Load FastText Pretrained

Note: This requires a fair bit of memory (peaks at about 17.5 GiB)

Recommend shutting down other kernels first, once this has loaded memory usage drops again.

This takes a few minutes to load in.

In [8]:
wv = text_processing.fetch_fasstext_pretrained(filepath="../../data/wiki.en.bin")

2020-11-26 13:02:11,119 - text_processing - INFO - Loading FastText pretrained from ../../data/wiki.en.bin
2020-11-26 13:05:51,347 - text_processing - INFO - Model loaded


### Vectorize CPA data using FastText

In [106]:
text_to_vec = functools.partial(text_processing.vectorize_text, wv)
df["Descr_cleaned_vectorized"] = df.Descr_cleaned.apply(
    text_to_vec
)

## Dimensionality Reduction using UMAP

### Unsupervised dimension reduction

In [20]:
# df["Descr_cleaned_vectorized_low_dimension"] = text_processing.reduce_dimensionality(df.Descr_cleaned_vectorized)

### Dimensionality Reduction using UMAP supervised

In [130]:
df['Cat'] = df.Category_2.astype(int)
df["Descr_cleaned_vectorized_low_dimension"] = text_processing.reduce_dimensionality_supervised(df.Descr_cleaned_vectorized, df.Cat)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:16:10,869 - text_processing - INFO - Now a

UMAP(min_dist=0.0, n_components=10, n_neighbors=10, random_state=3052528580,
     verbose=10)
Construct fuzzy simplicial set
Thu Nov 26 17:16:16 2020 Finding Nearest Neighbors
Thu Nov 26 17:16:17 2020 Finished Nearest Neighbor Search
Thu Nov 26 17:16:17 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Nov 26 17:16:27 2020 Finished embedding


# Use the Random Forest Classifier

https://www.datacamp.com/community/tutorials/random-forests-classifier-python

We have already split our data into training and test datasets and used UMAP supervised classification to reduce the dimension on the training set.   
We then used UMAP prediction to reduce the dimension on the test set.

We now use the random forest classifier on the training set, and see how it works on the test set.


In [131]:
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

X_data = np.array(list(df["Descr_cleaned_vectorized_low_dimension"].values))
y_data = np.array(list(df.Cat.values))

# Split dataset into training set and test set

train_set, test_set = train_test_split(df.copy(), test_size=0.2, random_state=42)

#Train the model using the training sets 
X_train = np.array(list(train_set["Descr_cleaned_vectorized_low_dimension"].values))
y_train = np.array(list(train_set.Cat.values))

X_test = np.array(list(test_set["Descr_cleaned_vectorized_low_dimension"].values))
y_test = np.array(list(test_set.Cat.values))

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)


#### Accuracy

In [132]:
#Import scikit-learn metrics module for accuracy calculation
y_test = np.array(list(test_set.Cat.values))
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.9440993788819876


In [133]:
# get a dataframe of the level 2 descriptions
Cat_2_descr = text_processing.get_category_description(CPA, 2)
Cat_2_descr.sample(5)

Unnamed: 0,Category_2,Category_2_descr
307,3,Fish and other fishing products; aquaculture products; support services to fishing
4252,58,Publishing services
4924,75,Veterinary services
948,14,Wearing apparel
5043,80,Security and investigation services


In [134]:

print(len(y_pred), len(y_test))
test_set['Predicted'] = pd.Series(data=y_pred.tolist(), index=test_set.index)
non_matches0 = test_set[test_set.Predicted != test_set.Cat].drop('Descr_cleaned_vectorized',axis=1)
non_matches1 = non_matches0.merge(Cat_2_descr, on='Category_2', how='left')
non_matches = non_matches1.merge(Cat_2_descr.astype({'Category_2':int}).rename(columns={'Category_2_descr':'Prediced_Cat_descr', 'Category_2':'Predicted'}),
                                 on='Predicted', how='left')


644 644


In [135]:
non_matches.sample(5)

Unnamed: 0,Code,Descr_old,Category_0,Category_1,Category_2,Descr_cleaned,Cat,Descr_cleaned_vectorized_low_dimension,Predicted,Category_2_descr,Prediced_Cat_descr
11,25.29.12,"Containers for compressed or liquefied gas, of metal",2,C,25,containers compressed liquefied gas metal,25,"[-0.8115108609199524, -1.430631399154663, 8.333356857299805, 6.275633335113525, 3.0283453464508057, 10.21987247467041, 6.265014171600342, 11.358818054199219, 6.2893805503845215, 3.3160688877105713]",27,"Fabricated metal products, except machinery and equipment",Electrical equipment
12,82.91.12,Collection agency services,8,N,82,collection agency services,82,"[1.7448574304580688, -0.2587381899356842, 9.817572593688965, -1.3335999250411987, 0.1072535365819931, 7.5663743019104, 10.364123344421387, 4.88882303237915, 8.78766918182373, 5.37332010269165]",74,"Office administrative, office support and other business support services","Other professional, scientific and technical services"
4,38.11.53,Used pneumatic tyres of rubber,2,E,38,used pneumatic tyres rubber,38,"[2.125735282897949, 0.18328115344047546, 11.344267845153809, 4.874699115753174, 7.469050407409668, 8.979768753051758, 3.2050251960754395, 9.698098182678223, 7.124595642089844, 5.445359230041504]",22,"Waste collection, treatment and disposal services; materials recovery services",Rubber and plastic products
29,49.42.19,Other removal services,4,H,49,removal services,49,"[0.9574466347694397, -0.08783234655857086, 10.253785133361816, -0.6963055729866028, 0.2968844771385193, 7.591365814208984, 8.62102222442627, 4.420943260192871, 9.097941398620605, 4.799103260040283]",37,Land transport services and transport services via pipelines,Sewerage services; sewage sludge
28,02.30.30,"Parts of plants, grasses, mosses and lichens suitable for ornamental purposes",1,A,2,parts plants grasses mosses lichens suitable ornamental purposes,2,"[0.5206250548362732, 1.4218617677688599, 9.28674030303955, 12.447073936462402, 0.7991464734077454, 6.10408353805542, 10.701151847839355, 1.7812113761901855, 9.422142028808594, 7.952404499053955]",1,"Products of forestry, logging and related services","Products of agriculture, hunting and related services"


In [145]:
CN = text_processing.fetch_CN_files()
CN.sample(3)

2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files imported and cleaned
2020-11-26 17:21:39,206 - text_processing - INFO - CN new Files i

Unnamed: 0,CN_Code,CPA_Code,CN_Section,CN_Level,Excl_removed,CN_Description_cleaned,Category_2
12199,81129289,24.45.30,8112 92 89,10,,Unwrought gallium; gallium powders,24
6362,38249945,20.59.59,3824 99 45,10,,Anti-scaling and similar compounds,20
2265,12129200,01.25.90,1212 92,7,not ground,"Locust beans ""carob"", fresh, chilled, frozen or dried, whether or",1


In [146]:
CN["Descr_cleaned_vectorized"] = CN.CN_Description_cleaned.apply(
    text_to_vec
)

CN["Descr_cleaned_vectorized_low_dimension"] = text_processing.train_predict_umap(
    train_set.Descr_cleaned_vectorized, train_set.Cat, CN.Descr_cleaned_vectorized)

#CN["Descr_cleaned_vectorized_low_dimension"] = text_processing.reduce_dimensionality(CN.Descr_cleaned_vectorized)

2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,731 - text_processing - INFO - Now applying umap to reduce dimension
2020-11-26 17:21:48,7

UMAP(min_dist=0.0, n_components=10, n_neighbors=10, random_state=3052528580,
     verbose=10)
Construct fuzzy simplicial set
Thu Nov 26 17:21:52 2020 Finding Nearest Neighbors
Thu Nov 26 17:21:52 2020 Finished Nearest Neighbor Search
Thu Nov 26 17:21:52 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Nov 26 17:22:00 2020 Finished embedding
	completed  0  /  30 epochs
	completed  3  /  30 epochs
	completed  6  /  30 epochs
	completed  9  /  30 epochs
	completed  12  /  30 epochs
	completed  15  /  30 epochs
	completed  18  /  30 epochs
	completed  21  /  30 epochs
	completed  24  /  30 epochs
	completed  27  /  30 epochs


In [149]:
CN_test_df = CN[(CN.Category_2.notnull()) & (CN.CN_Level==4)].drop('Descr_cleaned_vectorized',axis=1)
CN_test = np.array(list(CN_test_df["Descr_cleaned_vectorized_low_dimension"].values))
#y_test = np.array(list(test_set.Cat.values))
y_CN_pred=clf.predict(CN_test)

#Import scikit-learn metrics module for accuracy calculation
y_CN_test = np.array(list(CN_test_df.Category_2.astype(int)))
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_CN_test, y_CN_pred))

CN_test_df['Predicted'] = pd.Series(data=y_CN_pred.tolist(), index=CN_test_df.index)
CN_test_df.sample(10)

Accuracy: 0.09392265193370165


Unnamed: 0,CN_Code,CPA_Code,CN_Section,CN_Level,Excl_removed,CN_Description_cleaned,Category_2,Descr_cleaned_vectorized_low_dimension,Predicted
10403,69060000,23.32.13,6906,4,"(excl. of siliceous fossil meals or similar siliceous earths, refractory ceramic goods, chimney liners, pipes specifically manufactured for laboratories, insulating tubing and fittings and other piping for electrotechnical purposes)","Ceramic pipes, conduits, guttering and pipe fittings",23,"[-0.8890625834465027, -0.1693418174982071, 2.164217710494995, 7.909590244293213, 1.73959481716156, 3.0076496601104736, 10.418719291687012, 6.70717716217041, 13.447763442993164, 2.4053380489349365]",3
12075,80030000,24.43.24,8003,4,,"Tin bars, rods, profiles and wire, n.e.s.",24,"[-0.6886611580848694, 0.06508789211511612, 1.5180352926254272, 8.117136001586914, 2.4776270389556885, 2.8708581924438477, 9.843130111694336, 6.500426292419434, 12.549605369567871, 2.54296612739563]",3
6285,38220000,20.59.52,3822,4,"(excl. compound diagnostic reagents designed to be administered to the patient, blood-grouping reagents, animal blood prepared for therapeutic, prophylactic or diagnostic uses and vaccines, toxins, cultures of micro-organisms and similar products) not on a backing, and certified reference materials","Diagnostic or laboratory reagents on a backing, prepared diagnostic or laboratory reagents whether or",20,"[-0.44529882073402405, 0.13770945370197296, 0.12416297197341919, 8.576146125793457, 4.604890823364258, -0.5970372557640076, 5.340291500091553, 10.21834659576416, 8.548567771911621, 1.8389836549758911]",13
4118,27150000,23.99.13,2715,4,,"Bituminous mastics, cut-backs and other bituminous mixtures based on natural asphalt, on natural bitumen, on petroleum bitumen, on mineral tar or on mineral tar pitch",23,"[0.6560388803482056, 0.6665619611740112, 0.8140869736671448, 8.240727424621582, 4.632938385009766, 2.58549427986145, 7.72888708114624, 6.040282726287842, 9.439143180847168, 2.4615745544433594]",13
11831,74130000,25.93.12,7413,4,(excl. electrically insulated products),"Stranded wire, cables, plaited bands and the like, of copper",25,"[-0.2675167918205261, 0.4591939449310303, 0.2964555323123932, 8.457222938537598, 4.491921901702881, 2.7510898113250732, 8.40706729888916, 5.544773578643799, 9.628931999206543, 2.6415717601776123]",13
3883,26100000,07.29.19,2610,4,,Chromium ores and concentrates,7,"[-1.4609551429748535, 1.3051457405090332, 2.4970250129699707, 7.5469279289245605, 2.3846802711486816, 1.3388255834579468, 10.09211540222168, 10.378777503967285, 14.378164291381836, 4.065723419189453]",26
3867,26050000,07.29.19,2605,4,,Cobalt ores and concentrates,7,"[-1.4616997241973877, 1.3068289756774902, 2.4982056617736816, 7.54635763168335, 2.3854737281799316, 1.3362014293670654, 10.089130401611328, 10.381552696228027, 14.376641273498535, 4.067243576049805]",26
15424,90200000,32.99.59,9020,4,"(excl. protective masks having neither mechanical parts nor replaceable filters, and artificial respiration or other therapeutic respiration apparatus)",Breathing appliances and gas masks,32,"[-0.21305778622627258, 0.3645002841949463, -0.03430357575416565, 8.305000305175781, 3.178992748260498, 2.445214033126831, 7.956547737121582, 7.218453884124756, 11.039538383483887, 2.132152557373047]",3
10201,65070000,14.19.43,6507,4,"(excl. headbands used by sportsmen as sweatbands, knitted or crocheted)","Headbands, linings, covers, hat foundations, hat frames, peaks and chinstraps, for headgear",14,"[-0.5008111596107483, 0.6202476620674133, -0.23367127776145935, 8.453653335571289, 4.462147235870361, 2.4784324169158936, 8.892607688903809, 3.99592924118042, 8.160361289978027, 2.6944704055786133]",13
3880,26090000,07.29.15,2609,4,,Tin ores and concentrates,7,"[-0.8000733852386475, 0.4708726406097412, 1.6032538414001465, 7.985667705535889, 2.6121397018432617, 1.9071203470230103, 9.081939697265625, 8.136235237121582, 13.468693733215332, 2.656273603439331]",3


In [150]:
sample = CN_test_df.sample(5)
sample_test = np.array(list(sample["Descr_cleaned_vectorized_low_dimension"].values))
display(sample)
clf.predict(sample_test)

Unnamed: 0,CN_Code,CPA_Code,CN_Section,CN_Level,Excl_removed,CN_Description_cleaned,Category_2,Descr_cleaned_vectorized_low_dimension,Predicted
8949,56050000,13.96.11,5605,4,"(excl. yarns manufactured from a mixture of textile fibres and metal fibres, with anti-static properties; yarns reinforced with metal wire; articles with the character of trimmings), whether or not gimped,","Metallised yarn being textile yarn, or strip or the like of heading 5404 or 5405, of textile fibres, combined with metal in the form of thread, strip or powder or covered with metal",13,"[-0.33936792612075806, 0.5814311504364014, 0.11199671775102615, 8.453661918640137, 4.514398574829102, 2.82140851020813, 8.336427688598633, 5.3873186111450195, 9.444144248962402, 2.6217498779296875]",13
10733,71090000,24.41.40,7109,4,not further worked than semi-manufactured,"Base metals or silver, clad with gold,",24,"[-0.5094356536865234, -0.07921215146780014, 1.7569701671600342, 8.27991771697998, 2.158804178237915, 3.015340805053711, 10.629666328430176, 7.156980991363525, 14.166692733764648, 2.7605931758880615]",26
4118,27150000,23.99.13,2715,4,,"Bituminous mastics, cut-backs and other bituminous mixtures based on natural asphalt, on natural bitumen, on petroleum bitumen, on mineral tar or on mineral tar pitch",23,"[0.6560388803482056, 0.6665619611740112, 0.8140869736671448, 8.240727424621582, 4.632938385009766, 2.58549427986145, 7.72888708114624, 6.040282726287842, 9.439143180847168, 2.4615745544433594]",13
6835,40070000,22.19.20,4007,4,"(excl. ungimped single thread with a diameter of > 5 mm and textiles combined with rubber thread, e.g. textile-covered thread and cord)",Vulcanised rubber thread and cord,22,"[-0.9910297393798828, 0.5775982141494751, -0.6554473638534546, 8.50863265991211, 5.113471984863281, 3.0270373821258545, 8.389033317565918, 5.005032539367676, 8.77334976196289, 2.6810030937194824]",13
4571,28470000,20.13.63,2847,4,not solidified with urea,"Hydrogen peroxide, whether or",20,"[-0.9621415734291077, -0.5279266834259033, 0.7720573544502258, 8.87329387664795, 5.897768974304199, -2.553973913192749, 4.925012111663818, 15.024232864379883, 6.830611228942871, 0.5037585496902466]",35


array([13, 26, 13, 13, 35])

In [151]:
Cat_2_descr.head(30)

Unnamed: 0,Category_2,Category_2_descr
1,1,"Products of agriculture, hunting and related services"
274,2,"Products of forestry, logging and related services"
307,3,Fish and other fishing products; aquaculture products; support services to fishing
345,5,Coal and lignite
354,6,Crude petroleum and natural gas
365,7,Metal ores
382,8,Other mining and quarrying products
421,9,Mining support services
434,10,Food products
759,11,Beverages
