In [1]:
#meta 4/21/2021 myClassify (multiclass) for LSHTC - Data prep
# task: classify text (multiclass) 
# input: data/lshtc3_wikipedia_med_df_train_multilabel.pkl

#task: Multi-category text classification - supervised
#based on aml_0_dataprep.ipynb
#      4/15/2021 DATA PREP 
#      LSHTC3 reduced ds with less labels
#      Important reset index - fast.ai dataloaders don't like a gap in index
#      to avoid -> TypeError: 'float' object is not iterable

#history
#4/21/2021 INPUT DF W/ MULTI-LABELS
#      multi-labels per document
#      requires pandas 1.1.0+ (myTrainBox env nlp_lshtc)

In [2]:
%matplotlib inline
#import time as time #to track performance time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,6)
from joblib import load, dump


# LSHTC: Data Prep for Classifying Text

## 0. Load Data
Input: a prepared pickle with a reduced label set (139 `label_*` columns).

In [3]:
# Read the prepared multilabel training frame and show its shape, columns and first rows.
# NOTE: joblib.load unpickles the file, so only point it at trusted pickles.
df = load('data/lshtc3_wikipedia_med_df_train_multilabel.pkl')
print(df.shape, df.columns, sep='\n')
df.head()


(127043, 141)
Index(['labels', 'text', 'label_10088', 'label_10307', 'label_10590',
       'label_13139', 'label_14661', 'label_14902', 'label_16954',
       'label_21411',
       ...
       'label_389174', 'label_390846', 'label_395447', 'label_403289',
       'label_417577', 'label_427995', 'label_428198', 'label_429649',
       'label_437070', 'label_443106'],
      dtype='object', length=141)


Unnamed: 0_level_0,labels,text,label_10088,label_10307,label_10590,label_13139,label_14661,label_14902,label_16954,label_21411,...,label_389174,label_390846,label_395447,label_403289,label_417577,label_427995,label_428198,label_429649,label_437070,label_443106
docno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,[130762],Alain Connes (born 1 April 1947) is a French m...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[352578, 395447, 27512, 157031]",Ayn Rand (born Alisa Zinov'yevna Rosenbaum; Fe...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,"[390846, 395447, 276114]","Allan Dwan (April 3, 1885 – December 28, 1981)...",0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
5,"[14661, 71999, 292915, 188756, 131368]","Andre Kirk Agassi (born April 29, 1970) is an ...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"[106615, 228092]",Aldous Leonard Huxley (26 July 1894 \u2013 22 ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1. Data Prep
Tidy data - pick relevant columns

In [4]:
# target columns = every column whose name begins with 'label_'
y_cols = df.columns[[name.startswith('label_') for name in df.columns]]
y_cols

Index(['label_10088', 'label_10307', 'label_10590', 'label_13139',
       'label_14661', 'label_14902', 'label_16954', 'label_21411',
       'label_23214', 'label_27512',
       ...
       'label_389174', 'label_390846', 'label_395447', 'label_403289',
       'label_417577', 'label_427995', 'label_428198', 'label_429649',
       'label_437070', 'label_443106'],
      dtype='object', length=139)

In [5]:
# Build the tidy frame: drop the raw 'labels' list column, then cast each
# target column to 'category' through a per-column dtype mapping.
df_tidy = df.drop(columns='labels').astype({col: 'category' for col in y_cols})
print(df_tidy.shape)
df_tidy.dtypes


(127043, 140)


text              object
label_10088     category
label_10307     category
label_10590     category
label_13139     category
                  ...   
label_427995    category
label_428198    category
label_429649    category
label_437070    category
label_443106    category
Length: 140, dtype: object

Expected ML format: input features X (a matrix) and target variable y (also a matrix).  
X - column 'text'  
y - columns with 'label_' 

In [6]:
#preview ready dataset
df_tidy.head()

Unnamed: 0_level_0,text,label_10088,label_10307,label_10590,label_13139,label_14661,label_14902,label_16954,label_21411,label_23214,...,label_389174,label_390846,label_395447,label_403289,label_417577,label_427995,label_428198,label_429649,label_437070,label_443106
docno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Alain Connes (born 1 April 1947) is a French m...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ayn Rand (born Alisa Zinov'yevna Rosenbaum; Fe...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,"Allan Dwan (April 3, 1885 – December 28, 1981)...",0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
5,"Andre Kirk Agassi (born April 29, 1970) is an ...",0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Aldous Leonard Huxley (26 July 1894 \u2013 22 ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# fast.ai dataloaders don't like a gap in the index
# (symptom -> TypeError: 'float' object is not iterable),
# so replace the 'docno' index with the default integer index, in place.
# $note: use drop=False instead if 'docno' must be preserved as a column.
df_tidy.reset_index(inplace=True, drop=True)
df_tidy.iloc[:2]

Unnamed: 0,text,label_10088,label_10307,label_10590,label_13139,label_14661,label_14902,label_16954,label_21411,label_23214,...,label_389174,label_390846,label_395447,label_403289,label_417577,label_427995,label_428198,label_429649,label_437070,label_443106
0,Alain Connes (born 1 April 1947) is a French m...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Ayn Rand (born Alisa Zinov'yevna Rosenbaum; Fe...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [8]:
# save the tidy multilabel frame, prepped for text classification
dump(value=df_tidy, filename='output/lshtc3_wikipedia_med_df_train_tidy_multilabel.pkl')

['output/lshtc3_wikipedia_med_df_train_tidy_multilabel.pkl']

In [9]:
# NOTE(review): 'mystop' is never defined in the visible notebook, so this cell raises
# NameError (see the recorded output) — presumably a deliberate stop that keeps "Run All"
# from reaching the Xtra cell below; confirm before removing.
mystop

NameError: name 'mystop' is not defined

##### Xtra

In [None]:
# clean df: clear the name of the index axis, in place
df_tidy.rename_axis(None, axis='index', inplace=True)
df_tidy.iloc[:2]