In [1]:
# import libraries
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the raw train / test splits from disk
train, test = (pd.read_csv(csv_path) for csv_path in ('train0.csv', 'test0.csv'))

In [3]:
# trim leading/trailing whitespace in every object-dtype column; other columns pass through
train, test = (
    frame.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
    for frame in (train, test)
)

In [4]:
# turn the textual missing-value markers into np.nan
missing_markers = ["nan", "?", "None"]
train = train.replace(missing_markers, np.nan)
test = test.replace(missing_markers, np.nan)

In [5]:
# Coerce 'Revenue', 'Net_Valuation', 'Share_Price' to numbers; unparseable entries become NaN
for frame in (train, test):
    for numeric_col in ('Revenue', 'Net_Valuation', 'Share_Price'):
        frame[numeric_col] = pd.to_numeric(frame[numeric_col], errors='coerce')


In [6]:
# inspect the column names of the training frame
train.columns

Index(['File_Name', 'Location', 'Sector', 'Employees', 'Revenue',
       'Net_Valuation', 'Share_Price', 'Company_Background', 'Product'],
      dtype='object')

In [7]:
# list the distinct values of the Product column (used as the label later in the notebook)
train.Product.unique()

array(['AI', 'Truck', 'Medical devices', 'antivirus-security', 'Hydro',
       'Organicfood', 'charcoal', 'Kids toys', 'Fuel', 'Gaming hardware',
       'Bike', 'allopathy', 'Solar'], dtype=object)

In [8]:
# # drop File_Name
# train.drop('File_Name',axis=1,inplace=True)

# File_Name = test['File_Name']
# test.drop('File_Name',axis=1,inplace=True)

In [9]:
# number of unique values in Location across train and test
# (pd.concat is used because Series.append was removed in pandas 2.0)
pd.concat([train.Location, test.Location]).unique().shape

(232,)

In [10]:
# unique values in Sector across train and test
# (pd.concat is used because Series.append was removed in pandas 2.0)
pd.concat([train.Sector, test.Sector]).unique()

array(['Public_sector', nan, 'Private_sector'], dtype=object)

In [11]:
# swap the underscore in the Sector labels for a space
sector_labels = {'Public_sector': 'Public sector', 'Private_sector': 'Private sector'}
train['Sector'] = train['Sector'].replace(sector_labels)
test['Sector'] = test['Sector'].replace(sector_labels)

In [12]:
# fill missing values in Company_Background with empty string
# The result is assigned back to the column: calling fillna(..., inplace=True) on a
# column pulled out of the frame is a chained operation that may act on a temporary
# copy (and does so under pandas copy-on-write), leaving the frame unchanged.
train['Company_Background'] = train['Company_Background'].fillna('')
test['Company_Background'] = test['Company_Background'].fillna('')

In [13]:
# concatenate Company_Background with all other columns

def _append_sentence(df, column, prefix, suffix, render=None):
    """Append ``prefix + value + suffix`` to Company_Background where `column` is not null.

    Parameters
    ----------
    df : DataFrame modified in place.
    column : name of the column whose values are inserted into the sentence.
    prefix, suffix : text placed before / after the value.
    render : optional callable turning the selected values into strings
        (needed for numeric columns, which cannot be concatenated to text directly).
    """
    present = df[column].notnull()
    values = df.loc[present, column]
    if render is not None:
        values = render(values)
    df.loc[present, 'Company_Background'] = (
        df.loc[present, 'Company_Background'] + prefix + values + suffix
    )


def _one_decimal_text(values):
    """Round numeric values to one decimal place and convert them to strings."""
    return values.round(1).astype(str)


def add_company_facts(df):
    """Fold the structured columns of `df` into Company_Background as sentences.

    The same function is applied to train and test so both get identical wording.
    Previously the train-only Employees sentence carried a stray '.'
    ("has 120. employees."), which made train and test text differ.

    Side effect kept from the original code: missing Employees are filled with 0
    and the column is cast to int.
    """
    _append_sentence(df, 'Location', ' This Company is based out of ', '.')
    _append_sentence(df, 'Sector', ' This Company is in the ', '.')

    # Employees: remember which rows had a value BEFORE filling, so rows that were
    # missing do not get a "has 0 employees" sentence.
    has_employees = df['Employees'].notnull()
    df['Employees'] = df['Employees'].fillna('0').astype(float).astype(int)
    df.loc[has_employees, 'Company_Background'] = (
        df.loc[has_employees, 'Company_Background']
        + ' This Company has '
        + df.loc[has_employees, 'Employees'].astype(str)
        + ' employees.'
    )

    # NOTE: the 'Milions' spelling is kept as-is because a manually written test row
    # later in the notebook uses the same spelling.
    _append_sentence(df, 'Revenue', ' This Company has a revenue of ', ' Milions.', _one_decimal_text)
    _append_sentence(df, 'Net_Valuation', ' This Company has a net valuation of ', '.', _one_decimal_text)
    _append_sentence(df, 'Share_Price', ' This Company has a share price of ', '.', _one_decimal_text)


# Train data
add_company_facts(train)

# Test data
add_company_facts(test)


In [14]:
# shape of the subset of training rows whose Company_Background is still empty
train.loc[train['Company_Background'].eq('')].shape

(0, 9)

In [15]:
# the structured columns are no longer needed in train and test, so drop them
structured_cols = ['Location','Sector','Employees','Revenue','Net_Valuation','Share_Price']
train = train.drop(columns=structured_cols)
test = test.drop(columns=structured_cols)

In [20]:
# Manually overwrite the text of the test row labelled 0.
# .loc writes directly into the frame; the chained form `test.Company_Background[0] = ...`
# can end up assigning into a temporary copy.
test.loc[0, 'Company_Background'] = 'This Company has a revenue of 139.3 Milions. This Company has a net valuation of 27869.4. This Company has a share price of 214428.8.'

In [23]:
# persist the preprocessed frames (without the index column)
for frame, out_path in ((train, 'train1.csv'), (test, 'test1.csv')):
    frame.to_csv(out_path, index=False)

In [24]:
# reload the preprocessed frames written by the previous cell
train, test = (pd.read_csv(csv_path) for csv_path in ('train1.csv', 'test1.csv'))

In [25]:
# preview the first rows of both frames
display(train.head(), test.head())

Unnamed: 0,File_Name,Company_Background,Product
0,PROJ0x130f.html,More complex computations may involve many ope...,AI
1,PROJ0x61c1.html,It would have had two super heavy variants: on...,Truck
2,PROJ0x520d.html,"The standard ISO 26262, is considered as one o...",Truck
3,PROJ0x1d31.html,Diana Zuckerman published in the peer-reviewed...,Medical devices
4,PROJ0x77de.html,"From this perspective, security and insecurity...",antivirus-security


Unnamed: 0,File_Name,Company_Background
0,PROJ0x1016.html,This Company has a revenue of 139.3 Milions. T...
1,PROJ0x1024.html,We shall call it information technology (IT). ...
2,PROJ0x102f.html,MAN SE holds a 17.All of the above light comme...
3,PROJ0x1033.html,"It is not only the ""cause of knowledge and tru..."
4,PROJ0x1034.html,"Although industry restructuring proceeded, the..."


In [26]:
# drop File_Name from both frames; the test identifiers are kept in `File_Name`
train = train.drop(columns='File_Name')

# pop removes the column from `test` and hands back the removed Series
File_Name = test.pop('File_Name')

In [27]:
# Change categorical data to numerical data
product_to_id = {'AI' : 0, 'Truck' : 1, 'Medical devices' : 2, 'antivirus-security' : 3, 'Hydro' : 4,
                 'Organicfood' : 5, 'charcoal' : 6, 'Kids toys' : 7, 'Fuel' : 8, 'Gaming hardware' : 9,
                 'Bike' : 10, 'allopathy' : 11, 'Solar' : 12}

# astype(int) makes the integer dtype explicit instead of relying on replace() to
# downcast the object column implicitly (deprecated pandas behaviour). The column is
# later used as an index array, which requires integers. An unmapped product name
# now fails here rather than further down the notebook.
train['Product'] = train['Product'].replace(product_to_id).astype(int)

In [28]:
# import transformers and tokenizers
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [30]:
# load the pretrained 'bert-base-cased' tokenizer
# (the cell output below shows download progress bars)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [31]:
# zero arrays of shape (len(train), 256) for the token ids and attention masks.
# int32 matches the dtype declared on the model's input layers, and avoids storing
# integer ids as float64.
X_input_ids = np.zeros((len(train), 256), dtype=np.int32)
X_attn_masks = np.zeros((len(train), 256), dtype=np.int32)

In [32]:
def generate_training_data(train, ids, masks, tokenizer):
    """Tokenize every Company_Background text and write the result into `ids` / `masks`.

    Parameters
    ----------
    train : DataFrame with a 'Company_Background' column (train or test frame).
    ids, masks : pre-allocated 2-D arrays with one row per text; filled in place.
        Their width (second dimension) is used as the tokenizer's max_length, so
        every text is truncated / padded to exactly that many tokens.
    tokenizer : object exposing `encode_plus` (here a BertTokenizer).

    Returns
    -------
    (ids, masks) : the same two arrays that were passed in, now populated.
    """
    texts = train['Company_Background']
    # Derive the sequence length from the buffer instead of hard-coding 256, so the
    # tokenizer output always fits the row it is written into.
    max_length = ids.shape[1]
    # enumerate() has no length, so pass total explicitly for a real progress bar.
    for i, text in tqdm(enumerate(texts), total=len(texts)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [33]:
# generate input ids and attention masks for the training texts
# (fills the pre-allocated arrays and rebinds the names to the returned arrays)
X_input_ids, X_attn_masks = generate_training_data(train, X_input_ids, X_attn_masks, tokenizer)


0it [00:00, ?it/s]

In [34]:
# Create a zero array for the labels: one row per training example, 13 columns
labels = np.zeros((len(train), 13))
labels.shape

(13452, 13)

In [35]:
# one hot encoding of labels: set column `Product` to 1 in each row
row_idx = np.arange(len(train))
class_idx = train['Product'].to_numpy()
labels[row_idx, class_idx] = 1

In [36]:
# creating a data pipeline using tensorflow dataset utility:
# each element is one (input_ids row, attention_mask row, label row) triple
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [37]:
def datasetMapFunction(input_ids, attn_masks, labels):
    """Repack one example as a ``(features, labels)`` pair.

    `features` is a dict keyed by 'input_ids' and 'attention_mask' (the names given
    to the model's input layers); `labels` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [38]:
# creating dataset with mapping input ids, attention masks and labels
dataset = dataset.map(datasetMapFunction)

# split dataset into train and validation
# The shuffle must produce the SAME order every time the dataset is iterated;
# otherwise take()/skip() select different examples on each epoch and validation
# examples leak into training. Hence reshuffle_each_iteration=False plus a fixed
# seed, and a buffer covering the whole dataset so the shuffle is uniform.
dataset = dataset.shuffle(
    len(train), seed=42, reshuffle_each_iteration=False
).batch(16, drop_remainder=True)

p = 0.8
train_size = int((len(train)//16)*p)

# Reshuffle only the training batches between epochs; the split itself stays fixed.
train_dataset = dataset.take(train_size).shuffle(train_size)
val_dataset = dataset.skip(train_size)

In [39]:
# import BERt model
from transformers import TFBertModel

In [40]:
# load the pretrained 'bert-base-cased' model
# NOTE(review): `model` is rebound to the full Keras classifier in a later cell, so this
# cell has to be re-run before that one can be executed again.
model = TFBertModel.from_pretrained('bert-base-cased') 

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [41]:
# create a model: BERT encoder -> Dense(512, relu) -> Dense(13, softmax)
# The two inputs are named after the keys of the feature dict produced by the dataset
# map function, and take 256-token int32 sequences.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

# element [1] of the BERT output is used as the sequence representation
# (presumably the pooled output — confirm against the transformers docs)
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1]
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
# 13 output units: one per Product class
output_layer = tf.keras.layers.Dense(13, activation='softmax', name='output_layer')(intermediate_layer)

# NOTE(review): this rebinds `model` from the TFBertModel to the Keras classifier, so
# re-running this cell alone would likely fail at `model.bert` above.
model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [42]:
# compile model with optimizer, loss and metrics
# The `decay` keyword of Adam is rejected by the current Keras optimizers. The same
# schedule, lr / (1 + 1e-6 * step), is expressed as an InverseTimeDecay instead.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-5, decay_steps=1, decay_rate=1e-6
)
optim = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# labels are one-hot vectors, hence the categorical (not sparse) loss and accuracy
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [43]:
# train model for 5 epochs, evaluating on the validation split after each epoch;
# the returned History object is kept in `history`
history = model.fit( train_dataset, validation_data=val_dataset, epochs=5)

Epoch 1/5


In [None]:
# save the full model (model.save stores the whole model, not only the weights)
# under 'product_class_model'
model.save('product_class_model')

In [None]:
# load the full model back from 'product_class_model' (not only weights)
model = tf.keras.models.load_model('product_class_model')

In [None]:
# prepare test data (int32 buffers, matching the dtype of the model's input layers)
X_input_ids = np.zeros((len(test), 256), dtype=np.int32)
X_attn_masks = np.zeros((len(test), 256), dtype=np.int32)

X_input_ids, X_attn_masks = generate_training_data(test, X_input_ids, X_attn_masks, tokenizer)

# create a data pipeline for test data
test_dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks))

# Named differently from the 3-argument training map function so that it does not
# shadow it (re-running the training pipeline cell would otherwise break).
def testDatasetMapFunction(input_ids, attn_masks):
    """Repack one unlabeled example as the feature dict expected by the model inputs."""
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }

# create dataset with mapping input ids and attention masks.
# The dataset must be batched: model.predict consumes each dataset element as a whole
# batch, so unbatched elements of shape (256,) do not match the (None, 256) inputs.
# No shuffling and no drop_remainder, so predictions keep the row order of `test`.
test_dataset = test_dataset.map(testDatasetMapFunction).batch(16)

In [None]:
# predict on test data; the output layer is a 13-way softmax, so `pred` holds one
# score per class for every test example
pred = model.predict(test_dataset)

In [None]:
# class names listed in the order of their integer ids (same order as the label encoding)
classes = [
    'AI',
    'Truck',
    'Medical devices',
    'antivirus-security',
    'Hydro',
    'Organicfood',
    'charcoal',
    'Kids toys',
    'Fuel',
    'Gaming hardware',
    'Bike',
    'allopathy',
    'Solar',
]

# collapse the per-class scores to the index of the highest-scoring class
pred = pred.argmax(axis=1)

In [None]:
# build the submission from the sample template: file names plus predicted class names
submission = pd.read_csv('sample_submission.csv')
submission['File_Name'] = File_Name
submission['Product'] = [classes[class_id] for class_id in pred]
submission.to_csv('submission.csv', index=False)