<a href="https://colab.research.google.com/github/ZikryRamadhan/Bangkit2021_Pafin/blob/main/Capstone_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

## Getting the dataset

The dataset is available on Kaggle: https://www.kaggle.com/charanpuvvala/company-classification

Upload your `kaggle.json` API token to the working directory first (on Kaggle: Account -> Create New API Token).

In [2]:
! chmod 600 kaggle.json && (ls ~/.kaggle 2>/dev/null || mkdir ~/.kaggle) && mv kaggle.json ~/.kaggle/ && echo 'Done'

chmod: cannot access 'kaggle.json': No such file or directory


In [3]:
# Download the dataset archive with the Kaggle CLI, then list the working
# directory to confirm that the zip file is present.
! kaggle datasets download charanpuvvala/company-classification
! ls

company-classification.zip: Skipping, found more recently modified local copy (use --force to force download)
company-classification.zip  sample_data


## Preprocessing dataset

In [4]:
# Read the CSV straight from the downloaded archive: pandas infers zip
# compression from the file extension, so no explicit unzip step is needed.
df = pd.read_csv('company-classification.zip')
df.head()

Unnamed: 0,Category,website,company_name,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description
0,Commercial Services & Supplies,bipelectric.com,bip dipietro electric inc,Electrici...,,,,,"electricians vero beach, vero beach electrical...","Providing quality, reliable full service resid..."
1,Healthcare,eliasmedical.com,elias medical,site map | en español Elias Medical h...,Offering Bakersfield family medical care from ...,Welcome to ELIAS MEDICAL#sep#Family Medical Pr...,Get To Know Elias Medical#sep#Family Medical P...,,Elias Medical bakersfield ca family doctor med...,For the best value in Bakersfield skin care tr...
2,Commercial Services & Supplies,koopsoverheaddoors.com,koops overhead doors,Home About Us Garage Door Repair & Servi...,,Customer Reviews#sep#Welcome to Koops Overhead...,,,"Koops Overhead Doors, Albany Garage Doors, Tro...","Koops Overhead Doors specializes in the sales,..."
3,Healthcare,midtowneyes.com,midtown eyecare,918-599-0202 Type Size...,,Welcome to our practice!,,,,We would like to welcome you to Midtown Eyecar...
4,Commercial Services & Supplies,reprosecurity.co.uk,repro security ltd,Simply fill out our form below...,,Welcome to REPRO SECURITY Ltd,,,,Repro Security provide a range of tailor made ...


In [5]:
# Discard the scraped website fields that are not used for classification,
# then expose meta_description under the shorter name "Description".
unused_columns = ['company_name', 'homepage_text', 'h1', 'h2', 'h3',
                  'website', 'nav_link_text', 'meta_keywords']
df = df.drop(columns=unused_columns).rename(columns={'meta_description': 'Description'})
df.head()

Unnamed: 0,Category,Description
0,Commercial Services & Supplies,"Providing quality, reliable full service resid..."
1,Healthcare,For the best value in Bakersfield skin care tr...
2,Commercial Services & Supplies,"Koops Overhead Doors specializes in the sales,..."
3,Healthcare,We would like to welcome you to Midtown Eyecar...
4,Commercial Services & Supplies,Repro Security provide a range of tailor made ...


In [6]:
# Categories whose names contain more than one word.
# The author's note says these raise an error during training, so they have
# to be renamed or removed before the labels are encoded.
# NOTE(review): presumably because the label Tokenizer used later splits a
# multi-word label into several tokens — confirm.
err = ['Commercial Services & Supplies',
       'Energy & Utilities', 'Professional Services',
       'Corporate Services', 'Media, Marketing & Sales',
       'Information Technology', 'Consumer Discretionary', 
       'Transportation & Logistics', 'Consumer Staples']

In [7]:
# Remove every row whose Category is listed in `err`; this also makes the
# dataset smaller and the task easier.
# A single vectorised membership mask replaces one in-place drop per category.
# .copy() yields an independent frame, so later in-place operations do not act
# on a slice of the original.
df = df.loc[~df['Category'].isin(err)].copy()

In [8]:
# Discard every row that has a missing value in any remaining column
df = df.dropna()

In [9]:
# Shuffle the rows and renumber the index from 0.
# A fixed random_state makes the shuffle reproducible, and with it the
# train/validation split that is later taken from the row order.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
# Preview the filtered and shuffled frame
df.head()

Unnamed: 0,Category,Description
0,Healthcare,Principal Medical Group is a concierge medical...
1,Healthcare,Grafton Eye Center offers professional eye doc...
2,Financials,"We are a one-stop insurance provider, speciali..."
3,Materials,DC Fine Chemicals offer semi bulk chemicals fo...
4,Healthcare,Are you planning to get an eye exam? Learn mor...


## Build the model

Reference: https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%202%20-%20Exercise%20-%20Answer.ipynb

In [11]:
# Hyperparameters for tokenization, padding and the model
vocab_size = 1000  # Tokenizer num_words and Embedding input dimension
embedding_dim = 6 # Embedding output dimension; author's note: 16 -> 91% (NOTE(review): presumably accuracy — confirm)
max_length = 50 # pad_sequences maxlen and Embedding input_length; author's note: 128 -> 91% (same caveat)
trunc_type='post'  # NOTE(review): not passed to any pad_sequences call visible in this notebook, so the library default truncation applies
padding_type='post'  # passed as padding= to pad_sequences
oov_tok = "<OOV>"  # token the text Tokenizer substitutes for out-of-vocabulary words
training_portion = .8  # fraction of the samples used for training; the remainder is used for validation

In [12]:
# Accumulator for the stopword-stripped descriptions (filled in the next cell)
sentences = []
# One category label per row, in the current row order of df
labels = [x for x in df['Category']]
# Lowercase English stopwords to strip from the descriptions before tokenizing
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [13]:
# Strip stopwords from every description.
# `sentences` is rebuilt from scratch here so that re-running this cell does
# not append a second copy of the data and leave it longer than `labels`.
sentences = []
for description in df['Description']:
  sentence = description
  for word in stopwords:
    # Matching is by plain substring: only a lowercase stopword with a space
    # on both sides is replaced (by a single space).
    sentence = sentence.replace(" " + word + " ", " ")
  sentences.append(sentence)

# Both counts must be equal: one label per sentence
print(len(labels))
print(len(sentences))

18303
18303


In [14]:
# Split by position: the leading `training_portion` of the samples is used for
# training and the remaining tail for validation.
train_size = int(training_portion * len(sentences))

train_sentences, validation_sentences = sentences[:train_size], sentences[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

# Report the split point followed by the size of each partition
for count in (train_size,
              len(train_sentences), len(train_labels),
              len(validation_sentences), len(validation_labels)):
  print(count)

14642
14642
14642
3661
3661


In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training sentences only
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Encode the training text as integer sequences and pad them to max_length
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type)

# Compare the raw and padded lengths of a few samples
for sample in (0, 1, 10):
  print(len(train_sequences[sample]))
  print(len(train_padded[sample]))

31
50
10
50
27
50


In [16]:
# Encode the validation sentences with the tokenizer fitted on the training
# set, then pad them the same way as the training data.
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(
    validation_sequences,
    maxlen=max_length,
    padding=padding_type,
)

print(len(validation_sequences))
print(validation_padded.shape)

3661
(3661, 50)


In [17]:
# A second tokenizer, fitted on all labels, turns each category name into an
# integer id.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# Encode both partitions with the same label vocabulary
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(split))
    for split in (train_labels, validation_labels)
)

# Show the first three encoded labels and the array shape of each partition
for label_seq in (training_label_seq, validation_label_seq):
  for i in range(3):
    print(label_seq[i])
  print(label_seq.shape)

[1]
[1]
[2]
(14642, 1)
[3]
[4]
[3]
(3661, 1)


In [18]:
# Size the output layer from the label vocabulary instead of hard-coding it.
# Tokenizer ids start at 1 (0 is reserved), so one extra unit is needed for
# the largest label id to be a valid output index.
num_classes = len(label_tokenizer.word_index) + 1

# Small text classifier: embed each token, average the embeddings over the
# sequence, then classify with a dense head.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    # tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
# The sparse loss takes integer label ids directly, so no one-hot encoding is needed
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 6)             6000      
_________________________________________________________________
global_average_pooling1d (Gl (None, 6)                 0         
_________________________________________________________________
dense (Dense)                (None, 16)                112       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 85        
Total params: 6,197
Trainable params: 6,197
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Train for a fixed number of epochs, evaluating on the validation set after
# each one. The returned History object is kept in `history`.
num_epochs = 15
history = model.fit(
    x=train_padded,
    y=training_label_seq,
    validation_data=(validation_padded, validation_label_seq),
    epochs=num_epochs,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Test Predict

In [None]:
# Check which integer id the label tokenizer assigned to each category
# (index_word maps id -> token; use it to interpret predicted ids)
label_tokenizer.index_word

In [None]:
text = "start your business by planning it together with us" 

# Apply the same stopword stripping that was applied to the training text, so
# the model sees input preprocessed the same way at prediction time.
cleaned_text = text
for word in stopwords:
  cleaned_text = cleaned_text.replace(" " + word + " ", " ")

# Encode and pad exactly like the training data
test_string = [cleaned_text]
test = tokenizer.texts_to_sequences(test_string)
test_padded = pad_sequences(test, padding=padding_type, maxlen=max_length)

In [None]:
# Inspect the padded integer sequence that will be fed to the model
test_padded

In [None]:
# For each input row, the predicted label id is the position of the largest
# value in the model output.
res = model.predict(test_padded)
result = res.argmax(axis=1)
result

## Save the model

In [20]:
# Create the output directory if it does not exist yet, then export the
# trained model beneath it.
!mkdir -p saved_model
model.save('saved_model/my_model')

INFO:tensorflow:Assets written to: saved_model/my_model/assets
