In [1]:
!git clone https://github.com/YafetPontoh/capstone.git

fatal: destination path 'capstone' already exists and is not an empty directory.


In [2]:
%cd capstone

/content/capstone


In [3]:
!pip install nltk



In [46]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

# Train Test
from sklearn.model_selection import train_test_split

# Encode
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import numpy as np

# Adam
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.regularizers import l1, l2, l1_l2

# Bidirectional
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
# Location of the CSV inside the cloned repository.
data_path = '/content/capstone/dataset.csv'
# Read the raw dataset into a DataFrame.
data = pd.read_csv(data_path)
# Show the first rows as the cell output.
data.head()

Unnamed: 0,Sentence,Type
0,"Ali, who was two at the time, loved the story ...",Auditory
1,"Look at my dad, spiffed up in jodhpurs, ready ...",Auditory
2,Believe me it's far more difficult to know wha...,Auditory
3,"The Surrealists loved bad movies, seeing them ...",Auditory
4,"He idolised prize-fighters, regarded racketeer...",Auditory


## Assessing Data


In [6]:
class DataAssessing:
  """Print a quick quality report for a pandas DataFrame.

  The report lists missing values per column, the number of fully duplicated
  rows, the frame's ``info()`` summary and its ``describe()`` statistics.
  Each section is followed by a separator line.
  """

  def __init__(self, data):
    """Store a reference to the DataFrame to inspect (no copy is made)."""
    self.data = data

  def assessing_data(self):
    """Run all checks in order: missing values, duplicates, info, describe."""
    self.ismissing()
    self.isduplicate()
    self.info()
    self.describe()

  def ismissing(self):
    """Print the count of missing values for every column."""
    print('Jumlah missing value: ')
    print(self.data.isna().sum())
    self.pembatas()

  def info(self):
    """Print the DataFrame's structural summary (dtypes, non-null counts)."""
    # DataFrame.info() writes its summary itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the report.
    self.data.info()
    self.pembatas()

  def describe(self):
    """Print descriptive statistics for all columns, numeric or not."""
    print(self.data.describe(include='all'))
    self.pembatas()

  def isduplicate(self):
    """Print how many rows are exact duplicates of an earlier row."""
    print('data duplikat: {}'.format(self.data.duplicated().sum()))
    self.pembatas()

  def pembatas(self):
    """Print the separator line that ends each report section."""
    print('--------------------------')

# Build the assessor for the loaded data and print the full report.
assessing_data = DataAssessing(data)
assessing_data.assessing_data()

Jumlah missing value: 
Sentence    0
Type        0
dtype: int64
--------------------------
data duplikat: 758
--------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15450 entries, 0 to 15449
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  15450 non-null  object
 1   Type      15450 non-null  object
dtypes: object(2)
memory usage: 241.5+ KB
None
--------------------------
                   Sentence    Type
count                 15450   15450
unique                14594       3
top     Show More Sentences  Visual
freq                    667    5827
--------------------------


## Cleaning Data

In [7]:
class DataCleaning:
  """Handle missing values and duplicated rows of a pandas DataFrame.

  The DataFrame passed in is modified in place; ``cleaning`` returns that
  same object.
  """

  # Every strategy accepted by ``missing_value``. All but 'drop' fill the gaps.
  VALID_STRATEGIES = ('ffill', 'bfill', 'mean', 'median', 'mode', 'drop')

  def __init__(self, data):
    """Store a reference to the DataFrame to clean (no copy is made)."""
    self.data = data

  def cleaning (self, strategy):
    """Resolve missing values with ``strategy``, then drop duplicate rows.

    Args:
      strategy: one of ``VALID_STRATEGIES``.

    Returns:
      The cleaned DataFrame (the same object that was passed to ``__init__``).

    Raises:
      ValueError: if ``strategy`` is not a supported option.
    """
    self.missing_value(strategy)
    self.duplicate_data()
    return self.data

  def missing_value(self, strategy):
    """Fill or drop missing values in place according to ``strategy``.

    'ffill'/'bfill' propagate neighbouring values, 'mean'/'median' fill
    numeric columns with the column statistic, 'mode' fills every column with
    its most frequent value, and 'drop' removes rows containing a missing value.

    Raises:
      ValueError: if ``strategy`` is not a supported option.
    """
    if strategy not in self.VALID_STRATEGIES:
      raise ValueError(f"Valid options are: {', '.join(self.VALID_STRATEGIES)}")

    if strategy == 'drop':
      self.data.dropna(inplace=True)
    elif strategy == 'ffill':
      self.data.ffill(inplace=True)
    elif strategy == 'bfill':
      self.data.bfill(inplace=True)
    elif strategy == 'mode':
      # mode() returns one row per tied mode; the first row is used.
      modes = self.data.mode()
      if not modes.empty:
        self.data.fillna(modes.iloc[0], inplace=True)
    else:
      # 'mean' / 'median' are statistics, not fillna methods: compute them for
      # the numeric columns and fill each column with its own value.
      stats = getattr(self.data, strategy)(numeric_only=True)
      self.data.fillna(stats, inplace=True)

  def duplicate_data(self):
    """Remove rows that exactly duplicate an earlier row, in place."""
    self.data.drop_duplicates(inplace=True)

In [8]:
# Bind the instance to its own name: assigning it to `DataCleaning` replaces
# the class, so re-running this cell would fail with
# "'DataCleaning' object is not callable".
data_cleaner = DataCleaning(data)
data = data_cleaner.cleaning(strategy='drop')
# NOTE(review): the assessment output lists 'Show More Sentences' as the most
# frequent Sentence value (667 rows), which looks like a scraping artifact.
# De-duplication still leaves such rows behind - consider filtering them out.

In [9]:
# Print the assessment report again, this time for the cleaned frame.
DataAssessing(data).assessing_data()

Jumlah missing value: 
Sentence    0
Type        0
dtype: int64
--------------------------
data duplikat: 0
--------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 14692 entries, 0 to 15449
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  14692 non-null  object
 1   Type      14692 non-null  object
dtypes: object(2)
memory usage: 344.3+ KB
None
--------------------------
                                                 Sentence    Type
count                                               14692   14692
unique                                              14594       3
top     A sound film is a motion picture with synchron...  Visual
freq                                                    3    5548
--------------------------


## Data Preprocessing

In [10]:
def lower_clean(text):
  """Normalize a sentence to lowercase, letters-only, space-separated tokens.

  Args:
    text: the raw sentence (a ``str``).

  Returns:
    The tokens produced by ``word_tokenize`` joined with single spaces.
  """
  text = text.lower()
  # Keep every whitespace character (not only ' ') so that words separated by
  # a tab or newline are not glued together when other characters are removed.
  text = re.sub(r'[^a-z\s]','',text)
  tokens = word_tokenize(text)
  return ' '.join(tokens)

# Features: every sentence passed through lower_clean.
X = data['Sentence'].apply(lower_clean)
# Targets: the raw 'Type' labels.
y = data['Type']

In [11]:
# Display the cleaned sentences as the cell output.
X

Unnamed: 0,Sentence
0,ali who was two at the time loved the story ab...
1,look at my dad spiffed up in jodhpurs ready to...
2,believe me its far more difficult to know what...
3,the surrealists loved bad movies seeing them a...
4,he idolised prizefighters regarded racketeers ...
...,...
15445,her appearance as the conwoman trying to fleec...
15446,however mostly in these classes i am trying to...
15447,no i can not lay down with a hanger hooked on ...
15448,in the meantime im trying to keep the organiza...


In [12]:
# Encoder
# Keep the fitted encoder so predicted class indices can be mapped back to
# label names (label_encoder.classes_ / label_encoder.inverse_transform).
label_encoder = LabelEncoder()
# Encode from the source column rather than from `y` itself, so re-running this
# cell does not try to encode the already one-hot matrix.
y = to_categorical(label_encoder.fit_transform(data['Type']))

In [13]:
# Hold out 20% of the samples as the test set; random_state is fixed at 42.
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [14]:
def tokenize(X_train, X_test):
  """Turn train and test texts into equal-length integer sequences.

  The tokenizer is fitted on ``X_train`` only. Both sets are then brought to
  the length of the longest training sequence, truncating longer sequences
  from the front.

  Returns:
    A tuple ``(train_sequences, test_sequences, tokenizer, max_len)``.
  """
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(X_train)

  # Map words to integer ids with the vocabulary learned from the training set.
  train_ids = tokenizer.texts_to_sequences(X_train)
  test_ids = tokenizer.texts_to_sequences(X_test)

  # The longest training sequence defines the common length.
  max_len = max(map(len, train_ids))
  pad_options = dict(maxlen=max_len, truncating='pre')
  train_padded = pad_sequences(train_ids, **pad_options)
  test_padded = pad_sequences(test_ids, **pad_options)
  return train_padded, test_padded, tokenizer, max_len


In [15]:
# Tokenize and pad both splits; keep the tokenizer and max_len for later cells.
sequences, sequences_test, tokenizer, max_len = tokenize(X_train, X_test)

In [67]:
# LSTM Model
def create_lstm(vocab_size, max_len):
  """Build and compile the two-layer LSTM classifier.

  Args:
    vocab_size: size of the embedding vocabulary.
    max_len: length of each input sequence.

  Returns:
    A compiled ``Sequential`` model with a 3-unit softmax output.
  """
  model = Sequential()

  stack = (
    Embedding(vocab_size, output_dim=200, input_shape=(max_len,)),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128,
         dropout=0.1,
         recurrent_dropout=0.1,
         return_sequences=True,
         kernel_regularizer=l2(0.01),
         recurrent_regularizer=l2(0.01)),
    LSTM(64,
         kernel_regularizer=l2(0.01),
         recurrent_regularizer=l2(0.01)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax'),
  )
  for layer in stack:
    model.add(layer)

  model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
  )
  return model

In [68]:
# +1 leaves room for index 0, which the tokenizer's word indices (starting at
# 1) do not use.
vocab_size = len(tokenizer.index_word) + 1
# max_len comes from the tokenize step; the former `max_len = max_len` line
# was a no-op and has been dropped.
model = create_lstm(vocab_size, max_len)

In [69]:
# Print the layer-by-layer summary of the model.
model.summary()

In [70]:
# Stop once val_loss has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


# `validation_split` is intentionally not passed: Keras ignores it whenever
# `validation_data` is given, so supplying both only obscured which data drives
# early stopping.
# NOTE(review): the validation set here is the test split, so the test data
# influences when training stops. Carve a separate validation set out of the
# training data if an unbiased test score is needed.
history_models = model.fit(sequences, Y_train,
                    epochs=500,
                    validation_data=(sequences_test, Y_test),
                    verbose=1,
                    batch_size=32,
                    validation_freq=1,
                    shuffle=True,
                    callbacks=[early_stopping])

Epoch 1/500
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 213ms/step - accuracy: 0.4039 - loss: 2.7180 - val_accuracy: 0.6887 - val_loss: 0.6074
Epoch 2/500
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 214ms/step - accuracy: 0.7740 - loss: 0.5055 - val_accuracy: 0.9343 - val_loss: 0.2918
Epoch 3/500
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 208ms/step - accuracy: 0.9764 - loss: 0.1591 - val_accuracy: 0.9330 - val_loss: 0.2421
Epoch 4/500
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 213ms/step - accuracy: 0.9886 - loss: 0.0937 - val_accuracy: 0.9275 - val_loss: 0.2509
Epoch 5/500
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 209ms/step - accuracy: 0.9905 - loss: 0.0707 - val_accuracy: 0.9500 - val_loss: 0.2025
Epoch 6/500
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 214ms/step - accuracy: 0.9917 - loss: 0.0581 - val_accuracy: 0.9350 - val_loss: 0.2238
Epoc

In [73]:
new_texts = ["I enjoy learning new skills", "Deep learning is fascinating"]
# Apply the same cleaning used on the training sentences, so inference inputs
# are tokenized consistently with the text the tokenizer was fitted on.
new_texts_clean = [lower_clean(text) for text in new_texts]
new_sequences = tokenizer.texts_to_sequences(new_texts_clean)
# Pad/truncate exactly as in tokenize().
new_sequences_padded = pad_sequences(new_sequences, maxlen=max_len, truncating='pre')

predictions = model.predict(new_sequences_padded)
# Index of the highest-probability class for each text.
predicted_classes = predictions.argmax(axis=-1)

print("Predictions:", predictions)
print("Predicted Classes:", predicted_classes)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
Predictions: [[0.2451672  0.713888   0.0409448 ]
 [0.66289306 0.27225596 0.06485103]]
Predicted Classes: [1 0]


In [74]:
# Save Model
# NOTE(review): only the model is written in this cell. The fitted tokenizer
# and max_len are not saved here, so they must be persisted separately if the
# model is to be used outside this notebook - confirm.
model.save('model_learning_style.h5')

