In [1]:
import random
import copy
import time
import gc
import torch
import pandas as pd

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from sklearn.metrics import f1_score
import os 

from sklearn.preprocessing import StandardScaler
from multiprocessing import  Pool
from functools import partial
from sklearn.decomposition import PCA
import torch as t
import torch.nn.functional as F

import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
import pickle

Using TensorFlow backend.


In [10]:
# Notebook-wide configuration.
# embed_size: width of each word vector; used as the embedding dimension and
# as the LSTM input size in BiLSTM.
embed_size = 300
# batch_size: not referenced by any cell visible in this notebook
# (presumably a leftover from the training notebook; verify before removing).
batch_size = 512
# max_features: number of rows requested for the nn.Embedding table in BiLSTM.
max_features = 9000

In [3]:
import re

def clean_text(x):
    """Replace every character that is not an ASCII letter, digit or whitespace.

    Each offending character is replaced by a single space, so the length of
    the text is preserved.

    Args:
        x (str): Input text.

    Returns:
        str: The text with punctuation and symbols blanked out.
    """
    # The class must be 'A-Z', not 'A-z': the range A-z also spans the ASCII
    # characters [ \ ] ^ _ ` that sit between 'Z' and 'a', which previously
    # slipped through the cleaning step untouched.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, ' ', x)

def clean_numbers(x):
    """Mask runs of ASCII digits with '#' placeholders and drop lone digits.

    Substitutions are applied in order, longest run first: five or more
    digits become '#####', then groups of four, three and two digits become
    the same number of '#', and finally any remaining single digit is removed.

    Args:
        x (str): Input text.

    Returns:
        str: The text with digits masked; returned unchanged when it
        contains no digit at all.
    """
    # Nothing to do when the text has no digit.
    if not re.search(r'\d', x):
        return x

    # (pattern, replacement) pairs; the order matters because each pass
    # consumes the digits that the shorter patterns would otherwise match.
    substitutions = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
        ('[0-9]{1}', ''),
    )
    for digit_pattern, placeholder in substitutions:
        x = re.sub(digit_pattern, placeholder, x)
    return x


# Informal abbreviation -> normalized word, applied by replace_norms().
mapping = {
    'gk': 'tidak',
    'ga': 'tidak',
    'gak': 'tidak',
    'tdk': 'tidak',
    'sm': 'sama',
    'sma': 'sama',
    'disruh': 'disuruh',
    'yg': 'yang',
    'dr': 'dari',
    'dri': 'dari',
    'udh': 'sudah',
    'sdh': 'sudah',
    'knp': 'kenapa',
    'pdhl': 'padahal',
    'tp': 'tapi',
    'tpi': 'tapi',
    'pd': 'pada',
    'td': 'tadi',
    'tdi': 'tadi',
}


def replace_norms(text):
    """Expand the abbreviations listed in `mapping` to their full words.

    Only whole words are replaced: the alternation of all keys is wrapped in
    word boundaries, so an abbreviation embedded in a longer word is left
    alone.

    Args:
        text (str): Input text.

    Returns:
        str: The text with every standalone abbreviation substituted.
    """
    alternatives = '|'.join(mapping)
    word_pattern = re.compile(r'\b(' + alternatives + r')\b')

    def _expand(match):
        # The matched text is always one of the keys of `mapping`.
        return mapping[match.group()]

    return word_pattern.sub(_expand, text)

In [4]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts loaded from the working directory, in the same order as before.
# `data` is not referenced by any other cell visible in this notebook.
data = _load_pickle('preprocessing.pickle')
# Label encoder: its `classes_` give the number of classes and the label names.
le = _load_pickle('encoder.pickle')
# Tokenizer: its `texts_to_sequences` is used in predict_single().
tokenizer = _load_pickle('tokenizer.pickle')
# Matrix used as the frozen embedding weights of BiLSTM.
embedding_matrix = _load_pickle('wordembedd.pickle')

In [5]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier on top of frozen, pre-computed embeddings.

    The constructor reads the module-level names `max_features`,
    `embed_size`, `embedding_matrix` and `le`, which must be defined before
    the class is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.hidden_size = 64
        dropout_rate = 0.1
        num_classes = len(le.classes_)

        # The freshly created embedding table is immediately overwritten with
        # the pre-computed matrix and excluded from gradient updates.
        self.embedding = nn.Embedding(max_features, embed_size)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained)
        self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(
            embed_size,
            self.hidden_size,
            bidirectional=True,
            batch_first=True,
        )
        # hidden_size * 4: two LSTM directions, times two pooled summaries
        # (mean and max) concatenated in forward().
        self.linear = nn.Linear(self.hidden_size * 4, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.out = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return the un-normalized class scores for a batch of token ids.

        Args:
            x: Integer tensor of token indices; predict_single() passes a
                (1, 600) long tensor. Assumed to be (batch, seq_len).

        Returns:
            Tensor with one row per input and one column per class, taken
            directly from the final linear layer (no softmax applied).
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Collapse dimension 1 (the sequence axis, given batch_first=True)
        # with both a mean and a max, then join the two summaries.
        mean_pooled = lstm_out.mean(1)
        max_pooled = lstm_out.max(1)[0]
        pooled = torch.cat((mean_pooled, max_pooled), 1)
        hidden = self.dropout(self.relu(self.linear(pooled)))
        return self.out(hidden)


model = BiLSTM()

In [6]:
# Restore the trained weights into `model`.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; the model is never moved off the CPU in this notebook.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load('rnn_model.pt', map_location='cpu'))

<All keys matched successfully>

In [7]:
# Switch the model to evaluation mode so its Dropout layer is disabled
# during inference.
model.eval()

BiLSTM(
  (embedding): Embedding(9000, 300)
  (lstm): LSTM(300, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=256, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (out): Linear(in_features=64, out_features=8, bias=True)
)

In [8]:
def predict_single(x, maxlen=600):
    """Predict the category of a single piece of text.

    The text is lower-cased, cleaned with clean_text(), clean_numbers() and
    replace_norms(), tokenized with the module-level `tokenizer`, padded to
    `maxlen` tokens and passed through the module-level `model`.

    Args:
        x (str): Raw input text.
        maxlen (int): Length the token sequence is padded/truncated to.
            Defaults to 600, the value previously hard-coded here
            (presumably the length used at training time; verify).

    Returns:
        str: 'Kategori:' followed by the predicted class name taken from
        `le.classes_`.
    """
    # Normalize the text: lowercase, blank out symbols, mask digits,
    # expand abbreviations.
    x = x.lower()
    x = clean_text(x)
    x = clean_numbers(x)
    x = replace_norms(x)
    # Echo the cleaned text so the caller can see what the model receives.
    print(x)

    # Tokenize and pad into a single-row batch of token indices.
    sequences = tokenizer.texts_to_sequences([x])
    padded = pad_sequences(sequences, maxlen=maxlen)
    batch = torch.tensor(padded, dtype=torch.long)

    # Inference only: skip autograd bookkeeping instead of detaching afterwards.
    with torch.no_grad():
        logits = model(batch)
        # dim=1 is the class axis of the (batch, n_classes) output; passing it
        # explicitly avoids the deprecated implicit-dimension behavior.
        probs = F.softmax(logits, dim=1).cpu().numpy()

    pred = probs.argmax(axis=1)
    pred = le.classes_[pred]

    return 'Kategori:' + pred[0]

In [11]:
# The original cell called predict_single(x), but `x` is not defined in any
# cell of this notebook, so it raised NameError on a fresh kernel.
# The sample below is reconstructed from the cleaned text printed in this
# cell's saved output; the original raw input is not available.
sample_text = (
    'admin kenapa akhir ini kalo saya sudah pesan go ride pake gopay pula '
    'saya kok tidak dapat point ya mohon bantuannya'
)
predict_single(sample_text)

admin    kenapa akhir ini kalo saya sudah pesan go ride pake gopay pula  saya kok tidak dapat point ya   mohon bantuannya   




'Kategori:Aplikasi'