In [9]:
import json
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import pad_sequences
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from keras.layers import Masking
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
import numpy as np
def _load_domain(path):
    """Read one JSON-lines training file.

    Every non-empty line is parsed as a JSON object that must provide a
    ``text`` entry (an iterable of token ids) and a ``label`` entry.
    Token ids equal to 0 are removed from ``text`` (presumably they are
    padding -- TODO confirm against the dataset description).

    Returns:
        (features, labels): two lists of equal length; ``features[i]`` is
        the zero-stripped token list and ``labels[i]`` the label of sample i.
    """
    features, labels = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of
            # the file); json.loads would raise on them.
            if not line.strip():
                continue
            # Parse the JSON line into a Python dictionary
            obj = json.loads(line)
            features.append([x for x in obj['text'] if x != 0])
            labels.append(obj['label'])
    return features, labels


features_d1, labels_d1 = _load_domain('domain1_train.json')
features_d2, labels_d2 = _load_domain('domain2_train.json')

# Domain indicator aligned with features_d1 followed by features_d2:
# 0 for every domain-1 sample, 1 for every domain-2 sample.
labels_domain = [0] * len(features_d1) + [1] * len(features_d2)


In [11]:
# Split the data
# Concatenate both domains (domain 1 first, then domain 2).
features = features_d1 + features_d2
# Left-pad with 0 so every sequence is as long as the longest one.
padded_features = pad_sequences(features, padding='pre', value=0)
# One-hot encode into a NEW name: overwriting labels_domain with its own
# encoding would make this cell non-idempotent (a second run would one-hot
# encode the already encoded array and produce a wrongly shaped target).
labels_domain_onehot = to_categorical(labels_domain, num_classes=2)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_domain_train, y_domain_test = train_test_split(
    padded_features, labels_domain_onehot, test_size=0.2, random_state=42
)


In [12]:
# Display the one-hot encoded domain targets of the training split.
y_domain_train

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [24]:
from keras.models import Model
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, GlobalMaxPooling1D, Dense, Dropout, Masking, Input, Bidirectional
from keras.regularizers import l2
import tensorflow as tf
import sys

# Constants
vocab_size = 5000  # Vocabulary size (Embedding input_dim; assumes every token id is < 5000 -- TODO confirm)
embedding_dim = 100 # Embedding dimension
n_domain_classes = 2 # Classification for the domain task

# Input Layer: variable-length sequences of integer token ids
input_layer = Input(shape=(None,))

# Embedding Layer
# mask_zero is deliberately off: Conv1D does not support masking, so a
# padding mask created here would be discarded before it could reach the
# LSTM. Requesting it would only suggest padding is ignored when it is not.
embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=False)(input_layer)

# Conv1D Layer
conv1d_layer = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer)

# Dropout Layer
dropout_conv = Dropout(0.3)(conv1d_layer)

# Max Pooling Layer
max_pooling = MaxPooling1D(pool_size=2)(dropout_conv)

# LSTM Layer
lstm_layer = LSTM(32)(max_pooling)

# Dropout Layer
dropout_lstm = Dropout(0.3)(lstm_layer)

# New Dense Layer
dense_layer = Dense(16, activation='relu')(dropout_lstm)
dense_dropout = Dropout(0.5)(dense_layer)

# Domain output
# The targets are one-hot vectors over mutually exclusive classes, so the
# head is a softmax (probabilities sum to 1) rather than independent sigmoids.
domain_output = Dense(n_domain_classes, activation='softmax', name='domain')(dense_dropout)

# Combined model
model_domain = Model(inputs=input_layer, outputs=domain_output)

# categorical_crossentropy pairs with the softmax / one-hot setup and makes
# 'accuracy' resolve to categorical accuracy (argmax match) instead of an
# element-wise binary accuracy.
model_domain.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Model summary
model_domain.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_7 (Embedding)     (None, None, 100)         500000    
                                                                 
 conv1d (Conv1D)             (None, None, 32)          9632      
                                                                 
 dropout_14 (Dropout)        (None, None, 32)          0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 32)         0         
 )                                                               
                                                                 
 lstm_7 (LSTM)               (None, 32)                8320      
                                                           

In [25]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop when val_loss has not improved for 3 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train the domain classifier for at most 50 epochs; the held-out split is
# used as validation data for the early-stopping criterion.
model_domain.fit(
    X_train,
    y_domain_train,
    validation_data=(X_test, y_domain_test),
    epochs=50,
    callbacks=[early_stopping],
)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


<keras.callbacks.History at 0x1daa07ff6a0>

In [26]:
# Constants
vocab_size = 5000  # Vocabulary size (Embedding input_dim; assumes every token id is < 5000 -- TODO confirm)
embedding_dim = 100 # Embedding dimension
n_domain_classes = 2 # Number of output classes

# Input Layer: variable-length sequences of integer token ids
input_layer = Input(shape=(None,))

# Embedding Layer
# mask_zero=True marks id 0 as padding; the recurrent layer below consumes
# that mask and skips the padded timesteps.
embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)(input_layer)

# LSTM Layer (forward and backward passes concatenated -> 64 features)
lstm_layer = Bidirectional(LSTM(32))(embedding_layer)

# Dropout Layer
dropout_lstm = Dropout(0.3)(lstm_layer)

# New Dense Layer
dense_layer = Dense(16, activation='relu')(dropout_lstm)
dense_dropout = Dropout(0.3)(dense_layer)

# Output layer
# The targets are one-hot vectors over mutually exclusive classes, so the
# head is a softmax (probabilities sum to 1) rather than independent sigmoids.
domain_output = Dense(n_domain_classes, activation='softmax', name='domain')(dense_dropout)

# Combined model
model_domain1 = Model(inputs=input_layer, outputs=domain_output)

# categorical_crossentropy pairs with the softmax / one-hot setup and makes
# 'accuracy' resolve to categorical accuracy (argmax match) instead of an
# element-wise binary accuracy.
model_domain1.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Model summary
model_domain1.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_8 (Embedding)     (None, None, 100)         500000    
                                                                 
 bidirectional (Bidirectiona  (None, 64)               34048     
 l)                                                              
                                                                 
 dropout_17 (Dropout)        (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 16)                1040      
                                                                 
 dropout_18 (Dropout)        (None, 16)                0         
                                                           

In [None]:
# Fresh callback instance so no early-stopping state from a previous fit
# is carried over.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Domain-1 samples only, with their own labels as targets. Distinct names
# are used so the combined-domain arrays (padded_features / labels_domain)
# are not clobbered by this cell.
padded_features_d1 = pad_sequences(features_d1, padding='pre', value=0)
# assumes labels_d1 holds integer class ids in {0, 1} -- TODO confirm
labels_d1_onehot = to_categorical(labels_d1, num_classes=2)

X_train_domain1, X_test_domain1, y_train_domain1, y_test_domain1 = train_test_split(
    padded_features_d1, labels_d1_onehot, test_size=0.2, random_state=42
)

# Fit on the domain-1 split created above. The previous call passed
# X_train / y_domain_train / X_test / y_domain_test, i.e. the combined data
# with domain-indicator targets, which left the domain-1 split unused.
model_domain1.fit(
    X_train_domain1,
    y_train_domain1,
    validation_data=(X_test_domain1, y_test_domain1),
    epochs=50,
    callbacks=[early_stopping],
)