In [384]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif,  chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [385]:
dom1 = []
dom2 = []
test_set = []
with open('data/domain1_train.json', 'r') as file:
    for line in file:
        dom1.append(json.loads(line))
        
with open('data/domain2_train.json', 'r') as file:
    for line in file:
        dom2.append(json.loads(line))
        
        
with open('data/test_set.json', 'r') as file:
    for line in file:
        test_set.append(json.loads(line))
        
dom1 = pd.DataFrame.from_dict(dom1)
dom2 = pd.DataFrame.from_dict(dom2)
dom2 = dom2[dom2['text'].apply(len) > 0]
kaggle_set = pd.DataFrame.from_dict(test_set)
        

In [386]:
print(len(dom1))
print(len(dom2))
print(len(test_set))

19500
14899
1000


In [387]:
dom1['label'].value_counts()
print(len(dom2.loc[dom2['label']==0]))
print(len(dom2.loc[dom2['label']==1]))

12750
2149


### train model with dom2

In [397]:
d2_train,d2_test,y2_train,y2_test = train_test_split(dom2,dom2['label'],test_size=0.2)
print(d2_train.shape)
print(d2_test.shape)

(11919, 3)
(2980, 3)


In [398]:
dom2_80_percent_count = int(len(dom2) * 0.8)

dom2_80_percent = dom2.iloc[:dom2_80_percent_count]

merged_dataset = pd.concat([dom1, dom2_80_percent], ignore_index=True)

In [399]:
d1_train, d1_test,y1_train,y1_test = train_test_split(merged_dataset, merged_dataset['label'],test_size=0.2)
print(d1_train.shape)
print(d1_test.shape)

(25135, 3)
(6284, 3)


In [400]:
from gensim.models import Word2Vec
import numpy as np


X = d2_train['text'].tolist()
X1 =d2_test['text'].tolist()
max_length = max([len(seq) for seq in X])
max_length1 = max([len(seq) for seq in X1])
word2vec = Word2Vec(X, vector_size=300, window=10, min_count=1, workers=4)
word2vec1 = Word2Vec(X1, vector_size=300, window=10, min_count=1, workers=4)

def get_text_vector(text, word2vec_model):
    vector_list = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv.index_to_key]
    if len(vector_list) == 0:
        return np.zeros(word2vec.vector_size)
    return np.mean(vector_list, axis=0)

X_train = np.array([get_text_vector(text, word2vec) for text in X])
X_test = np.array([get_text_vector(text, word2vec1) for text in X1])


In [401]:

X2 = d1_train['text'].tolist()
X3 =d1_test['text'].tolist()
max_length = max([len(seq) for seq in X2])
max_length1 = max([len(seq) for seq in X3])
word2vec = Word2Vec(X2, vector_size=300, window=10, min_count=1, workers=4)
word2vec1 = Word2Vec(X3, vector_size=300, window=10, min_count=1, workers=4)

def get_text_vector(text, word2vec_model):
    vector_list = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv.index_to_key]
    if len(vector_list) == 0:
        return np.zeros(word2vec.vector_size)
    return np.mean(vector_list, axis=0)

X_train1 = np.array([get_text_vector(text, word2vec) for text in X2])
X_test1 = np.array([get_text_vector(text, word2vec1) for text in X3])



In [402]:
X_train.shape


(11919, 300)

In [415]:
X_train = X_train.reshape((11919, 300, 1))
X_test = X_test.reshape((2980, 300, 1))
X_train1 = X_train1.reshape((25135, 300, 1))
X_test1 = X_test1.reshape((6284, 300, 1))



In [416]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import backend as K
import tensorflow as tf


model = Sequential([
    Conv1D(32, 3, padding='same', input_shape=(300, 1)),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    MaxPooling1D(2),

    Conv1D(64, 3, padding='same'),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    MaxPooling1D(2),

    Conv1D(128, 3, padding='same'),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    MaxPooling1D(2),

    Flatten(),

    Dense(128, activation='relu'),
    Dropout(0.5),
    
    Dense(64, activation='relu'),
    Dropout(0.5),
    
    Dense(2, activation='softmax')
])

def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

model.compile(optimizer=optimizer, 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy', f1_metric])

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y2_train), y=y2_train)


class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}

model.fit(X_train, y2_train, epochs=100, batch_size=64, class_weight=class_weights_dict)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2af3418a490>

In [None]:
test_loss, test_acc, test_f1 = model.evaluate(X_test, y2_test, verbose=2)

In [418]:
from tensorflow.keras.layers import Dropout, BatchNormalization

for layer in model.layers[:-1]:
    layer.trainable = False


x = Dense(128, activation='relu')(model.layers[-2].output)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

output_layer = Dense(2, activation='softmax')(x)

new_model = Model(inputs=model.input, outputs=output_layer)


new_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001), 
                  loss='sparse_categorical_crossentropy', 
                  metrics=['accuracy', f1_metric])

class_weights1 = compute_class_weight(class_weight='balanced', classes=np.unique(y1_train), y=y1_train)
class_weights_dict1 = {i : class_weights1[i] for i in range(len(class_weights1))}


new_model.fit(X_train1, y1_train, epochs=20, batch_size=64, class_weight=class_weights_dict1)






Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2af38a98f70>

In [419]:
test_loss, test_acc, test_f1 = new_model.evaluate(X_test1, y1_test, verbose=2)
print("\nTest accuracy:", test_acc)
print("Test F1 Score:", test_f1)

197/197 - 2s - loss: 0.6961 - accuracy: 0.5821 - f1_metric: 0.5436 - 2s/epoch - 11ms/step

Test accuracy: 0.5821133255958557
Test F1 Score: 0.543595552444458


In [420]:
X4 = kaggle_set['text'].tolist()

max_length = max([len(seq) for seq in X4])

word2vec = Word2Vec(X4, vector_size=300, window=10, min_count=1, workers=4)

kaggle_set_new = np.array([get_text_vector(text, word2vec) for text in X4])


In [None]:
kaggle_set_new.reshape((1000, 300, 1))

In [429]:
predictions = new_model.predict(kaggle_set_new)
predictions = np.argmax(predictions, axis=1)



In [None]:
import pandas as pd

df = pd.DataFrame({'id': range(len(predictions)), 'class': predictions})

df.to_csv('predictions2.csv', index=False)
