In [36]:
import pandas as pd
import numpy as np
import random

import tensorflow as tf

from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPool1D, Flatten, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.optimizers import Adam, SGD

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [43]:
def configure_nn(input_size,
                layers=None, 
                dropout_rate=0, 
                filters=10,
                kernel_size=10,
                stride=10,
                pool_size=2,
                optimizer='Adam',
                loss='categorical_crossentropy',
                 kernel_initializer='lecun_normal',
                 kernel_regularizer=tf.keras.regularizers.L2(0.01)
                ):
    """ Layers argument shape:
    [[number of nodes, activate function], 
    [number of nodes, activate function],
    ...]
    
    """
        
    model = Sequential()
    model.add(Embedding(10000, 10, input_length=15))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size,
                    strides=10, padding='same', activation='relu'))
    
    model.add(MaxPool1D(pool_size=pool_size))
    model.add(Flatten())
    
    if dropout_rate > 0:
        model.add(Dropout(dropout_rate))
    
    if layers != None:
        for node in layers[1:]:
            model.add(Dense(node[0], activation=node[1], kernel_initializer=kernel_initializer, 
                            kernel_regularizer=kernel_regularizer))
        
    model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer, 
                    kernel_regularizer=kernel_regularizer))
    
    model.compile(loss=loss, 
                 optimizer=optimizer,
                 metrics=tf.metrics.AUC(curve='ROC'))
    
    return model

In [44]:
model = configure_nn(15)

In [45]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 15, 10)            100000    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 2, 10)             1010      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 1, 10)             0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 101,021
Trainable params: 101,021
Non-trainable params: 0
_________________________________________________________________


In [32]:
data = pd.read_csv('../tokenized_data.csv')

In [34]:
features = data.columns.tolist()
features.remove('sentiment')

In [38]:
X_train, X_test, y_train, y_test = train_test_split(data[features], data['sentiment'], test_size=0.33, random_state=42)

In [46]:
model.fit(X_train, y_train, epochs=10, verbose=1)

Epoch 1/10


InvalidArgumentError:  indices[0,0] = 112215 is not in [0, 10000)
	 [[node sequential_8/embedding_7/embedding_lookup (defined at <ipython-input-46-e5060ba904bd>:1) ]] [Op:__inference_train_function_2591]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_8/embedding_7/embedding_lookup:
 sequential_8/embedding_7/embedding_lookup/2182 (defined at /home/jakub/anaconda3/lib/python3.8/contextlib.py:113)

Function call stack:
train_function
