In [28]:
import pandas as pd
import sklearn
import numpy as np

data = pd.read_excel('../Adatok/tisztitott_adat.xlsx', header=0,index_col=0)
data

Unnamed: 0,date,text,president,label
0,2012-11-05,president obama tells the story of fired up re...,Obama,0
1,2012-11-06,election day is here confirm your polling plac...,Obama,0
2,2012-11-07,thank you president obama in his victory speec...,Obama,-1
3,2012-11-08,the definition of hope is you still believe ev...,Obama,-1
4,2012-11-09,what bobby kennedy called the ripples of hope ...,Obama,0
...,...,...,...,...
1823,2020-06-11,our great national guard troops who took care ...,Trump,-1
1824,2020-06-12,people have no idea how fake the lamestream me...,Trump,-1
1825,2020-06-15,i’ve done more in less than 4 years than biden...,Trump,1
1826,2020-06-16,wow may retail sales show biggest one-month in...,Trump,0


In [30]:
from sklearn.model_selection import train_test_split

label = data['label']
attrs = data['text']

X_train, X_test, y_train, y_test = train_test_split(
        attrs, label, test_size=0.25, random_state=1000)


In [32]:
X_train

183     the first week of #actionaugust is coming to a...
564     it's time to end the era of manufactured crise...
453     watch a special pep talk for on women's equali...
1243    as we come together to celebrate the extraordi...
251     continue to stand up against discrimination ad...
                              ...                        
1275    our border laws are very weak while those of m...
1728    the democrat controlled house never even asked...
71      for the sake of future gun victimslawmakers sh...
599     live the white house is hosting the eighth ann...
1459    just returned from visiting our troops in iraq...
Name: text, Length: 1371, dtype: object

In [None]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

tokenizer.fit_on_texts(data['text'])

max_length = max([len(s.split()) for s in data['text']])

vocab_size = len(tokenizer.word_index) + 1

X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_token, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_token, maxlen=max_length, padding='post')


# .............................


In [29]:
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
clear_session()

## Basic NN

In [31]:
input_dim = X_train_pad.shape[1]

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                11520     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 11,531
Trainable params: 11,531
Non-trainable params: 0
_________________________________________________________________


In [33]:
history = model.fit(X_train_pad, y_train,
                    epochs=25,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

InvalidArgumentError:  Received a label value of -1 which is outside the valid range of [0, 1).  Label values: 0 1 0 0 0 1 -1 0 0 0
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-33-4d45ed4457fa>:1) ]] [Op:__inference_train_function_44065]

Function call stack:
train_function


In [35]:
from keras.backend import clear_session
clear_session()

## Embedding Layer + Flatten added


In [36]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_length,
                           trainable=True))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1151, 50)          738700    
_________________________________________________________________
flatten (Flatten)            (None, 57550)             0         
_________________________________________________________________
dense (Dense)                (None, 10)                575510    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 1,314,221
Trainable params: 1,314,221
Non-trainable params: 0
_________________________________________________________________


In [37]:
history = model.fit(X_train_pad, y_train,
                    epochs=25,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

InvalidArgumentError:  Received a label value of -1 which is outside the valid range of [0, 1).  Label values: 0 0 -1 0 0 -1 0 -1 0 0
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-37-4d45ed4457fa>:1) ]] [Op:__inference_train_function_44794]

Function call stack:
train_function


In [None]:
from keras.backend import clear_session
clear_session()

## Embedding + Pooling added


In [12]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_length,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1151, 50)          738700    
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                510       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 739,221
Trainable params: 739,221
Non-trainable params: 0
_________________________________________________________________


In [13]:
history = model.fit(X_train_pad, y_train,
                    epochs=25,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.8293
Testing Accuracy:  0.7243


In [None]:
from keras.backend import clear_session
clear_session()

## CNN

In [15]:
embedding_dim = 50

model = Sequential()
model = Sequential()
model.add(layers.Embedding(vocab_size,embedding_dim, input_length=max_length, trainable=True))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1151, 50)          738700    
_________________________________________________________________
conv1d (Conv1D)              (None, 1147, 128)         32128     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 772,129
Trainable params: 772,129
Non-trainable params: 0
_________________________________________________________________


In [16]:
history = model.fit(X_train_pad, y_train,
                    epochs=25,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.7206
Testing Accuracy:  0.7243


In [None]:
from keras.backend import clear_session
clear_session()