In [1]:
import pandas as pd
import sklearn
import numpy as np

data = pd.read_excel('../Adatok/tisztitott_adat_nonconcat.xlsx', header=0,index_col=0)
data

Unnamed: 0,date,text,president,label
0,2012-11-05,president obama tells the story of fired up re...,Obama,1
1,2012-11-06,election day is here confirm your polling plac...,Obama,1
2,2012-11-06,it’s election day this is your last chance to ...,Obama,1
3,2012-11-06,at the final rally of his final campaign last ...,Obama,1
4,2012-11-06,25 reasons that 25 people are voting for presi...,Obama,1
...,...,...,...,...
12104,2020-06-16,true …,Trump,-1
12105,2020-06-16,a great woman her son is looking down from hea...,Trump,-1
12106,2020-06-16,96% approval rating in the republican party th...,Trump,-1
12107,2020-06-17,joe biden was a total failure in government he...,Trump,-1


In [2]:
from sklearn.model_selection import train_test_split

label = data['label']
attrs = data['text']

X_train, X_test, y_train, y_test = train_test_split(
        attrs, label, test_size=0.25, random_state=1000)


In [7]:
X_train

10087                                      thank you     …
5851     just arrived at the pensacola bay center join ...
341      stop gun violence so no other mother & father ...
3831     get the facts on one of today's most consequen...
4639     get the facts not the fluffjoin the ofa truth ...
                               ...                        
2398     in tough times we don't abandon american famil...
3776     jon traveled 3000 miles to change the conversa...
6215     special council is told to find crimes whether...
4695     senate leaders are putting politics above thei...
9651                              thank you working hard  
Name: text, Length: 9081, dtype: object

In [3]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

tokenizer.fit_on_texts(data['text'])

max_length = max([len(s.split()) for s in data['text']])

vocab_size = len(tokenizer.word_index) + 1

X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_token, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_token, maxlen=max_length, padding='post')


# .............................


In [5]:
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
clear_session()

## Basic NN

In [7]:
input_dim = X_train_pad.shape[1]

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                2880      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 2,891
Trainable params: 2,891
Non-trainable params: 0
_________________________________________________________________


## Embedding Layer + Flatten added


In [8]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_length,
                           trainable=True))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 287, 50)           686950    
_________________________________________________________________
flatten (Flatten)            (None, 14350)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                143510    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 830,471
Trainable params: 830,471
Non-trainable params: 0
_________________________________________________________________


## Embedding + Pooling added


In [10]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_length,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 287, 50)           686950    
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 687,471
Trainable params: 687,471
Non-trainable params: 0
_________________________________________________________________


## CNN

In [11]:
embedding_dim = 50

model = Sequential()
model = Sequential()
model.add(layers.Embedding(vocab_size,embedding_dim, input_length=max_length, trainable=True))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 287, 50)           686950    
_________________________________________________________________
conv1d (Conv1D)              (None, 283, 128)          32128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 720,379
Trainable params: 720,379
Non-trainable params: 0
_________________________________________________________________


In [19]:
# history = model.fit(X_train_pad, y_train,
#                     epochs=25,
#                     verbose=False,
#                     validation_data=(X_test_pad, y_test),
#                     batch_size=10)
# loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
# print("Training Accuracy: {:.4f}".format(accuracy))
# loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
# print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.1386
Testing Accuracy:  0.0446
