In [38]:
import pandas as pd
import sklearn
import numpy as np

data = pd.read_excel('../Adatok/tisztitott_adat.xlsx', header=0,index_col=0)
data

Unnamed: 0,date,text,president,label
0,2012-11-05,president obama tells the story of fired up re...,Obama,0
1,2012-11-06,election day is here confirm your polling plac...,Obama,1
2,2012-11-07,thank you president obama in his victory speec...,Obama,2
3,2012-11-08,the definition of hope is you still believe ev...,Obama,2
4,2012-11-09,what bobby kennedy called the ripples of hope ...,Obama,0
...,...,...,...,...
1823,2020-06-11,our great national guard troops who took care ...,Trump,2
1824,2020-06-12,people have no idea how fake the lamestream me...,Trump,2
1825,2020-06-15,i’ve done more in less than 4 years than biden...,Trump,1
1826,2020-06-16,wow may retail sales show biggest one-month in...,Trump,0


In [39]:
from sklearn.model_selection import train_test_split

label = data['label']
attrs = data['text']

X_train, X_test, y_train, y_test = train_test_split(
        attrs, label, test_size=0.25, random_state=1000)


In [40]:
X_train

183     the first week of #actionaugust is coming to a...
564     it's time to end the era of manufactured crise...
453     watch a special pep talk for on women's equali...
1243    as we come together to celebrate the extraordi...
251     continue to stand up against discrimination ad...
                              ...                        
1275    our border laws are very weak while those of m...
1728    the democrat controlled house never even asked...
71      for the sake of future gun victimslawmakers sh...
599     live the white house is hosting the eighth ann...
1459    just returned from visiting our troops in iraq...
Name: text, Length: 1371, dtype: object

In [41]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

tokenizer.fit_on_texts(data['text'])

max_length = max([len(s.split()) for s in data['text']])

vocab_size = len(tokenizer.word_index) + 1

X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_token, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_token, maxlen=max_length, padding='post')


# .............................


In [93]:
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
clear_session()

## Basic NN

In [94]:
input_dim = X_train_pad.shape[1]

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=['sparse_categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                11520     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 33        
Total params: 11,553
Trainable params: 11,553
Non-trainable params: 0
_________________________________________________________________


In [95]:
history = model.fit(X_train_pad, y_train,
                    epochs=25,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.5799
Testing Accuracy:  0.5886


In [136]:
from keras.backend import clear_session
clear_session()

## Embedding Layer + Flatten added


In [137]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_length,
                           trainable=True))
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dropout(0.6))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1151, 50)          738700    
_________________________________________________________________
flatten (Flatten)            (None, 57550)             0         
_________________________________________________________________
dropout (Dropout)            (None, 57550)             0         
_________________________________________________________________
dense (Dense)                (None, 10)                575510    
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 33        
Total params: 1,314,243
Trainable params: 1,314,243
Non-trainable params: 0
______________________________________________

In [138]:
history = model.fit(X_train_pad, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.7433
Testing Accuracy:  0.5886


In [104]:
from keras.backend import clear_session
clear_session()

## Embedding + Pooling added


In [110]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_length,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1151, 50)          738700    
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                510       
_________________________________________________________________
dropout (Dropout)            (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 33        
Total params: 739,243
Trainable params: 739,243
Non-trainable params: 0
_________________________________________________________________


In [111]:
history = model.fit(X_train_pad, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.7411
Testing Accuracy:  0.5799


In [128]:
from keras.backend import clear_session
clear_session()

## CNN

In [127]:
embedding_dim = 50

model = Sequential()
model = Sequential()
model.add(layers.Embedding(vocab_size,embedding_dim, input_length=max_length, trainable=True))
model.add(layers.Dropout(0.2))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dropout(0.3))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
model.summary()


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1151, 50)          738700    
_________________________________________________________________
dropout_3 (Dropout)          (None, 1151, 50)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1147, 128)         32128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1290      
_________________________________________________________________
dropout_5 (Dropout)          (None, 10)               

In [129]:
history = model.fit(X_train_pad, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test_pad, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.7739
Testing Accuracy:  0.5624


In [89]:
from keras.backend import clear_session
clear_session()