In [27]:
import keras
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
# Load the scraped tweets, rebuild the tokenizer from the training corpus and
# turn every tweet into a fixed-length integer sequence for the LSTM.
train_data_path = 'data/train_4000.csv'
test_data_path = 'data/twitter_scraped_cleaned.csv'

first_n_words = 200  # not referenced in this cell

# Read raw data
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)

train_sentences = df_train['sequence'].values
# Cast to str (as the NYT cell does) so a non-string value such as NaN for an
# empty tweet cannot fail inside the tokenizer. df_test itself is left untouched.
test_sentences = df_test['text'].astype(str).values

# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
# NOTE(review): the OOV token is an empty string; possibly a placeholder such
# as '<OOV>' was lost at some point -- confirm against the training notebook.
oov_tok = ''
embedding_dim = 100  # not referenced in this cell
max_length = 100 # choose based on statistics, for example 100 to 200
padding_type='post'
# NOTE(review): trunc_type is declared but deliberately NOT passed below, so
# truncation keeps the Keras default ('pre'), exactly as before. Confirm what
# the training run used before wiring it in.
trunc_type='post'

# Fit the tokenizer on the training sentences only; the vocabulary must come
# from the corpus the model was trained on, not from the tweets being scored.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Convert the tweets to integer sequences and zero-pad them at the end
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

print(test_padded)

[[   1  320   28 ...    0    0    0]
 [ 474  680   16 ...    0    0    0]
 [ 777 2411   67 ...    0    0    0]
 ...
 [  15   58  139 ...    0    0    0]
 [  63   67   19 ...    0    0    0]
 [ 139  294   31 ...    0    0    0]]


In [29]:
# Load the trained LSTM checkpoint from disk.
model = keras.models.load_model("checkpoints/lstm-base-uncased_4000_0_best.h5")

prediction = model.predict(test_padded)

# NOTE(review): assumes the model emits exactly one score per input row
# (e.g. a single sigmoid unit) -- the check below fails loudly otherwise.
scores = prediction.reshape(-1)
assert scores.shape[0] == len(df_test), "expected one prediction per row of df_test"

# Label is 1 when the score is >= 0.5, else 0 (vectorised instead of a Python
# loop over one-element arrays).
pred_labels = (scores >= 0.5).astype(int).tolist()

df_test['predicted'] = pred_labels
# 'confidence' stores the raw model output for the row, not the confidence in
# the predicted label; stored as a 1-D array rather than an (N, 1) matrix.
df_test['confidence'] = scores



In [30]:
# Show the scored tweets; the 'predicted' and 'confidence' columns were added by the previous cell.
df_test

Unnamed: 0.1,Unnamed: 0,twitter_handle,text,date,predicted,confidence
0,0,YouTube,coachella gonna be insane this year april 15 &...,2023-03-14,1,0.778535
1,1,YouTube,stay playing with fire kill this love as if it...,2023-03-14,0,0.069279
2,2,YouTube,omg imagine if this was the setlist,2023-03-14,0,0.189697
3,3,YouTube,new song,2023-03-14,1,0.916027
4,4,YouTube,omg if they drop a new song during their coach...,2023-03-14,1,0.610059
...,...,...,...,...,...,...
55168,55168,Apple,yo ya hasta lo borre,2023-03-13,1,0.774413
55169,55169,Apple,papito dios dame uno,2023-03-12,1,0.553203
55170,55170,Apple,was good why my phone keep hanging up for ever...,2023-03-12,0,0.451958
55171,55171,Apple,what if you wanna pay monthly,2023-03-12,0,0.091919


In [31]:
 # Persist the scored tweets. index=False is not passed, so pandas also writes
 # the row index as an extra unnamed leading column.
 df_test.to_csv('results/twitter_lstm_results.csv')

In [32]:
# Load the NYT/API articles, rebuild the tokenizer from the training corpus
# and turn every article text into a fixed-length integer sequence.
train_data_path = 'data/train_4000.csv'
test_data_path = 'data/Api_data.csv'
first_n_words = 200

# Hyperparameters of the model
vocab_size = 3000  # choose based on statistics
# NOTE(review): empty-string OOV token -- confirm this matches training.
oov_tok = ''
embedding_dim = 100
max_length = 100  # choose based on statistics, for example 150 to 200
padding_type = 'post'
trunc_type = 'post'  # declared only; truncation below uses the Keras default

# Read raw data; the file's own header row is discarded in favour of the
# column names given here.
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(
    test_data_path,
    header=0,
    names=['twitter_handle', 'source', 'date', 'text'],
)
# Force every text cell to str before it reaches the tokenizer.
df_test['text'] = df_test['text'].astype(str)

train_sentences = df_train['sequence'].values
test_sentences = df_test['text'].values

# The vocabulary is fitted on the training sentences only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Integer-encode the articles and zero-pad them at the end.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type)

print(test_padded)

[[   2  223   27 ...    0    0    0]
 [   2  112  278 ...    0    0    0]
 [  63    2  374 ...    0    0    0]
 ...
 [  29  129 1069 ...    0    0    0]
 [   4    1 1561 ...    0    0    0]
 [   8  120    2 ...    0    0    0]]


In [33]:
# Load the trained LSTM checkpoint from disk (same file as used for the tweets).
model = keras.models.load_model("checkpoints/lstm-base-uncased_4000_0_best.h5")

prediction = model.predict(test_padded)

# NOTE(review): assumes one output score per article (e.g. a single sigmoid
# unit) -- the check below fails loudly if that does not hold.
scores = prediction.reshape(-1)
assert scores.shape[0] == len(df_test), "expected one prediction per row of df_test"

# Threshold at 0.5 in one vectorised step: 1 if score >= 0.5 else 0.
pred_labels = (scores >= 0.5).astype(int).tolist()

df_test['predicted'] = pred_labels
# Raw model output per row, stored as a flat 1-D array instead of (N, 1).
df_test['confidence'] = scores



In [34]:
# Show the scored articles; dates are still in their original MM/DD/YYYY form here.
df_test

Unnamed: 0,twitter_handle,source,date,text,predicted,confidence
0,adani group,NYT,05/13/2022,the country has been hit hard by wildfires and...,0,0.294394
1,adani group,NYT,05/16/2022,the former federal reserve chair warns that th...,0,0.263658
2,adani group,NYT,06/14/2022,what the second bear market since the start of...,0,0.339724
3,adani group,NYT,09/22/2022,"alt news, an independent website, has emerged ...",0,0.135050
4,adani group,NYT,10/28/2022,the business decisions of gautam adani could g...,0,0.222312
...,...,...,...,...,...,...
9282,fia,NYT,10/31/2021,trading in your gas-busting vehicle for one wi...,1,0.960280
9283,fia,NYT,12/15/2021,max verstappen knew his chances of beating lew...,0,0.112936
9284,fia,NYT,01/13/2022,"from our critics, reviews of closed gallery sh...",0,0.247381
9285,fia,NYT,08/16/2022,a dominican restaurant in the radio hotel in w...,0,0.365250


In [35]:
# Rewrite the article dates from MM/DD/YYYY to YYYY-MM-DD strings.
# Parsing uses an explicit format, so re-running this cell once the column
# already holds YYYY-MM-DD strings raises instead of silently mis-parsing.
parsed_dates = pd.to_datetime(df_test['date'], format='%m/%d/%Y')
df_test['date'] = parsed_dates.dt.strftime('%Y-%m-%d')

In [36]:
# Re-display the frame to check that 'date' now reads YYYY-MM-DD.
df_test

Unnamed: 0,twitter_handle,source,date,text,predicted,confidence
0,adani group,NYT,2022-05-13,the country has been hit hard by wildfires and...,0,0.294394
1,adani group,NYT,2022-05-16,the former federal reserve chair warns that th...,0,0.263658
2,adani group,NYT,2022-06-14,what the second bear market since the start of...,0,0.339724
3,adani group,NYT,2022-09-22,"alt news, an independent website, has emerged ...",0,0.135050
4,adani group,NYT,2022-10-28,the business decisions of gautam adani could g...,0,0.222312
...,...,...,...,...,...,...
9282,fia,NYT,2021-10-31,trading in your gas-busting vehicle for one wi...,1,0.960280
9283,fia,NYT,2021-12-15,max verstappen knew his chances of beating lew...,0,0.112936
9284,fia,NYT,2022-01-13,"from our critics, reviews of closed gallery sh...",0,0.247381
9285,fia,NYT,2022-08-16,a dominican restaurant in the radio hotel in w...,0,0.365250


In [37]:
# List the distinct source names to see which ones need normalising.
df_test['twitter_handle'].unique()

array(['adani group', 'ftx', 'microsoft', 'google', 'air canada',
       'amazon', 'apple ', 'samsung', 'meta', 'intel', 'bose', 'fia'],
      dtype=object)

In [38]:
# Rename selected lower-case source names to a capitalised handle form.
# NOTE(review): presumably this is meant to match the handles used in the
# Twitter results (which contain e.g. 'Apple') -- confirm with the consumer of
# both CSVs. Names without an entry here are only stripped of surrounding
# whitespace.
handle_names = {
    'air canada': 'AirCanada',
    'microsoft': 'Microsoft',
    'meta': 'Meta',
    'google': 'Google',
    # The raw data contains 'apple ' with a trailing space; after stripping it
    # is mapped like the other handles.
    'apple': 'Apple',
}

# Strip first so values with stray whitespace match the mapping keys, then
# apply all renames in a single exact-value replace.
df_test['twitter_handle'] = df_test['twitter_handle'].str.strip().replace(handle_names)

In [39]:
# Persist the scored NYT articles. index=False is not passed, so pandas also
# writes the row index as an extra unnamed leading column.
df_test.to_csv('results/nyt_lstm_results.csv')