In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from keras.optimizers import Adam

In [27]:

reddit_df = pd.read_csv('input/reddit_opinion_climate_change.csv',index_col=['comment_id'], parse_dates=['created_time', 'post_created_time'])
reddit_df.head()

Unnamed: 0_level_0,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,user_is_verified,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
l1ytazo,1,"This is the major one, people keep downplaying...",climatechange,2024-04-30 17:15:22,1cgmk4u,rednib,0,1,0,True,...,157.0,28686.0,29023.0,88,There’s a lot of different impacts of climate ...,What about climate change worries you the most...,0.82,88,0,2024-04-30 07:02:19
l1yt78f,1,Totally agree. They profited off of our pollut...,conspiracy,2024-04-30 17:14:46,1cgqzo1,quiksilver10152,0,1,0,True,...,1002.0,5513.0,6586.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09
l1yt2mc,1,I honestly believe the truth always lies somew...,conspiracy,2024-04-30 17:14:01,1cgqzo1,eco78,0,1,0,True,...,772.0,33795.0,34775.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09
l1yt0bm,1,"lol, Zuck &amp; the other billionaires are jus...",climatechange,2024-04-30 17:13:39,1cgmk4u,rednib,0,1,0,True,...,157.0,28686.0,29023.0,88,There’s a lot of different impacts of climate ...,What about climate change worries you the most...,0.82,88,0,2024-04-30 07:02:19
l1yszhw,1,&gt; Global average surface air temps exceeded...,conspiracy,2024-04-30 17:13:31,1cgqzo1,Steve-lrwin,0,1,0,True,...,13609.0,30294.0,43903.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09


In [28]:
df_subreddits = reddit_df['subreddit'].unique()
print(df_subreddits)

['climatechange' 'conspiracy' 'climate' 'climateskeptics' 'science'
 'ClimateShitposting' 'worldnews' 'Futurology' 'europe' 'energy'
 'environment' 'changemyview' 'ClimateActionPlan' 'news'
 'EverythingScience' 'politics' 'canada' 'GlobalClimateChange'
 'unitedkingdom' 'ClimateOffensive' 'ClimateMemes' 'CitizensClimateLobby'
 'ClimateCO' 'Climate_Nuremberg' 'climate_discussion' 'climate_science']


In [29]:
# filter reddit data to include a few interesting subreddits and limit amount of reddit data
filtered_reddit_df = reddit_df.loc[reddit_df['subreddit'].isin(['climatechange', 'climate', 'climateskeptics'])]

In [30]:
# get the text data for the comment and clean up anything missing
reddit_text_data = filtered_reddit_df.self_text
# remove potential nulls
reddit_text_data = reddit_text_data.fillna('')
# Convert all entries to strings (in case there are any non-string entries)
reddit_text_data = reddit_text_data.astype(str)


In [31]:
twitter_df = pd.read_csv('input/twitter_sentiment_data.csv')
twitter_df.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [32]:
# replacing -1 with 3 for LSTM are not working with negative values
twitter_df['sentiment'] = twitter_df['sentiment'].replace(-1, 3)


In [33]:
twitter_text_data = twitter_df['message']
twitter_text_data.head()

0    @tiniebeany climate change is an interesting h...
1    RT @NatGeoChannel: Watch #BeforeTheFlood right...
2    Fabulous! Leonardo #DiCaprio's film on #climat...
3    RT @Mick_Fanning: Just watched this amazing do...
4    RT @cnalive: Pranita Biswasi, a Lutheran from ...
Name: message, dtype: object

In [36]:
reddit_text_data.head()

comment_id
l1ytazo    This is the major one, people keep downplaying...
l1yt0bm    lol, Zuck &amp; the other billionaires are jus...
l1ysptw                   Laughable at this point really ...
l1ysmdg    Electricity is a utility; it's not a standard ...
l1ysiva    # Tesla Model S emits more lifetime CO2 in US ...
Name: self_text, dtype: object

In [37]:
# Combine the Reddit and Twitter data
combined_text_data = pd.concat([reddit_text_data, twitter_text_data], ignore_index=True)

In [38]:
#Tokenization -- fit the tokenizer on the combined twitter data and reddit data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_text_data)

In [39]:
#Tokenization -- converting the tweets into numerical tokens that can be processed by the model
sequences = tokenizer.texts_to_sequences(twitter_text_data)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

Found 160174 unique tokens.


In [40]:
# Padding sequences
max_sequence_length = 100  # Max sequence length (you may adjust this based on your data)
data = pad_sequences(sequences, maxlen=max_sequence_length)


In [41]:
labels = twitter_df['sentiment']

In [42]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [43]:
# Define LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(len(word_index) + 1, 128, input_length=max_sequence_length))
lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(4, activation='softmax'))


In [44]:
# Compile LSTM model
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [45]:
# Train LSTM model
lstm_model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x142a9a210>

In [None]:
# Evaluate the model on the test data
loss, accuracy = lstm_model.evaluate(X_test, y_test)

# Print the test loss and accuracy
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 1.3376867771148682
Test Accuracy: 0.7081578969955444


In [None]:
# Convert reddit_text_data to sequences
reddit_sequences = tokenizer.texts_to_sequences(reddit_text_data)

reddit_data_prediction = pad_sequences(reddit_sequences, maxlen=max_sequence_length)

# Check if any indices are out of range
embedding_layer = lstm_model.layers[0]
vocab_size = embedding_layer.get_weights()[0].shape[0]
if reddit_data_prediction.max() >= vocab_size:
    print(f"Error: Found word index {reddit_data_prediction.max()} but the model's vocabulary size is {vocab_size}")

# Predict
predictions = lstm_model.predict(reddit_data_prediction)
# type(predictions)


Error: Found word index 160174 but the model's vocabulary size is 81449


2024-05-14 19:21:01.579803: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: indices[31,99] = 114950 is not in [0, 81449)
	 [[{{node sequential/embedding/embedding_lookup}}]]


InvalidArgumentError: Graph execution error:

Detected at node 'sequential/embedding/embedding_lookup' defined at (most recent call last):
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
      app.start()
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
      self.io_loop.start()
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "/opt/anaconda3/envs/tf/lib/python3.11/asyncio/base_events.py", line 607, in run_forever
      self._run_once()
    File "/opt/anaconda3/envs/tf/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once
      handle._run()
    File "/opt/anaconda3/envs/tf/lib/python3.11/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
      await result
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 359, in execute_request
      await super().execute_request(stream, ident, parent)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 446, in do_execute
      res = shell.run_cell(
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell
      result = self._run_cell(
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell
      result = runner(coro)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/jt/_ss6h9nd2l7g209tcwt1fr6h0000gn/T/ipykernel_73659/2971589738.py", line 9, in <module>
      predictions = lstm_model.predict(reddit_data_prediction)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/training.py", line 2382, in predict
      tmp_batch_outputs = self.predict_function(iterator)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/training.py", line 2169, in predict_function
      return step_function(self, iterator)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/training.py", line 2155, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/training.py", line 2143, in run_step
      outputs = model.predict_step(data)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/training.py", line 2111, in predict_step
      return self(x, training=False)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/training.py", line 558, in __call__
      return super().__call__(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/sequential.py", line 412, in call
      return super().call(inputs, training=training, mask=mask)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/functional.py", line 512, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/functional.py", line 669, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/opt/anaconda3/envs/tf/lib/python3.11/site-packages/keras/layers/core/embedding.py", line 272, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'sequential/embedding/embedding_lookup'
indices[31,99] = 114950 is not in [0, 81449)
	 [[{{node sequential/embedding/embedding_lookup}}]] [Op:__inference_predict_function_19850]

In [None]:
# Convert predictions to class labels
predicted_labels = np.argmax(predictions, axis=1)