In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from keras.optimizers import Adam

In [30]:

df = pd.read_csv('input/reddit_opinion_climate_change.csv',index_col=['comment_id'], parse_dates=['created_time', 'post_created_time'])
df.head()

Unnamed: 0_level_0,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,user_is_verified,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
l1ytazo,1,"This is the major one, people keep downplaying...",climatechange,2024-04-30 17:15:22,1cgmk4u,rednib,0,1,0,True,...,157.0,28686.0,29023.0,88,There’s a lot of different impacts of climate ...,What about climate change worries you the most...,0.82,88,0,2024-04-30 07:02:19
l1yt78f,1,Totally agree. They profited off of our pollut...,conspiracy,2024-04-30 17:14:46,1cgqzo1,quiksilver10152,0,1,0,True,...,1002.0,5513.0,6586.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09
l1yt2mc,1,I honestly believe the truth always lies somew...,conspiracy,2024-04-30 17:14:01,1cgqzo1,eco78,0,1,0,True,...,772.0,33795.0,34775.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09
l1yt0bm,1,"lol, Zuck &amp; the other billionaires are jus...",climatechange,2024-04-30 17:13:39,1cgmk4u,rednib,0,1,0,True,...,157.0,28686.0,29023.0,88,There’s a lot of different impacts of climate ...,What about climate change worries you the most...,0.82,88,0,2024-04-30 07:02:19
l1yszhw,1,&gt; Global average surface air temps exceeded...,conspiracy,2024-04-30 17:13:31,1cgqzo1,Steve-lrwin,0,1,0,True,...,13609.0,30294.0,43903.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09


In [31]:
# Assuming df is your Pandas DataFrame
values_to_filter = ['climate', 'climatechange', 'climateskeptics']
filtered_df = df[df['subreddit'].isin(values_to_filter)]
filtered_df.count()

score                         156855
self_text                     156853
subreddit                     156855
created_time                  156855
post_id                       156855
author_name                   156855
controversiality              156855
ups                           156855
downs                         156855
user_is_verified              156855
user_account_created_time     152579
user_awardee_karma            156844
user_awarder_karma            156844
user_link_karma               156844
user_comment_karma            156844
user_total_karma              156844
post_score                    156855
post_self_text                 71104
post_title                    156855
post_upvote_ratio             156855
post_thumbs_ups               156855
post_total_awards_received    156855
post_created_time             156855
dtype: int64

In [32]:
df_subreddits = df['subreddit'].unique()
print(df_subreddits)

['climatechange' 'conspiracy' 'climate' 'climateskeptics' 'science'
 'ClimateShitposting' 'worldnews' 'Futurology' 'europe' 'energy'
 'environment' 'changemyview' 'ClimateActionPlan' 'news'
 'EverythingScience' 'politics' 'canada' 'GlobalClimateChange'
 'unitedkingdom' 'ClimateOffensive' 'ClimateMemes' 'CitizensClimateLobby'
 'ClimateCO' 'Climate_Nuremberg' 'climate_discussion' 'climate_science']


In [33]:
twitter_df = pd.read_csv('input/twitter_sentiment_data.csv')
twitter_df.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [34]:
# replacing -1 with 3 for LSTM are not working with negative values
twitter_df['sentiment'] = twitter_df['sentiment'].replace(-1, 3)


In [35]:
text_data = twitter_df['message']
text_data.head()
print(text_data.shape)

(43943,)


In [36]:
#Tokenization -- converting the tweets into numerical tokens that can be processed by the model
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

print(text_data.iloc[1])
print(sequences[1])
print(len(sequences))

Found 81448 unique tokens.
RT @NatGeoChannel: Watch #BeforeTheFlood right here, as @LeoDiCaprio travels the world to tackle climate change https://t.co/LkDehj3tNn httÃ¢â‚¬Â¦
[6, 378, 173, 271, 112, 119, 39, 293, 553, 7, 49, 8, 195, 1, 2, 4, 3, 5, 575, 444]
43943


In [37]:
# Padding sequences
max_sequence_length = 100  # Max sequence length (you may adjust this based on your data)
data = pad_sequences(sequences, maxlen=max_sequence_length)


In [38]:
labels = twitter_df['sentiment']

In [39]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [40]:
# Define LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(len(word_index) + 1, 128, input_length=max_sequence_length))
lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(4, activation='softmax'))


In [41]:
# Compile LSTM model
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [42]:
# Train LSTM model
lstm_model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1329d5cd0>

In [43]:
# Evaluate the model on the test data
loss, accuracy = lstm_model.evaluate(X_test, y_test)

# Print the test loss and accuracy
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 1.539385437965393
Test Accuracy: 0.7100921869277954
