In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import Adam
from sklearn.metrics import f1_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

2024-05-17 16:36:29.658248: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

reddit_df = pd.read_csv('input/reddit_opinion_climate_change.csv',index_col=['comment_id'], parse_dates=['created_time', 'post_created_time'])
reddit_df.head()

Unnamed: 0_level_0,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,user_is_verified,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
l1ytazo,1,"This is the major one, people keep downplaying...",climatechange,2024-04-30 17:15:22,1cgmk4u,rednib,0,1,0,True,...,157.0,28686.0,29023.0,88,There’s a lot of different impacts of climate ...,What about climate change worries you the most...,0.82,88,0,2024-04-30 07:02:19
l1yt78f,1,Totally agree. They profited off of our pollut...,conspiracy,2024-04-30 17:14:46,1cgqzo1,quiksilver10152,0,1,0,True,...,1002.0,5513.0,6586.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09
l1yt2mc,1,I honestly believe the truth always lies somew...,conspiracy,2024-04-30 17:14:01,1cgqzo1,eco78,0,1,0,True,...,772.0,33795.0,34775.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09
l1yt0bm,1,"lol, Zuck &amp; the other billionaires are jus...",climatechange,2024-04-30 17:13:39,1cgmk4u,rednib,0,1,0,True,...,157.0,28686.0,29023.0,88,There’s a lot of different impacts of climate ...,What about climate change worries you the most...,0.82,88,0,2024-04-30 07:02:19
l1yszhw,1,&gt; Global average surface air temps exceeded...,conspiracy,2024-04-30 17:13:31,1cgqzo1,Steve-lrwin,0,1,0,True,...,13609.0,30294.0,43903.0,92,,60 years of Failed Climate Change prediction. ...,0.7,92,0,2024-04-30 11:56:09


In [3]:
df_subreddits = reddit_df['subreddit'].unique()
print(df_subreddits)

['climatechange' 'conspiracy' 'climate' 'climateskeptics' 'science'
 'ClimateShitposting' 'worldnews' 'Futurology' 'europe' 'energy'
 'environment' 'changemyview' 'ClimateActionPlan' 'news'
 'EverythingScience' 'politics' 'canada' 'GlobalClimateChange'
 'unitedkingdom' 'ClimateOffensive' 'ClimateMemes' 'CitizensClimateLobby'
 'ClimateCO' 'Climate_Nuremberg' 'climate_discussion' 'climate_science']


In [4]:
# filter reddit data to include a few interesting subreddits and limit amount of reddit data
filtered_reddit_df = reddit_df.loc[reddit_df['subreddit'].isin(['climatechange', 'climate', 'climateskeptics'])]

In [5]:
# get the text data for the comment and clean up anything missing
reddit_text_data = filtered_reddit_df.self_text
# remove potential nulls
reddit_text_data = reddit_text_data.fillna('')
# Convert all entries to strings (in case there are any non-string entries)
reddit_text_data = reddit_text_data.astype(str)


In [6]:
twitter_df = pd.read_csv('input/twitter_sentiment_data.csv')
twitter_df.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [7]:
# replacing -1 with 3 for LSTM are not working with negative values
twitter_df['sentiment'] = twitter_df['sentiment'].replace(-1, 3)


In [8]:
twitter_text_data = twitter_df['message']
twitter_text_data.head()

0    @tiniebeany climate change is an interesting h...
1    RT @NatGeoChannel: Watch #BeforeTheFlood right...
2    Fabulous! Leonardo #DiCaprio's film on #climat...
3    RT @Mick_Fanning: Just watched this amazing do...
4    RT @cnalive: Pranita Biswasi, a Lutheran from ...
Name: message, dtype: object

In [9]:
reddit_text_data.head()

comment_id
l1ytazo    This is the major one, people keep downplaying...
l1yt0bm    lol, Zuck &amp; the other billionaires are jus...
l1ysptw                   Laughable at this point really ...
l1ysmdg    Electricity is a utility; it's not a standard ...
l1ysiva    # Tesla Model S emits more lifetime CO2 in US ...
Name: self_text, dtype: object

In [10]:
# Combine the Reddit and Twitter data
combined_text_data = pd.concat([reddit_text_data, twitter_text_data], ignore_index=True)

In [11]:
#Tokenization -- fit the tokenizer on the combined twitter data and reddit data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_text_data)

In [12]:
#Tokenization -- converting the tweets into numerical tokens that can be processed by the model
sequences = tokenizer.texts_to_sequences(twitter_text_data)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

Found 160174 unique tokens.


In [13]:
# Padding sequences
max_sequence_length = 100  # Max sequence length (you may adjust this based on your data)
data = pad_sequences(sequences, maxlen=max_sequence_length)


In [14]:
labels = twitter_df['sentiment']

In [15]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [16]:
# Define LSTM model
def create_lstm_model():
    lstm_model = Sequential()
    lstm_model.add(Embedding(len(word_index) + 1, 128, input_length=max_sequence_length))
    lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    lstm_model.add(Dense(4, activation='softmax'))
    lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return lstm_model


In [17]:
# Compile LSTM model
lstm_model = create_lstm_model()

In [18]:
# Train LSTM model
lstm_model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13f2b8090>

In [19]:
# Evaluate the model on the test data
loss, accuracy = lstm_model.evaluate(X_test, y_test)

# Print the test loss and accuracy
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 1.4012432098388672
Test Accuracy: 0.695528507232666


In [20]:
# Assuming y_test are your true labels and y_pred are your predicted labels
y_pred = np.argmax(lstm_model.predict(X_test), axis=-1)  # get the class with highest probability
f1 = f1_score(y_test, y_pred, average='macro')  # use 'micro' or 'weighted' depending on your problem
print('F1 Score: ', f1)

F1 Score:  0.6464028065669452


In [21]:
# Create model
model = KerasClassifier(build_fn=create_lstm_model, epochs=10, batch_size=128, verbose=0)

# Evaluate using 5-fold cross validation
results = cross_val_score(model, X_train, y_train, cv=5)
print('Cross Validation Score: ', results.mean())

  model = KerasClassifier(build_fn=create_lstm_model, epochs=10, batch_size=128, verbose=0)


In [None]:
# Convert reddit_text_data to sequences
reddit_sequences = tokenizer.texts_to_sequences(reddit_text_data)

reddit_data_prediction = pad_sequences(reddit_sequences, maxlen=max_sequence_length)

# Check if any indices are out of range
embedding_layer = lstm_model.layers[0]
vocab_size = embedding_layer.get_weights()[0].shape[0]
if reddit_data_prediction.max() >= vocab_size:
    print(f"Error: Found word index {reddit_data_prediction.max()} but the model's vocabulary size is {vocab_size}")

# Predict
predictions = lstm_model.predict(reddit_data_prediction)




In [None]:
#Flatten the predictions
# flat_predictions = np.concatenate(predictions).ravel()

# # Convert the predictions to a Series
# predictions_series = pd.Series(flat_predictions)


# # Add the predictions as a new column to the DataFrame
# reddit_text_data['Predicted_Labels'] = predictions_series
# reddit_text_data.head()