In [None]:
# Dependencies
!pip install praw
!pip install nltk
!pip install transformers
!pip install tqdm

In [None]:
# Imports

#get data
import praw
import webbrowser
import os

#data manipulation
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

#data pre processing
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from operator import index
from transformers import pipeline

#model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from google.colab import drive

#save model and tokenizer
from tensorflow.keras.models import load_model
import pickle

In [None]:
# Credentials are deliberately not stored in the notebook. Export them in the
# environment that starts the kernel before running this cell:
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT   (required)
#   REDDIT_REDIRECT_URI                                        (optional)
# A missing required variable raises KeyError naming the variable.
client_id = os.environ["REDDIT_CLIENT_ID"]
client_secret = os.environ["REDDIT_CLIENT_SECRET"]
user_agent = os.environ["REDDIT_USER_AGENT"]
redirect_uri = os.environ.get("REDDIT_REDIRECT_URI", "http://localhost:8080")

In [None]:
# Build the PRAW client from the credential variables
# (client_id, client_secret, user_agent, redirect_uri).
reddit_settings = {
    "client_id": client_id,
    "client_secret": client_secret,
    "user_agent": user_agent,
    "redirect_uri": redirect_uri,
}
reddit = praw.Reddit(**reddit_settings)

# Scopes: "identity" to access the user's account information, "read" to read public data.
requested_scopes = ["identity", "read"]
auth_url = reddit.auth.url(scopes=requested_scopes, state="random_string")
print(f"Abra este URL no seu navegador para autorizar o acesso: {auth_url}")

# Open the authorization URL in the browser.
webbrowser.open(auth_url)


In [None]:
# The authorization code is a secret tied to one browser approval, so it is asked
# for at run time instead of being committed with the notebook. After approving
# access in the browser, copy the `code` query parameter from the redirect URL.
import getpass

authorization_code = getpass.getpass("Reddit authorization code: ").strip()

In [None]:
# Exchange the authorization code for a session; a failure is reported, not raised.
try:
    reddit.auth.authorize(authorization_code)
except Exception as auth_error:
    print(f"Erro na autorização: {auth_error}")
else:
    print("Autorização bem-sucedida!")

In [None]:
# Search configuration: the subreddit to query, the list of query strings,
# and the per-query result limit.
subreddit_name = "climatechange"
search_terms = [
    # general climate vocabulary
    "climate change",
    "global warming",
    "climate crisis",
    "climate emergency",
    "environment",
    "sustainability",
    "ecology",
    "climate",
    "planet",
    "atmosphere",
    # causes
    "greenhouse effect",
    "carbon emissions",
    "greenhouse gases",
    "fossil fuels",
    "oil",
    "coal",
    "natural gas",
    "deforestation",
    "wildfires",
    "agriculture methane emissions",
    "industrialization",
    "overconsumption",
    # impacts
    "extreme weather events",
    "drought",
    "flooding",
    "storm",
    "hurricane",
    "cyclone",
    "heatwave",
    "forest fires",
    "sea level rise",
    "melting glaciers",
    "biodiversity loss",
    "species extinction",
    "ocean acidification",
    "food security",
    "water scarcity",
    "climate migration",
    "health impacts climate change",
    # energy and mitigation
    "renewable energy",
    "solar power",
    "wind power",
    "hydroelectric power",
    "geothermal energy",
    "biomass energy",
    "energy transition",
    "decarbonization",
    "carbon neutrality",
    "circular economy",
    "energy efficiency",
    "carbon capture",
    "reforestation",
    "sustainable agriculture",
    "electric mobility",
    "green hydrogen",
    # policy, activism and actors
    "Paris Agreement",
    "COP",
    "COP26",
    "COP27",
    "COP28",
    "COP29",
    "COP30",
    "climate policies",
    "environmental legislation",
    "climate activism",
    "climate justice",
    "climate action",
    "government climate change",
    "companies sustainability",
    "NGOs environment",
    "IPCC reports",
    # skepticism and misinformation
    "climate skepticism",
    "climate denial",
    "climate misinformation",
    "climate conspiracy theories",
    # hashtags
    "#ClimateChange",
    "#GlobalWarming",
    "#ClimateCrisis",
    "#ClimateEmergency",
    "#Environment",
    "#Sustainability",
    "#Ecology",
    "#Climate",
    "#ActOnClimate",
    "#SaveOurPlanet",
    "#GreenNewDeal",
    "#RenewableEnergy",
    "#NetZero",
    "#FridaysForFuture",
    "#ExtinctionRebellion",
    "#ClimateJustice"
]
limit = 10  # maximum number of submissions requested per search term

# One dict per search hit. The same submission can be returned for several
# terms, so this list may contain repeated post ids.
collected_posts = []

for term in search_terms:
    print(f"Searching for: '{term}' in r/{subreddit_name}")
    # Each term is guarded separately so that one failing query does not
    # discard the searches for all the remaining terms. Hits appended before
    # the failure are kept.
    try:
        subreddit = reddit.subreddit(subreddit_name)
        for submission in subreddit.search(query=term, sort="relevance", limit=limit):
            post_data = {
                "id": submission.id,
                "title": submission.title,
                "author": str(submission.author),
                "url": submission.url,
                "created_utc": submission.created_utc,
                "selftext": submission.selftext,  # post content (if it is a text post)
                "subreddit": subreddit_name,
                "search_term": term
            }
            collected_posts.append(post_data)
    except Exception as e:
        print(f"Error during collect: {e}")
        continue
    print(f"Collected {len(collected_posts)} posts until now.")

print("\nCollected data:")
for post in collected_posts:
    print(f"ID: {post['id']}, Título: {post['title']}")

In [None]:
# Build a DataFrame from the list of post dictionaries (one row per collected hit).
df = pd.DataFrame(collected_posts)

In [None]:
# Preview the first rows of the collected posts.
df.head()

In [None]:
# Column dtypes and non-null counts of the collected posts.
df.info()

In [None]:
# Number of collected rows per search term (computed before duplicates are dropped).
print(df['search_term'].value_counts())

In [None]:
# Drop posts returned by more than one search term, keeping the first hit.
# Duplicates are detected on the unique post id. The result is assigned back
# (no in-place mutation) and the index is rebuilt so that row labels match row
# positions, which the positional arrays built later rely on.
df = df.drop_duplicates(subset='id', keep='first').reset_index(drop=True)

In [None]:
# Replace missing "selftext" values with an empty string so that combining
# "title" + "selftext" never produces NaN.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# a selected column is chained assignment, which pandas warns about and which
# does not update `df` when Copy-on-Write is enabled.
df['selftext'] = df['selftext'].fillna('')

In [None]:
# Combine title and body into one text. The separator is a space so that the last
# word of the title does not fuse with the first word of the body.
df['combined_text_title&selftext'] = df['title'] + ' ' + df['selftext']

In [None]:
# Lowercase the combined text.
df['combined_text_title&selftext'] = df['combined_text_title&selftext'].str.lower()

In [None]:
# Remove every character that is not a lowercase letter, a digit or whitespace.
# The character class must be closed exactly once: with a second "]" after it,
# the pattern only matches a disallowed character that is immediately followed
# by a literal "]", which leaves practically all punctuation in place.
df['combined_text_title&selftext'] = df['combined_text_title&selftext'].apply(lambda x: re.sub(r'[^a-z0-9\s]', '', x))

In [None]:
# Split each cleaned text into word tokens with NLTK.
# The "punkt_tab" resource is downloaded first.
nltk.download('punkt_tab')

cleaned_texts = df['combined_text_title&selftext']
df['tokens'] = cleaned_texts.apply(word_tokenize)

In [None]:
# Remove English stopwords (words without significant meaning) from every token list.
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in `stop_words`, in their original order."""
    return [word for word in tokens if word not in stop_words]


df['tokens_cleaned'] = df['tokens'].apply(_without_stopwords)

In [None]:
# Lemmatize every remaining token (reduce each word to its base form).
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def _lemmatize_all(tokens):
    """Return a new list with each token passed through `lemmatizer.lemmatize`."""
    return [lemmatizer.lemmatize(word) for word in tokens]


df['lemmatized_tokens'] = df['tokens_cleaned'].apply(_lemmatize_all)

# EMBEDDINGS

In [None]:
# downloading embeddings files
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d embeddings

embeddings_index = {}
with open('embeddings/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

In [None]:
# Fit the Keras tokenizer on the lemmatized documents (tokens re-joined with spaces).
lemmatized_texts = df['lemmatized_tokens'].apply(' '.join)
tokenizer = Tokenizer(num_words=None)  # num_words=None keeps every word
tokenizer.fit_on_texts(lemmatized_texts)
word_index = tokenizer.word_index
print(f'Vocabulary length: {len(word_index)}')

# Padding length: the token count of the longest lemmatized document.
MAX_SEQUENCE_LENGTH = max(len(tokens) for tokens in df['lemmatized_tokens'])

# Embedding matrix: row i receives the GloVe vector of the word whose tokenizer
# index is i. Rows that are never assigned (words missing from GloVe) stay zero.
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

print(f'Shape of embedding matrix: {embedding_matrix.shape}')

In [None]:
# Turn each document into a sequence of word indices, then pad or truncate at the
# end so that all sequences have length MAX_SEQUENCE_LENGTH.
lemmatized_texts = df['lemmatized_tokens'].apply(' '.join)
sequences = tokenizer.texts_to_sequences(lemmatized_texts)
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

print(f'Shape of padded sequences: {padded_sequences.shape}')

# LSTM Model

In [None]:
# LSTM model
VOCAB_SIZE = len(tokenizer.word_index) + 1
# Both dimensions are read from the arrays built earlier instead of repeating
# literals. A hard-coded sequence length only fits one particular scrape and makes
# the model's input length disagree with `padded_sequences` on any other data.
EMBEDDING_DIM = embedding_matrix.shape[1]
MAX_SEQUENCE_LENGTH = padded_sequences.shape[1]

model = Sequential()
# Embedding layer initialised with the pre-trained matrix and frozen (trainable=False).
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, weights=[embedding_matrix], trainable=False))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: binary (positive / negative) output.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Labeling database

In [None]:
# Pin the checkpoint explicitly. Without `model=`, transformers falls back to a
# library-chosen default, so the generated labels could change between versions.
# This checkpoint emits the POSITIVE / NEGATIVE labels used for the target encoding.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# labeling

# `sentiment_pipeline` is created in the cell directly above. It is not created
# again here, because that would only load the same model a second time.

def get_sentiment(text):
    """Classify ``text`` with the global ``sentiment_pipeline``.

    The text is encoded with truncation to ``model_max_length - 2`` tokens and
    decoded back to a string (special tokens skipped); that shortened string is
    what gets classified.

    Returns:
        ``(label, score)`` from the first pipeline result, or ``(None, None)``
        when any step raises (the error is printed, not propagated).
    """
    try:
        text_tokenizer = sentiment_pipeline.tokenizer
        token_budget = text_tokenizer.model_max_length - 2
        token_ids = text_tokenizer.encode(text, truncation=True, max_length=token_budget)
        shortened_text = text_tokenizer.decode(token_ids, skip_special_tokens=True)
        first_result = sentiment_pipeline(shortened_text)[0]
        label, score = first_result['label'], first_result['score']
    except Exception as e:
        print(f"Error classifying text: {e}")
        return None, None
    return label, score

# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Classify every combined text; each result is a (label, score) tuple.
text_column = df['combined_text_title&selftext']
sentiment_results = text_column.progress_apply(get_sentiment)

# Expand the tuples into two columns, aligned on df's index.
label_score_frame = pd.DataFrame(sentiment_results.tolist(), index=df.index)
df[['sentiment_label', 'sentiment_score']] = label_score_frame

preview_columns = ['title', 'combined_text_title&selftext', 'sentiment_label', 'sentiment_score']
print(df[preview_columns].head())

In [None]:
# Convert the pipeline labels to a binary target: POSITIVE -> 1, NEGATIVE -> 0.
# Any other value (including a missing label) becomes NaN.
sentiment_mapping = {'POSITIVE':1, 'NEGATIVE':0}
df['sentiment_label_encoded'] = df['sentiment_label'].map(sentiment_mapping)

# `padded_sequences` was built from the rows of `df` in order, so the two are
# aligned by position. Fail loudly if that no longer holds.
if len(padded_sequences) != len(df):
    raise ValueError(
        f"padded_sequences has {len(padded_sequences)} rows but df has {len(df)}; "
        "re-run the tokenizing/padding cells after changing df."
    )

# Keep only the rows that received a label. A NaN target would break the
# stratified split and the binary cross-entropy loss.
labeled_mask = df['sentiment_label_encoded'].notna().to_numpy()
indexes = np.flatnonzero(labeled_mask)  # positional indices of the labeled rows

# converting into numpy arrays for Keras
X = np.array(padded_sequences)[indexes]
y = df['sentiment_label_encoded'].to_numpy()[indexes].astype('int64')

# splitting data into training and testing sets (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

# LSTM Model Training

In [None]:
# Training hyperparameters.
epochs = 10
batch_size = 32

# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are computed on the same rows as the final evaluation.
fit_options = dict(
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Evaluate on the test split; the result unpacks into the loss and the accuracy.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Loss : {loss:.4f}')
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# Save the labeled DataFrame as CSV.

# Target directory. The file path below is derived from it, so changing the
# directory in one place cannot leave the CSV pointing somewhere else.
data_path = 'data'

directory_existed = os.path.isdir(data_path)
# exist_ok=True: no error if the directory appears between the check and the call.
os.makedirs(data_path, exist_ok=True)
if directory_existed:
    print(f"Directory '{data_path}' already exists.")
else:
    print(f"Directory '{data_path}' created successfully.")

labeled_csv_path = os.path.join(data_path, 'df_labeled.csv')
df.to_csv(labeled_csv_path, index=False)
print('DataFrame saved successfully.')

In [None]:
# Save the trained model and the fitted tokenizer.

# Target directory. Both file paths below are derived from it, so changing the
# directory in one place cannot leave the files pointing somewhere else.
model_tokenizer_Path = 'model'

directory_existed = os.path.isdir(model_tokenizer_Path)
# exist_ok=True: no error if the directory appears between the check and the call.
os.makedirs(model_tokenizer_Path, exist_ok=True)
if directory_existed:
    print(f"Directory '{model_tokenizer_Path}' already exists.")
else:
    print(f"Directory '{model_tokenizer_Path}' created successfully.")

# model: a failure is reported and the cell continues with the tokenizer
try:
    model.save(os.path.join(model_tokenizer_Path, 'sentiment_model.keras'))
    print("Model saved successfully.")
except Exception as e:
    print(f"Error saving the model: {e}")

# tokenizer: success is reported only after the file has been closed
try:
    tokenizer_file = os.path.join(model_tokenizer_Path, 'tokenizer.pickle')
    with open(tokenizer_file, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Tokenizer saved successfully.')
except Exception as e:
    print(f"Error saving the tokenizer: {e}")