In [1]:
import torch
import json
import pandas as pd
import logging

In [2]:
logging.basicConfig(level = logging.INFO, format = '%(asctime)s - %(levelname)s - %(message)s')

In [3]:
# import nbformat
# from nbconvert import PythonExporter

# with open('model_training.ipynb', 'r', encoding='utf-8') as f:
#     nb_content = nbformat.read(f, as_version=4)

# exporter = PythonExporter()

# script, _ = exporter.from_notebook_node(nb_content)

# with open('model_training.py', 'w', encoding='utf-8') as f:
#     f.write(script)


In [4]:
try:
    with open('hyperparameters.json', 'r') as f:
        hyperparameters = json.load(f)

except FileNotFoundError:
    logging.info('file not found')
    raise

except json.JSONDecodeError:
    logging.info('json decode error')
    raise

In [5]:
from model_training import EmbeddingClassifier

In [6]:
input_size = 768
hidden_size = 225
output_size = 2  


model = EmbeddingClassifier(input_size=input_size, hidden_size=hidden_size, output_size=output_size)
model.load_state_dict(torch.load('best_model.pth'))
model.eval() 

  model.load_state_dict(torch.load('best_model.pth'))


EmbeddingClassifier(
  (fc1): Linear(in_features=768, out_features=225, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=225, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

In [7]:
import numpy as np

df = pd.read_csv('df0_with_embeddings.csv')
def convert_to_array(embedding_str):
    embedding_list = embedding_str.replace('[', '').replace(']', '').split()
    return np.array(embedding_list, dtype=float)

df['embeddings'] = df['embeddings'].apply(convert_to_array)

In [8]:
# single_embedding = df['embeddings'][0]
# single_embedding_tensor = torch.tensor(single_embedding).unsqueeze(0).float()  # Convert to float32

# model.eval()

# with torch.no_grad():
#     output = model(single_embedding_tensor)
#     probabilities = torch.softmax(output, dim=1)
#     predicted_class = torch.argmax(probabilities, dim=1)

# predicted_class = predicted_class.item()
# print(f"Predicted class for the first embedding: {predicted_class}")


In [9]:
import logging
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import unicodedata
import contractions
from nltk import pos_tag

class TextPreprocessor:
    def __init__(self, model_name='yiyanghkust/finbert-tone', custom_stopwords=None):
        self.custom_stopwords = set(custom_stopwords) if custom_stopwords else set([
            'a', 'an', 'the', 'and', 'but', 'or', 'if', 'because', 'as', 'while', 
            'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 
            'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 
            'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 
            'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 
            'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 
            'too', 'very'
        ])
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()

    def lowercase_text(self, text):
        def process_word(word):
            if 2 <= len(word) <= 10 and word.isupper():
                return word
            else:
                return word.lower()
        
        words_in_text = text.split()
        processed_words = [process_word(word) for word in words_in_text]
        
        return ' '.join(processed_words)

    def remove_punctuation(self, text, keep_punctuation=None):
        if keep_punctuation is None:
            keep_punctuation = {'.', ',', ':', '-', "'", '(', ')', '/', '?', '!'}
            
        text = re.sub(r'[\n\t]', ' ', text)
        pattern = r'[^\w\s' + ''.join(re.escape(p) for p in keep_punctuation) + r']'
        text = re.sub(pattern, ' ', text)
        return text

    def remove_html_urls(self, s):
        try:
            if '<' in s and '>' in s:
                soup = BeautifulSoup(s, 'html.parser')
                cleaned_text = soup.get_text(separator=' ')
            else:
                cleaned_text = s

            url_pattern = re.compile(r'https?://\S+|www\.\S+')
            cleaned_text = re.sub(url_pattern, '', cleaned_text)

            return cleaned_text
        except Exception as e:
            logging.error(f"Error in remove_html_urls: {str(e)}")
            return s

    def expand_contractions(self, text):
        try:
            expanded_text = contractions.fix(text)
            return expanded_text
        except Exception as e:
            logging.error(f"Error in expand_contractions: {str(e)}")
            return text

    def remove_extra_whitespace(self, text):
        try:
            text = re.sub(r'\s+', ' ', text).strip()
            return text
        except Exception as e:
            logging.error(f"Error in remove_extra_whitespace: {str(e)}")
            return text

    def remove_stopwords(self, text):
        words = word_tokenize(text)
        filtered_words = [word for word in words if word.lower() not in self.custom_stopwords]
        return ' '.join(filtered_words)

    def normalize_text(self, text):
        try:
            normalized_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
            return normalized_text
        except Exception as e:
            logging.error(f"Error in normalize_text: {str(e)}")
            return text

    def lemmatizer_with_pos(self, s):
        try:
            words = word_tokenize(s)
            tagged_words = pos_tag(words)
            lemmatized_words = []
            
            for word, tag in tagged_words:
                wn_pos = self.get_wordnet_pos(tag) or 'n'
                lemmatized_word = self.lemmatizer.lemmatize(word, pos=wn_pos)
                lemmatized_words.append(lemmatized_word)
            
            return ' '.join(lemmatized_words)
        except Exception as e:
            logging.error(f"Error in lemmatizer_with_pos: {str(e)}")
            return s

    def get_wordnet_pos(self, tag):
        if tag.startswith('J'):
            return 'a'
        elif tag.startswith('V'):
            return 'v'
        elif tag.startswith('N'):
            return 'n'
        elif tag.startswith('R'):
            return 'r'
        else:
            return None

    def preprocess_text(self, text):
        try:
            if not text or text.strip() == '':
                return ''
            
            text = self.expand_contractions(text)
            text = self.remove_html_urls(text)
            text = self.remove_punctuation(text)
            text = self.remove_extra_whitespace(text)
            text = self.normalize_text(text)
            text = self.lemmatizer_with_pos(text)
            text = self.remove_stopwords(text)
            text = self.lowercase_text(text)
            
            return text
        except Exception as e:
            logging.error(f"Error during preprocessing: {str(e)}")
            return text

    def generate_embeddings(self, texts, batch_size=32, max_length=512):
        all_embeddings = []
        num_batches = (len(texts) + batch_size - 1) // batch_size 

        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
            try:
                batch_texts = texts[i:i + batch_size]
                logging.info(f"Processing batch {i // batch_size + 1} with {len(batch_texts)} texts.")

                inputs = self.tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=max_length)

                with torch.no_grad():
                    outputs = self.model(**inputs)

                batch_embeddings = outputs.last_hidden_state[:, 0, :]
                all_embeddings.append(batch_embeddings)

                logging.info(f"Batch {i // batch_size + 1} processed successfully.")
            
            except Exception as e:
                logging.error(f"Error processing batch {i // batch_size + 1}: {e}")

        all_embeddings = torch.cat(all_embeddings, dim=0)
        return all_embeddings

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    
    text_preprocessor = TextPreprocessor()
    sample_text = "The stock price of Apple Inc. (AAPL) is expected to rise in the next two years."
    
    processed_text = text_preprocessor.preprocess_text(sample_text)
    print(f"Processed Text: {processed_text}")
    
    texts = [sample_text]
    embeddings = text_preprocessor.generate_embeddings(texts)
    print(f"Embeddings Shape: {embeddings.shape}")




Processed Text: stock price apple inc. ( AAPL ) be expect rise next two year .


Processing batches:   0%|                                 | 0/1 [00:00<?, ?it/s]2024-10-01 12:49:41,532 - INFO - Processing batch 1 with 1 texts.
2024-10-01 12:49:42,057 - INFO - Batch 1 processed successfully.
Processing batches: 100%|█████████████████████████| 1/1 [00:00<00:00,  1.90it/s]

Embeddings Shape: torch.Size([1, 768])





In [10]:
# !pip install praw

In [11]:
# import json
# import os
# import praw
# import torch
# import logging


# BATCH_SIZE = 100
# MAX_ROWS = 1000


# def init_reddit_client():
#     try:
#         reddit = praw.Reddit(
#             client_id='CudIQ0m7vQerzS0wAyQzIA',
#             client_secret='llSZVoIpkbt2LAnBDIXtNeTreuCdzg',
#             user_agent='ubuntu:RedditScraper:v1.0 (by /u/Playful-Jellyfish834)'
#         )
#         logging.info("Initialized Reddit client successfully.")
#         return reddit
#     except Exception as e:
#         logging.error(f"Error initializing Reddit client: {e}")
#         raise


# def scrape_reddit_data(reddit, subreddit_names):
#     all_texts = []
#     dataset_file = 'reddit_data.json'


#     if os.path.exists(dataset_file):
#         try:
#             with open(dataset_file, 'r') as f:
#                 all_texts = json.load(f)
#                 logging.info(f"Loaded existing dataset with {len(all_texts)} texts.")
#         except Exception as e:
#             logging.error(f"Error loading dataset: {e}")


#     for subreddit_name in subreddit_names:
#         try:
#             subreddit = reddit.subreddit(subreddit_name)
#             logging.info(f"Scraping subreddit: {subreddit_name}")

#             # print('aaa')
#             for submission in subreddit.new(limit=BATCH_SIZE):
#                 if len(all_texts) >= MAX_ROWS:
#                     logging.info("Reached maximum rows limit.")
#                     break


#                 text = submission.title + " " + submission.selftext
#                 all_texts.append(text)

#             print('bbbb')
#             for i in range(0, len(all_texts), BATCH_SIZE):
#                 if len(all_texts) >= MAX_ROWS:
#                     break
                
#                 batch_texts = all_texts[i:i + BATCH_SIZE]
#                 processed_texts = [text_preprocessor.preprocess_text(text) for text in batch_texts]

#                 embeddings = text_preprocessor.generate_embeddings(processed_texts)

#                 for idx, embedding in enumerate(embeddings):
#                     try:
#                         single_embedding_tensor = torch.tensor(embedding).unsqueeze(0).float()  # Convert to float32

#                         model.eval()
#                         with torch.no_grad():
#                             output = model(single_embedding_tensor)
#                             probabilities = torch.softmax(output, dim=1)
#                             predicted_class = torch.argmax(probabilities, dim=1).item()

#                         if predicted_class == 1:
#                             if len(all_texts) < MAX_ROWS:
#                                 relevant_text = processed_texts[idx]
#                                 all_texts[i + idx] = relevant_text 
#                                 logging.info(f"Stored relevant text from batch {i // BATCH_SIZE}: {relevant_text[:30]}...")

#                     except Exception as e:
#                         logging.error(f"Error processing embedding at index {idx}: {e}")

#             try:
#                 with open(dataset_file, 'w') as f:
#                     json.dump(all_texts[:MAX_ROWS], f) 
#                     logging.info(f"Saved dataset with {len(all_texts[:MAX_ROWS])} texts to {dataset_file}.")
#             except Exception as e:
#                 logging.error(f"Error saving dataset: {e}")

#         except Exception as e:
#             logging.error(f"Error scraping subreddit {subreddit_name}: {e}")

#     try:
#         with open(dataset_file, 'w') as f:
#             json.dump(all_texts[:MAX_ROWS], f)  # Ensure final save respects MAX_ROWS
#             logging.info(f"Final save completed. Total relevant texts stored: {len(all_texts[:MAX_ROWS])}")
#     except Exception as e:
#         logging.error(f"Error during final save: {e}")

#     return all_texts[:MAX_ROWS]



In [12]:
# !pip install --upgrade praw

In [13]:
import json
import os
import praw
import torch
import logging
import time
import prawcore

BATCH_SIZE = 100
MAX_ROWS = 1000


In [20]:
def init_reddit_client():
    try:
        reddit = praw.Reddit(
            client_id='CudIQ0m7vQerzS0wAyQzIA',
            client_secret='llSZVoIpkbt2LAnBDIXtNeTreuCdzg',
            user_agent='ubuntu:RedditScraper:v1.0 (by /u/Playful-Jellyfish834)'
        )
        logging.info("Initialized Reddit client successfully.")
        return reddit
    except Exception as e:
        logging.error(f"Error initializing Reddit client: {e}")
        raise

In [21]:
def load_existing_data(dataset_file='reddit_data.json'):
    all_texts = []
    
    if os.path.exists(dataset_file):
        try:
            with open(dataset_file, 'r') as f:
                all_texts = json.load(f)
                logging.info(f"Loaded existing dataset with {len(all_texts)} texts.")
        except Exception as e:
            logging.error(f"Error loading dataset: {e}")
    
    return all_texts

In [22]:
def save_data(all_texts, dataset_file='reddit_data.json'):
    try:
        with open(dataset_file, 'w') as f:
            json.dump(all_texts[:MAX_ROWS], f)
            logging.info(f"Saved dataset with {len(all_texts[:MAX_ROWS])} texts to {dataset_file}.")
    except Exception as e:
        logging.error(f"Error saving dataset: {e}")

In [23]:
def scrape_reddit_data(reddit, subreddit_names):
    all_texts = load_existing_data()

    for subreddit_name in subreddit_names:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            logging.info(f"Scraping subreddit: {subreddit_name}")
            
            for submission in subreddit.hot(limit=BATCH_SIZE):
                if len(all_texts) >= MAX_ROWS:
                    logging.info("Reached maximum rows limit.")
                    break

                text = submission.title + " " + submission.selftext

                submission.comments.replace_more(limit=0)
                best_reply = ""
                if submission.comments:
                    best_reply = submission.comments[0].body  # Assuming first comment is the best

                all_texts.append({"text": text, "best_reply": best_reply})

            # Process the batches of texts and best replies
            process_batches(all_texts)

            save_data(all_texts)
            time.sleep(2)

        except prawcore.exceptions.NotFound:
            logging.error(f"Subreddit {subreddit_name} not found or inaccessible.")
        except prawcore.exceptions.Forbidden:
            logging.error(f"Access to subreddit {subreddit_name} is forbidden.")
        except prawcore.exceptions.RequestException as e:
            logging.error(f"Request error while accessing {subreddit_name}: {e}")
        except Exception as e:
            logging.error(f"Error scraping subreddit {subreddit_name}: {e}")

    save_data(all_texts)

    return all_texts[:MAX_ROWS]

In [24]:
def process_batches(all_texts):
    for i in range(0, len(all_texts), BATCH_SIZE):
        if len(all_texts) >= MAX_ROWS:
            break

        batch_texts = all_texts[i:i + BATCH_SIZE]
        processed_texts = [text_preprocessor.preprocess_text(item["text"]) for item in batch_texts]
        processed_replies = [text_preprocessor.preprocess_text(item["best_reply"]) for item in batch_texts]

        # Generate embeddings for both texts and replies
        text_embeddings = text_preprocessor.generate_embeddings(processed_texts)
        reply_embeddings = text_preprocessor.generate_embeddings(processed_replies)

        for idx, (text_embedding, reply_embedding) in enumerate(zip(text_embeddings, reply_embeddings)):
            try:
                # Process the text embedding
                single_text_embedding = torch.tensor(text_embedding).unsqueeze(0).float()  # Convert to float32
                model.eval()
                with torch.no_grad():
                    text_output = model(single_text_embedding)
                    text_probabilities = torch.softmax(text_output, dim=1)
                    text_predicted_class = torch.argmax(text_probabilities, dim=1).item()

                # Process the reply embedding
                single_reply_embedding = torch.tensor(reply_embedding).unsqueeze(0).float()  # Convert to float32
                with torch.no_grad():
                    reply_output = model(single_reply_embedding)
                    reply_probabilities = torch.softmax(reply_output, dim=1)
                    reply_predicted_class = torch.argmax(reply_probabilities, dim=1).item()

                # Store relevant text and reply if their predicted class is relevant
                if text_predicted_class == 1 or reply_predicted_class == 1:
                    if len(all_texts) < MAX_ROWS:
                        relevant_text = processed_texts[idx]
                        relevant_reply = processed_replies[idx]
                        all_texts[i + idx] = {"text": relevant_text, "best_reply": relevant_reply}
                        logging.info(f"Stored relevant text and reply from batch {i // BATCH_SIZE}: {relevant_text[:30]}..., {relevant_reply[:30]}...")

            except Exception as e:
                logging.error(f"Error processing embedding at index {idx}: {e}")


In [26]:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Initialize text preprocessor and model (assuming they're already defined elsewhere)
    text_preprocessor = TextPreprocessor()  # Placeholder for loading your preprocessor

    reddit_client = init_reddit_client()

    financial_subreddits = [
        "stocks",
        "investing",
        "personalfinance",
        "finance",
        "StockMarket",
        "valueinvesting",
        "financialindependence",
        "ETF",
        "dividends",
        "business",
        "Economics",
        "investing_discussion"
    ]

    scraped_data = scrape_reddit_data(reddit_client, financial_subreddits)
    print(f"Scraping completed. Total relevant texts stored: {len(scraped_data)}")