In [75]:
#imports and variables
import datetime
import praw
import csv
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import shutil

from credentials import p_id, p_agent, p_secret, p_pass

# Maximum number of submissions requested when listing a subreddit's newest posts.
total_post_limit = 5_000

In [6]:
# Open an authenticated Reddit API session using the values imported from
# the local credentials module.
# Note: `p_agent` is the value supplied as the account username here, while
# the user agent string is the fixed literal below.
reddit = praw.Reddit(
    user_agent='scrapper_bot',
    client_id=p_id,
    client_secret=p_secret,
    username=p_agent,
    password=p_pass,
)

# Authentication check: ask the API which account this session belongs to.
print("Logged in as Reddit user: {}".format(reddit.user.me()))

# Date helper: turn a post's creation timestamp into a datetime object.
def get_date(submission):
    """Return the creation time of ``submission`` as a ``datetime.datetime``.

    The value of ``submission.created`` is passed straight to
    ``datetime.datetime.fromtimestamp``, so the result is a naive datetime
    expressed in the local timezone of the machine running the notebook.
    """
    return datetime.datetime.fromtimestamp(submission.created)

Logged in as Reddit user: Adopolis23


In [44]:
# Download price history for the tickers below, from start_date up to the
# moment this cell runs, and keep only the closing prices.
tickers = ["TSLA"]
start_date = datetime.datetime(2024, 6, 1)
stock_price = yf.download(
    tickers,
    start=start_date,
    end=datetime.datetime.now(),
)['Close']

# Plain calendar dates (datetime.date) of every row that came back; used
# later to decide which Reddit posts are kept.
stock_data_dates = [timestamp.date() for timestamp in stock_price.index]


[*********************100%%**********************]  1 of 1 completed


In [73]:
# Collected context windows, keyed by the calendar date of the post:
#   {datetime.date: [[word, word, ...], ...]}
data = {}
sub_name = 'Stocks'
terms_of_interest = ['TSLA', 'Tesla', 'TESLA']
# Target number of words kept around each matched term.
buffer_size = 20


def _context_windows(text, terms, window):
    """Return the word window around every occurrence of a term in ``text``.

    Args:
        text: Raw post text; it is split on whitespace.
        terms: Collection of exact (case-sensitive) terms to look for.
        window: Target window length in words; ``window // 2`` words are
            taken before the match, and the rest (starting with the match
            itself) after it.

    Returns:
        A list of word lists, one per match, in order of appearance. Words
        are returned as they appear in ``text`` (punctuation is only stripped
        for the comparison, not from the stored words).
    """
    words = text.split()
    half = window // 2
    # Characters removed from a word before comparing it to the terms.
    strip_punctuation = str.maketrans('', '', '.[,?]')

    windows = []
    for i, word in enumerate(words):
        if word.translate(strip_punctuation) not in terms:
            continue

        left_side = max(i - half, 0)
        # The slice end is exclusive, so clamp to len(words) (not len - 1);
        # otherwise the final word -- possibly the match itself -- is lost.
        # max(..., i + 1) keeps the match in the window even for tiny sizes.
        right_side = min(max(i + half, i + 1), len(words))
        windows.append(words[left_side:right_side])
    return windows


# Subreddit to search
subreddit = reddit.subreddit(sub_name)

# A set gives constant-time membership checks for every post.
trading_days = set(stock_data_dates)
earliest_trading_day = min(trading_days, default=None)

# NOTE(review): Reddit listings are reportedly capped at roughly 1000 items,
# so a total_post_limit above that may have no effect -- confirm.
for submission in subreddit.new(limit=total_post_limit):
    submission_date = get_date(submission).date()

    # This early exit has to come BEFORE the membership test: a date older
    # than the first trading day is never a trading day, so testing
    # membership first would skip it and the loop would never stop early.
    # Assumes the listing is ordered newest-first -- TODO confirm.
    if earliest_trading_day is None or submission_date < earliest_trading_day:
        break

    # make sure it is on one of the days we are checking
    if submission_date not in trading_days:
        continue

    total_text = submission.title + ' ' + submission.selftext

    # Every accepted date gets an entry, even when no term matched.
    data.setdefault(submission_date, []).extend(
        _context_windows(total_text, terms_of_interest, buffer_size)
    )
            
            

        


In [78]:
# Settings shared by the dataset-loading cells.
seed = 42                     # fixed so the train/validation split is reproducible
batch_size = 32               # batch size handed to the text dataset loader
AUTOTUNE = tf.data.AUTOTUNE   # passed as the prefetch buffer size
BUFFER_SIZE = 10_000

In [76]:
# Fetch the IMDB review archive into the current directory and unpack it.
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

# The unpacked data is expected in an 'aclImdb' folder next to the path
# returned by get_file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
# Only delete when the folder is actually there: if this cell is re-run and
# the archive is not re-extracted, 'unsup' is already gone and an
# unconditional rmtree would raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 0us/step


In [79]:

# Training subset: 80% of aclImdb/train (20000 reviews).
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size)

# Class names are read from the dataset object before cache/prefetch is applied.
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Validation subset: the other 20% of aclImdb/train (5000 reviews), drawn
# with the same seed and split fraction as the training subset.
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
).cache().prefetch(buffer_size=AUTOTUNE)

# Test set: all 25000 reviews under aclImdb/test.
test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size,
).cache().prefetch(buffer_size=AUTOTUNE)

print(class_names)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
['neg', 'pos']


In [81]:
# Text vectorizer limited to VOCAB_SIZE tokens.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Fit the vocabulary on the training text only; the labels are dropped.
encoder.adapt(train_ds.map(lambda review, _label: review))

# Display the first 20 vocabulary entries.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [82]:
# Assemble the classifier layer by layer:
# text -> token ids -> embeddings -> bidirectional LSTM -> dense head.
model = tf.keras.Sequential()
model.add(encoder)
model.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Single output unit with no activation.
model.add(tf.keras.layers.Dense(1))

In [84]:
# The loss is configured with from_logits=True, i.e. it is given the model's
# raw output rather than a probability.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [87]:
# Train for a single epoch, with validation_steps=1 for the validation pass.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    validation_steps=1,
    epochs=1,
    callbacks=[],
)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 212ms/step - accuracy: 0.5357 - loss: 0.6691 - val_accuracy: 0.7521 - val_loss: 0.4628


In [88]:
# Load previously saved weights from disk into the model built above.
model.load_weights(filepath='rnn/Models/model1.weights.h5')

In [101]:
# Take one stored context window (index 6) from the posts dated 2024-06-25
# and join its words back into a single string.
sample_text = " ".join(data[datetime.date(2024, 6, 25)][6])
print(sample_text)

# Wrap the string in a one-element object array and run it through the model.
# The model's last layer is Dense(1) without an activation, so the printed
# value is a raw score rather than a probability.
prediction = model.predict(np.array([sample_text], dtype=object))
print(prediction[0])


end. With just one week left in the second quarter, Tesla China insurance registrations are up about 14 percent from
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[-0.88435125]
