In [23]:
import os
import praw
import pandas as pd
from dotenv import load_dotenv
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

load_dotenv()

def fetch_reddit_sentiment(subreddits=["renewableenergy", "energy", "Texas"], 
                           keywords=["Texas wind", "Texas solar"], 
                           limit=100):
    try:
        
        client_id = os.getenv("reddit_client_id")
        client_secret = os.getenv("reddit_client_secret")
        user_agent = os.getenv("reddit_user_agent")
        username = os.getenv("username")
        password = os.getenv("password")
        username = username.strip('",\' ')
        password = password
        # Authenticate with Reddit
        reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
            username=username,
            password=password
        )
        
        # Sentiment analyzer
        analyzer = SentimentIntensityAnalyzer()
        data = []

        # Loop through subreddits and fetch matching posts
        for subreddit in subreddits:
            query = " OR ".join(keywords)
            for submission in reddit.subreddit(subreddit).search(query, limit=limit):
                combined_text = (submission.title or "") + " " + (submission.selftext or "")
                sentiment = analyzer.polarity_scores(combined_text)
                data.append({
                    'subreddit': subreddit,
                    'date': pd.to_datetime(submission.created_utc, unit='s'),
                    'text': combined_text,
                    'sentiment_score': sentiment['compound'],
                    'title': submission.title,
                    'url': submission.url
                })

        return pd.DataFrame(data)

    except Exception as e:
        print(f"Error fetching Reddit data: {e}")
        return pd.DataFrame()


In [24]:
df = fetch_reddit_sentiment()
df.head()

Unnamed: 0,subreddit,date,text,sentiment_score,title,url
0,renewableenergy,2025-01-13 15:02:26,"Texas leads U.S. in wind, solar, No. 2 in batt...",0.2732,"Texas leads U.S. in wind, solar, No. 2 in batt...",https://www.chron.com/news/houston-texas/artic...
1,renewableenergy,2025-05-02 04:12:44,Texas House passes bill to require recycling o...,0.0,Texas House passes bill to require recycling o...,https://pv-magazine-usa.com/2025/05/01/texas-h...
2,renewableenergy,2025-04-17 15:56:10,"Wind, solar, and battery storage projects are ...",0.0,"Wind, solar, and battery storage projects are ...",https://yaleclimateconnections.org/2025/03/cle...
3,renewableenergy,2025-03-10 20:17:45,"Texas broke its solar, wind, and battery recor...",-0.4215,"Texas broke its solar, wind, and battery recor...",https://www.canarymedia.com/articles/clean-ene...
4,renewableenergy,2022-03-22 20:14:11,Texas has enough solar and wind planned to per...,0.0,Texas has enough solar and wind planned to per...,https://pv-magazine-usa.com/2022/03/22/solar-a...


In [25]:
df_copy = df.copy()
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subreddit        300 non-null    object        
 1   date             300 non-null    datetime64[ns]
 2   text             300 non-null    object        
 3   sentiment_score  300 non-null    float64       
 4   title            300 non-null    object        
 5   url              300 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 14.2+ KB


In [37]:
x = df_copy['sentiment_score'].unique().tolist()
y = [i for i in x if i>0]
y

[0.2732,
 0.34,
 0.4215,
 0.4019,
 0.8612,
 0.3612,
 0.7003,
 0.5719,
 0.8805,
 0.4939,
 0.0772,
 0.5859,
 0.5994,
 0.9212,
 0.9505,
 0.9994,
 0.9908,
 0.8442,
 0.8708,
 0.6369,
 0.2023,
 0.5106,
 0.3412,
 0.1531,
 0.25,
 0.7845,
 0.4404,
 0.6705,
 0.7184,
 0.8316,
 0.5267,
 0.6864,
 0.3818,
 0.3197,
 0.6486,
 0.9153,
 0.7227,
 0.8889,
 0.3182,
 0.6875,
 0.6249,
 0.6908,
 0.0516,
 0.7305,
 0.7,
 0.885,
 0.8979,
 0.6432,
 0.9573,
 0.6597,
 0.0258,
 0.9974,
 0.5396,
 0.1779,
 0.0754,
 0.8891,
 0.8206,
 0.7672,
 0.6124,
 0.743,
 0.6808,
 0.09,
 0.1027,
 0.8992,
 0.9201,
 0.8315,
 0.1933,
 0.8779,
 0.9633,
 0.7506,
 0.7783,
 0.5647,
 0.7263,
 0.3384,
 0.9886,
 0.5571,
 0.7906,
 0.4588,
 0.6416]

In [38]:
os.getcwd().rsplit("\\", 2)[0]
df_copy.to_csv(os.path.join(os.getcwd().rsplit("\\", 2)[0], r"data\raw", "reddit_sentiment_raw.csv"), index=False)

## Fetch News Sentiment:

In [103]:
from bs4 import BeautifulSoup
import requests

urls = r"https://www.texastribune.org/topics/energy/"
keywords = ["wind", "solar"]
analyzer = SentimentIntensityAnalyzer()


In [121]:
import re
from dateutil import parser
import pandas as pd
def fetch_news_sentiment(urls=["https://www.texastribune.org/topics/energy/"], keywords=["wind","renewable energy", "solar"], max_pages=20):
    try:
        analyzer = SentimentIntensityAnalyzer()
        data = []
        for url in urls:
            for page in range(1, max_pages + 1):
                paged_url = f"{url}?page={page}" if page > 1 else url
                response = requests.get(paged_url)
                soup = BeautifulSoup(response.text, 'html.parser')
                articles = soup.find_all('article')
                for article in articles:
                    text = article.get_text()
                    date_match = re.search(r'(JAN.|FEB.|MARCH|APRIL|MAY|JUNE|JULY|AUG.|SEP.|OCT.|NOV.|DEC.)[A-Z]*\.?\s+\d{1,2},\s+\d{4}', text)
                    if date_match:
                        try:
                            date = parser.parse(date_match.group())
                        except Exception:
                            date = pd.to_datetime('now')
                    else:
                        date = pd.to_datetime('now')
                    if (pd.Timestamp.now() - date).days <= 365*10:
                        if any(keyword.lower() in text.lower() for keyword in keywords):
                            sentiment = analyzer.polarity_scores(text)
                            data.append({
                                'date': date,
                                'text': text,
                                'sentiment_score': sentiment['compound']
                            })
        df = pd.DataFrame(data)
        if len(df) < 10:
            print("Warning: Less than 10 records found.")
        return df.head(10)
    except Exception as e:
        print(f"Error fetching news data: {e}")
        return pd.DataFrame()


In [134]:
import re
import requests
from bs4 import BeautifulSoup
from dateutil import parser
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def fetch_news_sentiment(
    urls=["https://www.texastribune.org/topics/energy/"],
    keywords=["wind", "renewable energy", "solar"],
    years_back=10
):
    try:
        analyzer = SentimentIntensityAnalyzer()
        data = []
        cutoff = pd.Timestamp.now() - pd.Timedelta(days=365 * years_back)

        for url in urls:
            page = 1
            stop_paging = False
            while not stop_paging:
                paged_url = f"{url}?page={page}" if page > 1 else url
                response = requests.get(paged_url)
                if not response.ok:
                    break

                soup = BeautifulSoup(response.text, 'html.parser')
                articles = soup.find_all('article')
                if not articles:
                    break

                for article in articles:
                    text = article.get_text()
                    # Find a date in text
                    date_match = re.search(r'(JAN\.|FEB\.|MARCH|APRIL|MAY|JUNE|JULY|AUG\.|SEP\.|OCT\.|NOV\.|DEC\.)[A-Z]*\.?\s+\d{1,2},\s+\d{4}', text)
                    if date_match:
                        try:
                            date = parser.parse(date_match.group())
                        except Exception:
                            date = pd.to_datetime('now')
                    else:
                        date = pd.to_datetime('now')

                    # Only analyze if date is within range
                    if date >= cutoff:
                        if any(keyword.lower() in text.lower() for keyword in keywords):
                            sentiment = analyzer.polarity_scores(text)
                            data.append({
                                'date': date,
                                'text': text,
                                'sentiment_score': sentiment['compound']
                            })
                    else:
                        # If ALL articles are old, we can stop after this page
                        stop_paging = True   # Will break outer while after for loop

                page += 1  # move to next page

        df = pd.DataFrame(data)
        return df
    except Exception as e:
        print(f"Error fetching news data: {e}")
        return pd.DataFrame()


In [135]:
df_news = fetch_news_sentiment()


In [136]:
df_news.head()
df_news.tail()

Unnamed: 0,date,text,sentiment_score
168,2025-07-22 20:34:51.914708,"\n\n\nThe Green Mile\n\n\n By Ben Philpott,...",0.5423
169,2025-07-22 20:34:51.915693,\n\n\nA Conversation with T. Boone Pickens\n\n...,-0.0772
170,2025-07-22 20:34:52.932165,\n\n\nWind in the Wires\n\n\n By Kate Galbr...,-0.0258
171,2025-07-22 20:34:52.933169,\n\n\nTribWeek: In Case You Missed It\n\n\n ...,0.9531
172,2025-07-22 20:34:52.935154,\n\n\nDon't Blow It\n\n\n By Kate Galbraith...,0.6757


In [137]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             173 non-null    datetime64[ns]
 1   text             173 non-null    object        
 2   sentiment_score  173 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 4.2+ KB


In [None]:
os.getcwd().rsplit("\\", 2)[0]
df_news.to_csv(os.path.join(os.getcwd().rsplit("\\", 2)[0], r"data\raw", "news_sentiment_raw.csv"), index=False)