In [19]:
import matplotlib.pyplot as plt
import numpy as np
import nltk
nltk.download('punkt')
import pandas as pd
import seaborn as sns
import spacy
from datetime import datetime
from nltk.corpus import stopwords
from nltk import tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Define a function to clean the text corpus

In [2]:
def clean_data(news_sentiments: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and preprocess the news data.

    This function takes a DataFrame 'news_sentiments' as input and performs the following data cleaning steps:
    
    1. Remove the publisher's name or source information from the beginning of the news Content if it exists.
    2. Replace newline characters with spaces to ensure proper text formatting.
    3. Remove backslashes ('\') as they are not needed in the text.
    4. Remove asterisks ('*') which may be used for formatting or emphasis.
    5. Remove double hyphens followed by a space ('-- ') which might be formatting artifacts.
    6. Remove single hyphens ('-') that do not provide meaningful information.
    7. Remove en dashes ('–') which may not be relevant in the text.
    8. Remove specific stock market symbols such as "(NYSE: )", "(NS: )", and "(NASDAQ: )" which are often not needed in text analysis.

    Parameters:
    - news_sentiments (DataFrame): A DataFrame containing news data with 'Source' and 'Content' columns.

    Returns:
    - Cleaned DataFrame: A DataFrame with the same structure as 'news_sentiments' after cleaning.

    """
    for i in range(len(news_sentiments)):
        # Check if 'Source' text is in 'Content'
        if news_sentiments['Source'][i] in news_sentiments['Content'][i]:
            # Remove 'Source' text from 'Content' and strip leading whitespace
            news_sentiments['Content'][i] = news_sentiments['Content'][i].replace(news_sentiments['Source'][i], '').lstrip()
        
        # Replace newline characters with spaces
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace('\n', ' ')
        
        # Remove backslashes
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("\\", '')
        
        # Remove asterisks
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("*", '')
        
        # Remove double hyphens followed by a space
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("-- ", '')
        
        # Remove single hyphens
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("-", '')
        
        # Remove en dashes
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("–", '')
        
        # Remove specific stock market symbols
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("(NYSE: )", '')
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("(NS: )", '')
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("©2021 Bloomberg L.P.)" , '' )
        news_sentiments['Content'][i] = news_sentiments['Content'][i].replace("(NASDAQ: )", '')
    
    return news_sentiments

Now, we'll try to shorten the content of the news articles to keep it relevant to sentences that mention the stock names.

In [3]:
def reduce_content_by_stock_name(dataframe: pd.DataFrame,
                                 stock_name: str,
                                 input_column: str,
                                 output_column: str) -> pd.DataFrame:
    """
    Reduce the content of news articles by selecting sentences that mention a specific stock name.

    Args:
    dataframe (pd.DataFrame): The DataFrame containing the news articles.
    stock_name (str): The stock name to identify relevant sentences.
    input_column (str): The name of the input column containing the news article content.
    output_column (str): The name of the output column to store the reduced content.

    Returns:
    pd.DataFrame: The DataFrame with the 'output_column' containing reduced news article content.
    """
    dataframe[output_column] = dataframe[input_column]  # Create a new column for reduced news article content
    for i in range(len(dataframe)):
        selected_sentences = []
        sentences = tokenize.sent_tokenize(dataframe[input_column][i])  # Tokenize the content into sentences
        for sentence in sentences:
            if stock_name in sentence:
                selected_sentences.append(sentence)  # Select sentences containing the stock name
        dataframe[output_column][i] = ' '.join(selected_sentences)  # Join selected sentences into reduced content
    return dataframe

In [30]:
def analyze_stock_news_sentiment(news_data: pd.DataFrame,
                                 stock_keywords: [str, list]) -> (pd.DataFrame,
                                                                  pd.DataFrame):
    """
    Analyzes the sentiment of news articles for specified stock(s).

    This function takes a DataFrame containing news articles and a stock keyword (or list of keywords).
    It filters sentences in the articles that are relevant to the specified stock(s), then performs sentiment analysis
    on this filtered content using VADER and TextBlob. The function returns the average sentiment scores by date
    and a combined DataFrame with aggregated headlines, content, and calculated sentiment scores.

    Parameters:
    news_data (pd.DataFrame): A DataFrame where each row represents a news article. Must contain the columns 'content', 'Headlines', and 'Date'.
    stock_keywords (str or list): A string or a list of strings representing the stock name(s) to filter the news content.

    Returns:
    tuple: A tuple containing two DataFrames:
           1. DataFrame with average sentiment scores by date.
           2. Combined DataFrame with aggregated headlines, content, and calculated sentiment scores.
    """

    nlp = spacy.load("en_core_web_md")
    sid = SentimentIntensityAnalyzer()

    # Check if stock_keywords is a string or list and convert to list if necessary
    if isinstance(stock_keywords, str):
        stock_keywords = [stock_keywords]

    # Tokenizing and filtering relevant sentences for the given stock(s)
    def filter_stock_sentences(text):
        return ' '.join([sent.text for sent in nlp(text).sents if any(stock_keyword in sent.text for stock_keyword in stock_keywords)])

    news_data['Content_reduced'] = news_data['Content'].apply(filter_stock_sentences)

    # Sentiment Analysis
    def get_sentiment_score(text):
        vader_score = sid.polarity_scores(text)['compound']
        textblob_score = TextBlob(text).sentiment.polarity
        return vader_score if vader_score != 0 else textblob_score

    news_data['sent_score'] = news_data['Content_reduced'].apply(get_sentiment_score)

    # Group by Date and Average Sentiment
    sent_mean = news_data.groupby('Date')['sent_score'].mean()

    # Convert all items to strings and handle NaN/None values
    news_data['Headline'] = news_data['Headline'].fillna('').astype(str)
    news_data['Content_reduced'] = news_data['Content_reduced'].fillna('').astype(str)

    # Combine Headlines and Content
    combined_data = news_data.groupby('Date').agg({'Headline': ' '.join, 'Content_reduced': ' '.join})
    combined_data['combined_sentiment'] = combined_data.apply(lambda row: get_sentiment_score(row['Headline'] + ' ' + row['Content_reduced']), axis=1)

    return sent_mean, combined_data

Now, let us apply all the three functions we defined above to our five stocks to generate sentiment scores.

Read the news analysis dataframe for each of the five stocks

In [5]:
# Read news data for meta extracted using web scraping
news_sentiments_meta = pd.read_csv('./data/news_data_facebook-inc-news_50.csv')
news_sentiments_meta.head()

Unnamed: 0,Date,Source,Headline,Link,Content
0,2023-12-17,IANS,Zuckerberg building $100 mn top-secret propert...,https://in.investing.com/news/zuckerberg-build...,"San Francisco, Dec 17 (IANS) Meta (NASDAQ: ) F..."
1,2023-12-16,IANS,Insta head Adam Mosseri reveals Threads’ 2024 ...,https://in.investing.com/news/insta-head-adam-...,"New Delhi, Dec 16 (IANS) Instagram head Adam M..."
2,2023-12-16,IANS,"Meta Quest users can now use Microsoft Word, E...",https://in.investing.com/news/meta-quest-users...,"San Francisco, Dec 16 (IANS) Microsoft (NASDAQ..."
3,2023-12-15,IANS,US reaches H-1B visa cap for FY24,https://in.investing.com/news/us-reaches-h1b-v...,"Washington, Dec 15 (IANS) The US Citizenship a..."
4,2023-12-14,IANS,FBI focuses on 'elevated' risk of outside infl...,https://in.investing.com/news/fbi-focuses-on-e...,"Washington, Dec 14 (IANS) The Federal Bureau o..."


In [6]:
stock_data_meta = pd.read_csv('./data/META.csv')
stock_data_meta.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,27.440001,28.18,27.42,28.0,28.0,69846400
1,2013-01-03,27.879999,28.469999,27.59,27.77,27.77,63140600
2,2013-01-04,28.01,28.93,27.83,28.76,28.76,72715400
3,2013-01-07,28.690001,29.790001,28.65,29.42,29.42,83781800
4,2013-01-08,29.51,29.6,28.860001,29.059999,29.059999,45871300


In [7]:
# Read news data for amazon extracted using web scraping
news_sentiments_amazon = pd.read_csv('./data/news_data_amazon-com-inc-news_50.csv')
news_sentiments_amazon.head()

Unnamed: 0,Date,Source,Headline,Link,Content
0,"Dec 18, 2023 00:00",Investing.com,This 'Magnificent 7' stock is the only mega-ca...,https://in.investing.com/news/pro/this-magnifi...,"(Updated - December 18, 2023 6:19 AM EST)\n\nR..."
1,"Dec 18, 2023 00:00",Investing.com,"Amazon.com PT Raised to $180 at Roth/MKM, Name...",https://in.investing.com/news/pro/amazon-com-i...,An analyst from Roth/MKM maintained Amazon.com...
2,2023-12-15,IANS,US reaches H-1B visa cap for FY24,https://in.investing.com/news/us-reaches-h1b-v...,"Washington, Dec 15 (IANS) The US Citizenship a..."
3,2023-12-15,IANS,‘Project Kuiper’ internet satellites will oper...,https://in.investing.com/news/project-kuiper-i...,"San Francisco, Dec 15 (IANS) Amazon (NASDAQ: )..."
4,2023-12-14,Investing.com,Amazon and 2 others named top internet picks a...,https://in.investing.com/news/pro/amazon-and-2...,Piper Sandler updated its top picks across the...


In [8]:
stock_data_amazon = pd.read_csv('./data/AMZN.csv')
stock_data_amazon.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,12.804,12.905,12.663,12.8655,12.8655,65420000
1,2013-01-03,12.8635,13.044,12.8185,12.924,12.924,55018000
2,2013-01-04,12.879,12.99,12.8325,12.9575,12.9575,37484000
3,2013-01-07,13.1485,13.4865,13.1335,13.423,13.423,98200000
4,2013-01-08,13.3535,13.449,13.1785,13.319,13.319,60214000


In [9]:
# Read news data for apple extracted using web scraping
news_sentiments_apple = pd.read_csv('./data/news_data_apple-computer-inc-news_50.csv')
news_sentiments_apple.head()

Unnamed: 0,Date,Source,Headline,Link,Content
0,"Dec 18, 2023 00:00",Investing.com,Apple to to halt US sales of Watch Series 9 an...,https://in.investing.com/news/apple-to-to-halt...,Apple (NASDAQ: ) is halting sales of its flags...
1,"Dec 18, 2023 00:00",Investing.com,Apple stock slips on report China is ramping u...,https://in.investing.com/news/apple-stock-slip...,"(Updated - December 18, 2023 4:32 AM EST)\n\nA..."
2,"Dec 18, 2023 00:00",IANS,Google to end ‘geofence warrant’ requests for ...,https://in.investing.com/news/google-to-end-ge...,"New Delhi, Dec 18 (IANS) Google (NASDAQ: ) has..."
3,"Dec 18, 2023 00:00",IANS,Apple to let app developers bundle subscriptio...,https://in.investing.com/news/apple-to-let-app...,"San Francisco, Dec 18 (IANS) Apple (NASDAQ: ) ..."
4,"Dec 18, 2023 00:00",IANS,HK media mogul Jimmy Lai's trial begins,https://in.investing.com/news/hk-media-mogul-j...,"Hong Kong, Dec 18 (IANS) The trial of Hong Kon..."


In [10]:
stock_data_apple = pd.read_csv('./data/AAPL.csv')
stock_data_apple.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,19.779285,19.821428,19.343929,19.608213,16.769093,560518000
1,2013-01-03,19.567142,19.631071,19.321428,19.360714,16.55743,352965200
2,2013-01-04,19.1775,19.236786,18.779642,18.821428,16.096228,594333600
3,2013-01-07,18.642857,18.90357,18.4,18.710714,16.001543,484156400
4,2013-01-08,18.900356,18.996071,18.616072,18.76107,16.044607,458707200


In [11]:
# Read news data for Netflix extracted using web scraping
news_sentiments_netflix = pd.read_csv('./data/news_data_netflix,-inc.-news_50.csv')
news_sentiments_netflix.head()

Unnamed: 0,Date,Source,Headline,Link,Content
0,"Dec 18, 2023 00:00",Investing.com,Netflix shares rally as Morgan Stanley ups pri...,https://in.investing.com/news/netflix-shares-r...,Netflix (NASDAQ: ) shares jumped more than 3% ...
1,"Dec 18, 2023 00:00",Investing.com,Morgan Stanley maintains Netflix at 'overweigh...,https://in.investing.com/news/pro/netflix-inc-...,An analyst from Morgan Stanley maintained Netf...
2,2023-12-13,Investing.com,"Midday movers: Pfizer, Etsy, and more",https://in.investing.com/news/pfizer-southwest...,"(Updated - December 13, 2023 12:03 PM EST)\n\n..."
3,2023-12-12,Investing.com,Netflix : we will publish What We Watched: A N...,https://in.investing.com/news/assorted/netflix...,Netflix (NASDAQ: ) announces:\n\nSince launchi...
4,2023-12-13,Investing.com,"Midday movers: Pfizer, Etsy, and more",https://in.investing.com/news/pfizer-southwest...,"(Updated - December 13, 2023 12:03 PM EST)\n\n..."


In [12]:
stock_data_netflix = pd.read_csv('./data/NFLX.csv')
stock_data_netflix.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,13.601429,13.687143,12.955714,13.144286,13.144286,19431300
1,2013-01-03,13.138571,13.988571,13.075714,13.798571,13.798571,27912500
2,2013-01-04,13.791429,13.958571,13.648571,13.711429,13.711429,17761100
3,2013-01-07,13.77,14.535714,13.731429,14.171429,14.171429,45550400
4,2013-01-08,14.287143,14.427143,13.828571,13.88,13.88,24714900


In [13]:
# Read news data for Google extracted using web scraping
news_sentiments_google = pd.read_csv('./data/news_data_google-inc-c-news_50.csv')
news_sentiments_google.head()

Unnamed: 0,Date,Source,Headline,Link,Content
0,"Dec 18, 2023 00:00",IANS,Google to end ‘geofence warrant’ requests for ...,https://in.investing.com/news/google-to-end-ge...,"New Delhi, Dec 18 (IANS) Google (NASDAQ: ) has..."
1,2023-12-16,IANS,'I am all over the place': Rajeev Chandrasekha...,https://in.investing.com/news/i-am-all-over-th...,"New Delhi, Dec 16 (IANS) Union Minister of Sta..."
2,2023-12-16,IANS,US news publisher sues Google for 'siphoning o...,https://in.investing.com/news/us-news-publishe...,"San Francisco, Dec 16 (IANS) A US-based news p..."
3,2023-12-16,IANS,Not the right way to do it: Pichai on laying o...,https://in.investing.com/news/not-the-right-wa...,"San Francisco, Dec 16 (IANS) After almost a ye..."
4,2023-12-15,IANS,Google introduces more repair features for Pix...,https://in.investing.com/news/google-introduce...,"New Delhi, Dec 15 (IANS) Google (NASDAQ: ) has..."


In [14]:
stock_data_goog = pd.read_csv('./data/GOOG.csv')
stock_data_goog.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,18.003504,18.193193,17.931683,18.099348,18.099348,101550348
1,2013-01-03,18.141392,18.316566,18.036036,18.109859,18.109859,92635272
2,2013-01-04,18.251753,18.555305,18.210211,18.467718,18.467718,110429460
3,2013-01-07,18.404655,18.503002,18.282784,18.387136,18.387136,66161772
4,2013-01-08,18.406906,18.425926,18.12888,18.350851,18.350851,66976956


Now, we clean the news dataframe for all the five stocks

In [15]:
news_sentiments_amazon  = clean_data(news_sentiments_amazon)
news_sentiments_apple = clean_data(news_sentiments_apple)
news_sentiments_meta = clean_data(news_sentiments_meta)
news_sentiments_netflix = clean_data(news_sentiments_netflix)
news_sentiments_google = clean_data(news_sentiments_google)

Now, we will reduce the content field for each stock

In [20]:
news_sentiments_amazon = reduce_content_by_stock_name(
    news_sentiments_amazon,
    'Amazon',
    'Content',
    'Content_reduced')

In [22]:
news_sentiments_apple = reduce_content_by_stock_name(
    news_sentiments_apple,
    'Apple',
    'Content',
    'Content_reduced')

In [23]:
news_sentiments_meta = reduce_content_by_stock_name(
    news_sentiments_meta,
    'Meta',
    'Content',
    'Content_reduced')

In [24]:
news_sentiments_netflix = reduce_content_by_stock_name(
    news_sentiments_netflix,
    'Netflix',
    'Content',
    'Content_reduced')

In [25]:
news_sentiments_google = reduce_content_by_stock_name(
    news_sentiments_google,
    'Google',
    'Content',
    'Content_reduced')

Extract sentiments for apple

In [31]:
apple_sentiment, apple_sentiment_mean = analyze_stock_news_sentiment(news_sentiments_apple,
                                                                     'Apple')
apple_sentiment, apple_sentiment_mean

(Date
 2023-05-31            0.044475
 2023-06-01            0.136375
 2023-06-02            0.000000
 2023-06-04            0.452100
 2023-06-05            0.375133
                         ...   
 2023-12-11            0.000000
 2023-12-12            0.459300
 2023-12-14            0.000000
 2023-12-15            0.000000
 Dec 18, 2023 00:00    0.000000
 Name: sent_score, Length: 173, dtype: float64,
                                                              Headline  \
 Date                                                                    
 2023-05-31          Retail sector weighed by economic uncertainty ...   
 2023-06-01          '2000 Deja Vu': Fred Hickey says AI rally is n...   
 2023-06-02          Hundreds join Amazon walkout at HQ over return...   
 2023-06-04          Top 5 things to watch in markets in the week a...   
 2023-06-05          Stock market today: Dow ends lower as Apple cu...   
 ...                                                               ...   
 2

In [34]:
apple_sentiment_mean

Unnamed: 0_level_0,Headline,Content_reduced,combined_sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-05-31,Retail sector weighed by economic uncertainty ...,It's just the ninth company globally to have...,-0.2023
2023-06-01,'2000 Deja Vu': Fred Hickey says AI rally is n...,"He sees the ‘Big Seven’ Apple , Microsoft , A...",-0.2500
2023-06-02,Hundreds join Amazon walkout at HQ over return...,,0.4404
2023-06-04,Top 5 things to watch in markets in the week a...,"The combined weight of five stocks Apple , Mi...",0.9712
2023-06-05,Stock market today: Dow ends lower as Apple cu...,"Investing.com The Dow closed lower Monday, as ...",0.9750
...,...,...,...
2023-12-11,AWS India's interim head Vaishali Kasture quit...,,0.2960
2023-12-12,Amazon's genAI 'Review Highlights' lets users ...,"New Delhi, Dec 12 (IANS) Allen Career Institu...",0.9776
2023-12-14,Amazon and 2 others named top internet picks a...,,0.7650
2023-12-15,US reaches H-1B visa cap for FY24 ‘Project Kui...,,0.7783


In [38]:
amazon_sentiment, amazon_sentiment_mean = analyze_stock_news_sentiment(news_sentiments_apple,
                                                                       'Amazon')
amazon_sentiment, amazon_sentiment_mean

(Date
 2023-05-31            0.587525
 2023-06-01            0.064250
 2023-06-02            0.609980
 2023-06-04            0.802250
 2023-06-05            0.223411
                         ...   
 2023-12-11            0.340000
 2023-12-12            0.636000
 2023-12-14            0.732700
 2023-12-15            0.860650
 Dec 18, 2023 00:00    0.738150
 Name: sent_score, Length: 173, dtype: float64,
                                                              Headline  \
 Date                                                                    
 2023-05-31          Retail sector weighed by economic uncertainty ...   
 2023-06-01          '2000 Deja Vu': Fred Hickey says AI rally is n...   
 2023-06-02          Hundreds join Amazon walkout at HQ over return...   
 2023-06-04          Top 5 things to watch in markets in the week a...   
 2023-06-05          Stock market today: Dow ends lower as Apple cu...   
 ...                                                               ...   
 2

In [37]:
meta_sentiment, meta_sentiment_mean = analyze_stock_news_sentiment(news_sentiments_apple,
                                                                       'Meta')
meta_sentiment, meta_sentiment_mean

(Date
 2023-05-31            0.00000
 2023-06-01            0.01290
 2023-06-02            0.00000
 2023-06-04            0.00000
 2023-06-05            0.00000
                        ...   
 2023-12-11            0.00000
 2023-12-12            0.39820
 2023-12-14            0.00000
 2023-12-15            0.00000
 Dec 18, 2023 00:00    0.08895
 Name: sent_score, Length: 173, dtype: float64,
                                                              Headline  \
 Date                                                                    
 2023-05-31          Retail sector weighed by economic uncertainty ...   
 2023-06-01          '2000 Deja Vu': Fred Hickey says AI rally is n...   
 2023-06-02          Hundreds join Amazon walkout at HQ over return...   
 2023-06-04          Top 5 things to watch in markets in the week a...   
 2023-06-05          Stock market today: Dow ends lower as Apple cu...   
 ...                                                               ...   
 2023-12-11  

In [36]:
netflix_sentiment, netflix_sentiment_mean = analyze_stock_news_sentiment(news_sentiments_apple,
                                                                       'Netflix')
netflix_sentiment, netflix_sentiment_mean

(Date
 2023-05-31            0.0
 2023-06-01            0.0
 2023-06-02            0.0
 2023-06-04            0.0
 2023-06-05            0.0
                      ... 
 2023-12-11            0.0
 2023-12-12            0.0
 2023-12-14            0.0
 2023-12-15            0.0
 Dec 18, 2023 00:00    0.0
 Name: sent_score, Length: 173, dtype: float64,
                                                              Headline  \
 Date                                                                    
 2023-05-31          Retail sector weighed by economic uncertainty ...   
 2023-06-01          '2000 Deja Vu': Fred Hickey says AI rally is n...   
 2023-06-02          Hundreds join Amazon walkout at HQ over return...   
 2023-06-04          Top 5 things to watch in markets in the week a...   
 2023-06-05          Stock market today: Dow ends lower as Apple cu...   
 ...                                                               ...   
 2023-12-11          AWS India's interim head Vaishali Ka

In [35]:
google_sentiment, google_sentiment_mean = analyze_stock_news_sentiment(news_sentiments_apple,
                                                                       'Google')
google_sentiment, google_sentiment_mean

(Date
 2023-05-31            0.06946
 2023-06-01            0.00000
 2023-06-02            0.00000
 2023-06-04            0.45210
 2023-06-05            0.00000
                        ...   
 2023-12-11            0.00000
 2023-12-12            0.39820
 2023-12-14            0.00000
 2023-12-15            0.37150
 Dec 18, 2023 00:00    0.00000
 Name: sent_score, Length: 173, dtype: float64,
                                                              Headline  \
 Date                                                                    
 2023-05-31          Retail sector weighed by economic uncertainty ...   
 2023-06-01          '2000 Deja Vu': Fred Hickey says AI rally is n...   
 2023-06-02          Hundreds join Amazon walkout at HQ over return...   
 2023-06-04          Top 5 things to watch in markets in the week a...   
 2023-06-05          Stock market today: Dow ends lower as Apple cu...   
 ...                                                               ...   
 2023-12-11  

In [39]:
google_sentiment_mean

Unnamed: 0_level_0,Headline,Content_reduced,combined_sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-05-31,Retail sector weighed by economic uncertainty ...,"That's the kind of AI underlying ChatGPT, Go...",-0.3612
2023-06-01,'2000 Deja Vu': Fred Hickey says AI rally is n...,,-0.6597
2023-06-02,Hundreds join Amazon walkout at HQ over return...,,0.4404
2023-06-04,Top 5 things to watch in markets in the week a...,"The combined weight of five stocks Apple , Mi...",0.9712
2023-06-05,Stock market today: Dow ends lower as Apple cu...,,0.1531
...,...,...,...
2023-12-11,AWS India's interim head Vaishali Kasture quit...,,0.2960
2023-12-12,Amazon's genAI 'Review Highlights' lets users ...,"Over the last six months, Allen has attracted...",0.9349
2023-12-14,Amazon and 2 others named top internet picks a...,,0.7650
2023-12-15,US reaches H-1B visa cap for FY24 ‘Project Kui...,An estimated 75 per cent of the H1B visa grant...,0.9607


In [59]:
stock_data_goog.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [60]:
google_sentiment_mean.head()

Unnamed: 0_level_0,Headline,Content_reduced,combined_sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-05-31,Retail sector weighed by economic uncertainty ...,"That's the kind of AI underlying ChatGPT, Go...",-0.3612
2023-06-01,'2000 Deja Vu': Fred Hickey says AI rally is n...,,-0.6597
2023-06-02,Hundreds join Amazon walkout at HQ over return...,,0.4404
2023-06-04,Top 5 things to watch in markets in the week a...,"The combined weight of five stocks Apple , Mi...",0.9712
2023-06-05,Stock market today: Dow ends lower as Apple cu...,,0.1531


In [61]:
google_sentiment_mean['Date of news'] = google_sentiment_mean.index

In [63]:
google_sentiment_mean['Date of news']

Date
2023-05-31                    2023-05-31
2023-06-01                    2023-06-01
2023-06-02                    2023-06-02
2023-06-04                    2023-06-04
2023-06-05                    2023-06-05
                             ...        
2023-12-11                    2023-12-11
2023-12-12                    2023-12-12
2023-12-14                    2023-12-14
2023-12-15                    2023-12-15
Dec 18, 2023 00:00    Dec 18, 2023 00:00
Name: Date of news, Length: 173, dtype: object

In [65]:
import pandas as pd
import re

# Function to standardize date format
def standardize_date(date_str):
    # Check if the date string matches the "YYYY-MM-DD" format using regular expression
    if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
        return date_str  # Leave it as is
    else:
        # Try to convert the date string to datetime format
        try:
            return pd.to_datetime(date_str).strftime('%Y-%m-%d')
        except:
            return date_str  # Return the original value if unable to convert

# Apply the standardize_date function to the 'Date' column
google_sentiment_mean['Date of news'] = google_sentiment_mean['Date of news'].apply(standardize_date)

print(google_sentiment_mean['Date of news'].head())


Date
2023-05-31    2023-05-31
2023-06-01    2023-06-01
2023-06-02    2023-06-02
2023-06-04    2023-06-04
2023-06-05    2023-06-05
Name: Date of news, dtype: object


In [73]:
google_sentiment_mean.head()

Unnamed: 0_level_0,Headline,Content_reduced,combined_sentiment,Date of news
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-05-31,Retail sector weighed by economic uncertainty ...,"That's the kind of AI underlying ChatGPT, Go...",-0.3612,2023-05-31
2023-06-01,'2000 Deja Vu': Fred Hickey says AI rally is n...,,-0.6597,2023-06-01
2023-06-02,Hundreds join Amazon walkout at HQ over return...,,0.4404,2023-06-02
2023-06-04,Top 5 things to watch in markets in the week a...,"The combined weight of five stocks Apple , Mi...",0.9712,2023-06-04
2023-06-05,Stock market today: Dow ends lower as Apple cu...,,0.1531,2023-06-05


In [67]:
google_sentiment_mean['Date of news'] = pd.to_datetime(google_sentiment_mean['Date of news'])


In [69]:
stock_data_goog['Date'] =  pd.to_datetime(stock_data_goog['Date'])

In [74]:
stock_data_goog_with_sentiment = pd.merge(stock_data_goog, google_sentiment_mean[['Date of news','combined_sentiment' ]],  left_on='Date', right_on='Date of news', how='left')

In [76]:
stock_data_goog_with_sentiment['combined_sentiment'].fillna(0, inplace=True)

In [84]:
google_sentiment_mean['Date of news'].min()

Timestamp('2023-05-31 00:00:00')