<a href="https://colab.research.google.com/github/aarmintia/FTW-Capstone_Guide-to-Stock-Market/blob/main/Scraping_and_Sentiment_Analysis_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd

## Code for Scraping Inquirer for headlines

In [None]:
def scrape_headlines_for_date(date):
    """Scrape the BUSINESS headlines from the Inquirer article index for one day.

    Args:
        date: A ``date``/``datetime`` whose year, month and day select the
            article-index page to request.

    Returns:
        A list with the text of every ``<li>`` inside the first ``<ul>`` that
        follows the "BUSINESS" ``<h4>`` heading. An empty list is returned
        (after printing a message) when the request fails, the status code is
        not 200, or the page has no BUSINESS section.
    """
    url = f'https://www.inquirer.net/article-index/?d={date.year}-{date.strftime("%m")}-{date.strftime("%d")}'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}

    try:
        # A timeout keeps one stalled connection from hanging a multi-year scrape.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {date}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data for {date}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Not every index page has a BUSINESS section; find() returns None then.
    business_heading = soup.find('h4', string="BUSINESS")
    if business_heading is None:
        print(f"No BUSINESS section found for {date}.")
        return []

    headline_list = business_heading.find_next('ul')
    if headline_list is None:
        return []

    return [li.text for li in headline_list.find_all('li')]

### Data Cleaning

In [None]:
def cleanInquirerData(data):
    """Drop the last three whitespace-separated words from every string.

    The scraped list items end with a relative timestamp such as
    "5 years ago"; removing the final three words leaves the headline.

    Args:
        data: Iterable of strings.

    Returns:
        A new list with each string re-joined by single spaces after its
        last three words have been removed.
    """
    return [' '.join(entry.split()[:-3]) for entry in data]

In [None]:
def filterDataBasedOnKeywords(data, keywords):
    """Keep the strings that contain at least one keyword as a whole word.

    Matching is case-insensitive because both the strings and the keywords
    are lower-cased first; the returned strings are the lower-cased versions.

    Args:
        data: Iterable of strings to filter.
        keywords: Iterable of keyword strings (may contain spaces).

    Returns:
        A list of the lower-cased strings, in input order, in which any
        keyword appears delimited by regex word boundaries.
    """
    lowered_headlines = [text.lower() for text in data]
    # re.escape keeps keyword characters literal; \b enforces standalone words.
    patterns = [re.compile(rf'\b{re.escape(word.lower())}\b') for word in keywords]

    return [
        headline
        for headline in lowered_headlines
        if any(pattern.search(headline) for pattern in patterns)
    ]

### Data Transformation

In [None]:
def getHeadlinesAsDictionary(start_date, end_date, keywords):
    """Scrape, clean and keyword-filter headlines for every day in a range.

    Args:
        start_date: First day to process (inclusive).
        end_date: Last day to process (inclusive).
        keywords: Keywords passed to ``filterDataBasedOnKeywords``.

    Returns:
        A dict keyed by "YYYY-MM-DD" strings. Each value is a dict holding
        the filtered ``'headlines'`` list and an empty ``'sentiments'`` list.
    """
    results = {}
    day = start_date

    while day <= end_date:
        # Progress output: one line per processed day.
        print(day)
        day_key = f'{day.year}-{day.strftime("%m")}-{day.strftime("%d")}'

        # Scrape -> strip trailing timestamp words -> keep keyword matches.
        scraped = scrape_headlines_for_date(day)
        cleaned = cleanInquirerData(scraped)
        matching = filterDataBasedOnKeywords(cleaned, keywords)

        results[day_key] = {'headlines': matching, 'sentiments': []}

        day = day + timedelta(days=1)

    return results


In [None]:
import json

def convertToJson(data, filename):
    """Write ``data`` to ``filename`` as JSON, replacing any existing file.

    Args:
        data: A JSON-serializable object.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode="w") as json_file:
        json.dump(data, fp=json_file)

In [None]:
# Define Variables
start_date = datetime(2018, 1, 1)
end_date = datetime(2023, 11, 24)

# Ticker symbol -> search terms used to pick out that company's headlines.
# Terms are matched case-insensitively as whole words.
# NOTE(review): several short terms ('ac', 'ali', 'sm', 'gt', 'tel', 'mer',
# 'globe', 'nickel') are likely to match unrelated headlines — confirm that
# this is acceptable for the downstream sentiment scores.
keywords_dict = {
    'AC': ['ayala corp', 'ac'],
    'ACEN': ['acen', 'acen corporation', 'ac energy'],
    'AEV': ['AEV', 'aboitiz'],
    'AGI': ['agi', 'alliance global'],
    'ALI': ['ali', 'ayala land'],
    'BDO': ['bdo', 'unibank'],
    'BLOOM': ['bloomberry', 'bloom resort'],
    'BPI': ['bpi', 'bank of the philippine islands'],
    'CNPF': ['cnpf', 'century pacific'],
    'CNVRG': ['cnvrg', 'converge'],
    'DMC': ['dmc', 'dmci', 'consunji'],
    'EMI': ['emi', 'emperador'],
    'GLO': ['glo', 'globe'],
    'GTCAP': ['gtcap', 'gt', 'gt capital'],
    'ICT': ['ict', 'international container terminal', 'container terminal service'],
    'JFC': ['jfc', 'jollibee'],
    'JGS': ['jgs', 'jg summit'],
    'LTG': ['ltg', 'lt group'],
    'MBT': ['mbt', 'metrobank', 'metropolitan bank'],
    'MER': ['mer', 'manila electric', 'meralco'],
    'MONDE': ['monde', 'nissin'],
    'NIKL': ['nikl', 'nickel asia', 'nickel'],
    'PGOLD': ['pgold', 'puregold'],
    'SCC': ['scc', 'semirara'],
    'SM': ['sm', 'sm investments'],
    'SMC': ['smc', 'san miguel'],
    'SMPH': ['smph', 'sm prime'],
    'TEL': ['PLDT', 'tel'],
    'URC': ['urc', 'universal robina'],
    'WLCON': ['wlcon', 'wilcon'],
}

# Scrape and clean each day's headlines exactly once. The fetched page does
# not depend on the keywords, so requesting the full date range separately for
# every ticker would repeat the same HTTP requests once per ticker.
cleaned_headlines_by_date = {}
current_date = start_date
while current_date <= end_date:
    print(current_date)
    date_key = f'{current_date.year}-{current_date.strftime("%m")}-{current_date.strftime("%d")}'
    cleaned_headlines_by_date[date_key] = cleanInquirerData(scrape_headlines_for_date(current_date))
    current_date += timedelta(days=1)

# Filter the shared headlines per ticker, producing the same structure that
# getHeadlinesAsDictionary returns:
# {date_key: {'headlines': [...], 'sentiments': []}}
headlines_by_ticker = {
    ticker: {
        date_key: {
            'headlines': filterDataBasedOnKeywords(day_headlines, ticker_keywords),
            'sentiments': [],
        }
        for date_key, day_headlines in cleaned_headlines_by_date.items()
    }
    for ticker, ticker_keywords in keywords_dict.items()
}

# Per-ticker names used by the sentiment-analysis cells below.
data_ac = headlines_by_ticker['AC']
data_acen = headlines_by_ticker['ACEN']
data_aev = headlines_by_ticker['AEV']
data_agi = headlines_by_ticker['AGI']
data_ali = headlines_by_ticker['ALI']
data_bdo = headlines_by_ticker['BDO']
data_bloom = headlines_by_ticker['BLOOM']
data_bpi = headlines_by_ticker['BPI']
data_cnpf = headlines_by_ticker['CNPF']
data_cnvrg = headlines_by_ticker['CNVRG']
data_dmc = headlines_by_ticker['DMC']
data_emi = headlines_by_ticker['EMI']
data_glo = headlines_by_ticker['GLO']
data_gtcap = headlines_by_ticker['GTCAP']
data_ict = headlines_by_ticker['ICT']
data_jfc = headlines_by_ticker['JFC']
data_jgs = headlines_by_ticker['JGS']
data_ltg = headlines_by_ticker['LTG']
data_mbt = headlines_by_ticker['MBT']
data_mer = headlines_by_ticker['MER']
data_monde = headlines_by_ticker['MONDE']
data_nikl = headlines_by_ticker['NIKL']
data_pgold = headlines_by_ticker['PGOLD']
data_scc = headlines_by_ticker['SCC']
data_sm = headlines_by_ticker['SM']
data_smc = headlines_by_ticker['SMC']
data_smph = headlines_by_ticker['SMPH']
data_tel = headlines_by_ticker['TEL']
data_urc = headlines_by_ticker['URC']
data_wlcon = headlines_by_ticker['WLCON']

2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00
2023-11-24 00:00:00


## Sentiment Analysis Code

In [None]:
from transformers import pipeline

The pre-trained model used below is available at: https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

In [None]:
# Short dataset name -> per-date headline dict produced by the scraping step.
# The name is also used as the stem of the JSON file written below.
dataset_list = {'ac': data_ac, 'acen': data_acen, 'aev': data_aev, 'agi': data_agi,
            'ali': data_ali, 'bdo': data_bdo, 'bloom': data_bloom, 'bpi': data_bpi,
            'cnpf': data_cnpf, 'cnvrg': data_cnvrg, 'dmc': data_dmc, 'emi': data_emi,
            'glo': data_glo, 'gtcap': data_gtcap, 'ict': data_ict, 'jfc': data_jfc,
            'jgs': data_jgs, 'ltg': data_ltg, 'mbt': data_mbt, 'mer': data_mer,
            'monde': data_monde, 'nikl': data_nikl, 'pgold': data_pgold, 'scc': data_scc,
            'sm': data_sm, 'smc': data_smc, 'smph': data_smph, 'tel': data_tel,
            'urc': data_urc, 'wlcon': data_wlcon}

# Load the sentiment pipeline once and reuse it for every dataset.
specific_model = pipeline(model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

for name, data in dataset_list.items():
    for date_key, entry in data.items():
        print('Date: ' + date_key)
        headlines = entry['headlines']
        # Days without matching headlines need no model call; keep an empty list.
        entry['sentiments'] = specific_model(headlines) if headlines else []

    # Persist each dataset (headlines plus sentiments) as "<name>.json".
    output_filename = f"{name}.json"
    convertToJson(data, output_filename)


Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24
Date: 2023-11-24


In [None]:
# Short names of the datasets; each has a "<name>.json" file written by the
# sentiment-analysis step.
dataset_names = ['ac', 'acen', 'aev', 'agi',
                 'ali', 'bdo', 'bloom', 'bpi',
                 'cnpf', 'cnvrg', 'dmc', 'emi',
                 'glo', 'gtcap', 'ict', 'jfc',
                 'jgs', 'ltg', 'mbt', 'mer',
                 'monde', 'nikl', 'pgold', 'scc',
                 'sm', 'smc', 'smph', 'tel',
                 'urc', 'wlcon']

# Relative paths, matching where the JSON files were written, so this cell
# does not depend on the working directory being /content.
dataset_filenames = [f"{dataset}.json" for dataset in dataset_names]

# Initialize an empty list to store DataFrames
dfs = []

for filename in dataset_filenames:
    # Load JSON data from file (top-level keys become columns)
    with open(filename) as f:
        data = pd.read_json(f)

    # Transpose so each top-level key becomes a row
    df_transposed = data.transpose()

    # Convert the index to datetime; unparseable values become NaT
    df_transposed.index = pd.to_datetime(df_transposed.index, errors='coerce')

    # Extract 'label' and 'score' of the first entry in the 'sentiments' column.
    # NOTE(review): only the first headline's result is kept for each row; any
    # further entries in the list are ignored — confirm this is intended.
    df_transposed['Label'] = df_transposed['sentiments'].apply(lambda x: x[0]['label'] if isinstance(x, list) and len(x) > 0 else None)
    df_transposed['Score'] = df_transposed['sentiments'].apply(lambda x: x[0]['score'] if isinstance(x, list) and len(x) > 0 else None)

    # Drop the original 'sentiments' column
    df_transposed = df_transposed.drop(columns=['sentiments'])

    # Numeric encoding of the label; labels outside this mapping become NaN
    df_transposed['Label_Sentiment'] = df_transposed['Label'].map({'neutral': 0, 'positive': 1, 'negative': -1})

    # Extract the dataset name from the filename and make it uppercase
    dataset_name = filename.split('/')[-1].split('.')[0].upper()

    # Add the index as a leading 'Date' column, followed by a 'Name' column
    df_transposed.insert(0, 'Date', df_transposed.index)
    df_transposed.insert(1, 'Name', dataset_name)

    # Save to CSV with a unique filename
    output_filename = f"{dataset_name}_output.csv"
    df_transposed.to_csv(output_filename, index=False)

    # Append the DataFrame to the list
    dfs.append(df_transposed)

# Concatenate all DataFrames in the list into one DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_file_sentiments.csv', index=False)

In [None]:
# Print a summary of merged_df: columns, dtypes and non-null counts.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             30 non-null     datetime64[ns]
 1   Name             30 non-null     object        
 2   headlines        30 non-null     object        
 3   Label            1 non-null      object        
 4   Score            1 non-null      float64       
 5   Label_Sentiment  1 non-null      float64       
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 1.5+ KB
