<a href="https://colab.research.google.com/github/alexandergribenchenko/Test_R5_DE/blob/main/POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
from bs4 import BeautifulSoup
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import pandas as pd
import time,os,requests,nltk
from googletrans import Translator
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import sqlite3

# Fetch the NLTK data packages this script relies on (tokenizer models,
# stop-word lists and WordNet), in the same order as before.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

def fetch_news_headlines(url):
    """Scrape headline links from a news listing page.

    Downloads ``url`` and collects every ``<a class="Card-title">`` element
    found in the returned HTML.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of dicts, one per headline, with the keys ``'headline'``
        (anchor text with surrounding whitespace stripped), ``'url'`` (the
        anchor's ``href``) and ``'timestamp'`` (local time at which the entry
        was recorded, formatted ``'%Y-%m-%d %H:%M:%S'``). Anchors without an
        ``href`` are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the job forever, and
    # raise_for_status() prevents an error page from being parsed as
    # "the site has no headlines".
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    headlines_data = []
    for article in soup.find_all('a', class_='Card-title'):
        headline = article.get_text(strip=True)
        # The tag already exposes its attributes; re-parsing it is unnecessary.
        headline_url = article.get('href')
        if not headline_url:
            # An anchor without a link target carries nothing worth storing.
            continue
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        print(f'Fetching data for {headline[:25]}...')
        headlines_data.append({'headline': headline, 'url': headline_url, 'timestamp': timestamp})
    return headlines_data

def analyze_sentiment(headlines_data):
    """Compute a TextBlob polarity score for every headline.

    Args:
        headlines_data: Iterable of dicts, each holding the headline text
            under the ``'headline'`` key.

    Returns:
        A list with one polarity score per input entry, in input order.
    """
    return [
        TextBlob(entry['headline']).sentiment.polarity
        for entry in headlines_data
    ]


def fetch_stock_data(api_key, symbol, time_series='TIME_SERIES_MONTHLY'):
    """Download a price time series for one symbol from Alpha Vantage.

    Args:
        api_key: Alpha Vantage API key.
        symbol: Ticker symbol to query, e.g. ``'AAPL'``.
        time_series: Value for the API's ``function`` query parameter.

    Returns:
        A DataFrame with one row per date (``DatetimeIndex``) and one column
        per field of the API payload (for the monthly series: ``'1. open'``,
        ``'2. high'``, ``'3. low'``, ``'4. close'``, ``'5. volume'``). Cell
        values are left exactly as decoded from the JSON response.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        KeyError: If the response contains no time-series section (for
            example an API error or rate-limit message); the message includes
            the payload that was received.
    """
    print(f'Fetching data for {symbol}')
    # Let requests build and encode the query string, and never wait forever.
    response = requests.get(
        'https://www.alphavantage.co/query',
        params={'function': time_series, 'symbol': symbol, 'apikey': api_key},
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()
    # The name of the data section depends on the requested function
    # ('Monthly Time Series', 'Weekly Time Series', 'Time Series (Daily)', ...),
    # so look it up instead of hard-coding the monthly one.
    series_key = next((key for key in data if 'Time Series' in key), None)
    if series_key is None:
        # KeyError keeps the exception type the hard-coded lookup used to raise.
        raise KeyError(f'No time series in Alpha Vantage response for {symbol}: {data}')
    stock_data = pd.DataFrame(data[series_key]).T
    stock_data.index = pd.to_datetime(stock_data.index)
    return stock_data

def translate_to_spanish(text):
    """Translate ``text`` into Spanish.

    Uses the module-level ``translator`` object, which must exist before this
    function is called. Prints the first 25 characters of the input first.

    Args:
        text: String to translate.

    Returns:
        The translated text.
    """
    preview = text[:25]
    print(f'Translating {preview}... to Spanish')
    translation = translator.translate(text, dest='es')
    return translation.text

def translate_to_italian(text):
    """Translate ``text`` into Italian.

    Uses the module-level ``translator`` object, which must exist before this
    function is called. Prints the first 25 characters of the input first.

    Args:
        text: String to translate.

    Returns:
        The translated text.
    """
    preview = text[:25]
    print(f'Translating {preview}... to Italian')
    translation = translator.translate(text, dest='it')
    return translation.text

def extract_relevant_words(text):
    """Tokenize ``text``, drop English stop words and lemmatize the rest.

    Stop-word matching is case-insensitive; the surviving tokens keep their
    original casing. Tokens are not filtered by any other criterion, so
    punctuation tokens produced by the tokenizer remain in the result.

    Args:
        text: String to process.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    print(f'Extracting relevant words from {text[:25]}...')
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Filter and lemmatize in one pass over the tokens.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.lower() not in english_stop_words
    ]

def create_folder(folder_name):
    """Ensure the directory ``folder_name`` exists and return its path.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Args:
        folder_name: Path of the directory to create.

    Returns:
        ``folder_name`` unchanged, so the call can be embedded in a path
        expression.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of testing
    # os.path.exists() first: another process creating the directory in
    # between no longer makes makedirs() fail.
    os.makedirs(folder_name, exist_ok=True)
    return folder_name

# Page scraped for headlines.
url = 'https://www.cnbc.com/stocks/'
# Alpha Vantage API key. Supply it through the ALPHAVANTAGE_API_KEY environment
# variable; the literal fallback only keeps existing runs working.
# NOTE(review): a key committed to source control should be treated as leaked
# and rotated, after which this fallback can be removed.
api_key = os.environ.get('ALPHAVANTAGE_API_KEY', 'SZLYEM0FZ8OIMCV3')
# Ticker symbols whose price history is downloaded.
symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META']

if __name__ == '__main__':
    # Entry point: scrape the headlines, enrich them (sentiment, relevant
    # words, translations), download the stock series, and write both
    # datasets to CSV files under ./data.

    print('-----------ETL Project-----------')
    # Kept as a module-level name on purpose: translate_to_spanish() and
    # translate_to_italian() read this global.
    translator = Translator()
    print('-----------CNBC Headlines-----------')
    headlines_data = fetch_news_headlines(url)
    sentiment_scores = analyze_sentiment(headlines_data)

    # Attach each sentiment score to its headline record
    for headline_data, sentiment_score in zip(headlines_data, sentiment_scores):
        headline_data['sentiment_score'] = sentiment_score

    # Naming the columns explicitly keeps them present even when nothing was
    # scraped; an empty frame built from [] alone has no columns, and the
    # lookups below would fail with KeyError.
    headlines_df = pd.DataFrame(
        headlines_data,
        columns=['headline', 'url', 'timestamp', 'sentiment_score'],
    )
    # Bucket the polarity score: >= 0.2 positive, <= -0.2 negative, else neutral
    headlines_df['sentiment'] = headlines_df['sentiment_score'].apply(
        lambda x: 'positive' if x >= 0.2 else 'negative' if x <= -0.2 else 'neutral'
    )
    headlines_df['relevant_words'] = headlines_df['headline'].apply(extract_relevant_words)
    # Create new translated columns
    headlines_df['headline_es'] = headlines_df['headline'].apply(translate_to_spanish)
    headlines_df['headline_it'] = headlines_df['headline'].apply(translate_to_italian)

    # Save the headlines DataFrame to a CSV file
    headlines_data_path = './data/headlines'
    headlines_df.to_csv(f'{create_folder(headlines_data_path)}/headlines_data.csv', index=False)

    print('-----------Stocks-----------')
    fetched_data_dfs = []
    for symbol in symbols:
        # Best effort per symbol: one failing download must not abort the rest.
        try:
            stock_data = fetch_stock_data(api_key, symbol)
            stock_data['symbol'] = symbol
            fetched_data_dfs.append(stock_data)
        except Exception as e:
            print(f'Error fetching data for {symbol}, {e}')

    if fetched_data_dfs:
        # Concatenate all the DataFrames into a single DataFrame
        stock_data_df = pd.concat(fetched_data_dfs, axis='rows', join='inner')
    else:
        # pd.concat() raises ValueError on an empty list; fall back to an empty
        # frame with the expected columns so a header-only CSV is still written.
        print('No stock data could be fetched; writing an empty stocks file')
        stock_data_df = pd.DataFrame(
            columns=['1. open', '2. high', '3. low', '4. close', '5. volume', 'symbol']
        )

    # Turn the date index into a regular 'date' column and give the price
    # columns descriptive names.
    stock_data_df = stock_data_df.reset_index()
    stock_data_df = stock_data_df.rename(columns={
        'index': 'date',
        '1. open': 'open_price',
        '2. high': 'highest_price',
        '3. low': 'lowest_price',
        '4. close': 'close_price',
        '5. volume': 'volume',
    })
    stock_data_df['date'] = pd.to_datetime(stock_data_df['date'])

    stocks_data_path = './data/stocks'
    stock_data_df.to_csv(f'{create_folder(stocks_data_path)}/stocks_data.csv', index=False)

# Load the data into a SQLite database
print('Loading data into SQLite database')
# Bound before the try so the finally clause can tell whether connect() succeeded.
conn = None
try:
    conn = sqlite3.connect('etl_extended_case.db')
    c = conn.cursor()
    c.execute('CREATE TABLE IF NOT EXISTS headlines (headline TEXT, url TEXT, timestamp TEXT, sentiment_score REAL, sentiment TEXT, relevant_words TEXT, headline_es TEXT, headline_it TEXT)')
    c.execute('CREATE TABLE IF NOT EXISTS stocks (date TEXT, open_price REAL, highest_price REAL, lowest_price REAL, close_price REAL, volume REAL, symbol TEXT)')
    conn.commit()

    # Read the data from the CSV files and load into the SQLite database
    headlines_df = pd.read_csv('./data/headlines/headlines_data.csv')
    stock_data_df = pd.read_csv('./data/stocks/stocks_data.csv')
    # NOTE(review): if_exists='replace' drops and re-creates each table from
    # the DataFrame, so the column types declared above do not survive this
    # step; confirm whether that is intended.
    headlines_df.to_sql('headlines', conn, if_exists='replace', index=False)
    stock_data_df.to_sql('stocks', conn, if_exists='replace', index=False)
    print('ETL Job has been completed successfully')
except Exception as e:
    print(f'Error loading data into SQLite database, {e}')
finally:
    # Release the connection on every path; previously a failure anywhere in
    # the try body skipped close() and left the database handle open.
    if conn is not None:
        conn.close()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


-----------ETL Project-----------
-----------CNBC Headlines-----------
Fetching data for Nasdaq 100 index to under...
Fetching data for Stocks making the biggest...
Fetching data for Credit Suisse gives its t...
Fetching data for JPMorgan's top equity gur...
Fetching data for Buffett ups bet on energy...
Fetching data for Stocks making the biggest...
Fetching data for Bank of America names Ame...
Fetching data for Goldman Sachs says T-Mobi...
Fetching data for Piper Sandler upgrades Zi...
Fetching data for Buy 'best-in-class' JPMor...
Fetching data for Former CFTC chairman says...
Fetching data for Cramer's Lightning Round:...
Fetching data for China has a new foreign r...
Fetching data for Jim Cramer says investors...
Fetching data for Astra plans a reverse sto...
Fetching data for Meta surges as 100M sign ...
Fetching data for Morgan Stanley offers way...
Fetching data for Earnings season is upon u...
Fetching data for Stocks making the biggest...
Fetching data for The CPI trade: Wha

In [None]:
# Connect to the SQLite database
conn = sqlite3.connect('etl_extended_case.db')

In [None]:

# Run a SQL query using pandas: read every row of the headlines table
df = pd.read_sql_query("SELECT * FROM headlines", conn)
# Bare expression so the notebook cell displays the DataFrame
df

Unnamed: 0,headline,url,timestamp,sentiment_score,sentiment,relevant_words,headline_es,headline_it
0,Nasdaq 100 index to undergo special rejiggerin...,https://www.cnbc.com/2023/07/11/nasdaq-100-to-...,2023-07-11 17:06:30,0.052381,neutral,"['Nasdaq', '100', 'index', 'undergo', 'special...",El índice Nasdaq 100 se someterá a un reajuste...,L'indice Nasdaq 100 subirà un rejigger special...
1,Stocks making the biggest moves midday: Shutte...,https://www.cnbc.com/2023/07/11/stocks-making-...,2023-07-11 17:06:30,0.0,neutral,"['Stocks', 'making', 'biggest', 'move', 'midda...",Acciones que hacen los mayores movimientos al ...,Le azioni che fanno le mosse più grandi a mezz...
2,Credit Suisse gives its top stock picks for th...,https://www.cnbc.com/2023/07/11/credit-suisse-...,2023-07-11 17:06:30,0.111111,neutral,"['Credit', 'Suisse', 'give', 'top', 'stock', '...",Credit Suisse da sus mejores selecciones de ac...,Credit Suisse offre le sue migliori scelte di ...
3,JPMorgan's top equity guru says avoid stocks a...,https://www.cnbc.com/2023/07/11/jpmorgans-top-...,2023-07-11 17:06:30,0.5,positive,"['JPMorgan', ""'s"", 'top', 'equity', 'guru', 's...",El principal gurú de acciones de JPMorgan dice...,Il principale guru azionario di JPMorgan affer...
4,Buffett ups bet on energy infrastructure,https://www.cnbc.com/2023/07/11/berkshire-hath...,2023-07-11 17:06:30,0.0,neutral,"['Buffett', 'ups', 'bet', 'energy', 'infrastru...",Buffett apuesta por las infraestructuras energ...,Buffett punta sulle infrastrutture energetiche
5,Stocks making the biggest premarket moves: Jet...,https://www.cnbc.com/2023/07/11/stocks-making-...,2023-07-11 17:06:30,0.0,neutral,"['Stocks', 'making', 'biggest', 'premarket', '...",Acciones que realizan los mayores movimientos ...,Le azioni che effettuano le maggiori mosse pre...
6,Bank of America names American Express a top p...,https://www.cnbc.com/2023/07/11/bank-of-americ...,2023-07-11 17:06:30,0.25,positive,"['Bank', 'America', 'name', 'American', 'Expre...",Bank of America nombra a American Express como...,Bank of America nomina American Express una de...
7,Goldman Sachs says T-Mobile is a top pick than...,https://www.cnbc.com/2023/07/11/goldman-sachs-...,2023-07-11 17:06:30,0.4,positive,"['Goldman', 'Sachs', 'say', 'T-Mobile', 'top',...",Goldman Sachs dice que T-Mobile es una de las ...,Goldman Sachs afferma che T-Mobile è la scelta...
8,"Piper Sandler upgrades Zillow, sees real estat...",https://www.cnbc.com/2023/07/11/piper-sandler-...,2023-07-11 17:06:30,0.35,positive,"['Piper', 'Sandler', 'upgrade', 'Zillow', ',',...","Piper Sandler actualiza Zillow, ve que las acc...","Piper Sandler aggiorna Zillow, vede un rally d..."
9,Buy 'best-in-class' JPMorgan thanks to strong ...,https://www.cnbc.com/2023/07/11/buy-best-in-cl...,2023-07-11 17:06:30,0.211111,positive,"['Buy', ""'best-in-class"", ""'"", 'JPMorgan', 'th...",Compre JPMorgan 'mejor en su clase' gracias al...,"Acquista JPMorgan ""best-in-class"" grazie al fo..."
