In [1]:
import yfinance as yf
import datetime as dt
import requests, json, pandas as pd
from bs4 import BeautifulSoup
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType
import pandas as pd
import os
import shutil

PYSPARK_PYTHON = os.getenv("PYSPARK_PYTHON") 
PYSPARK_DRIVER_PYTHON = os.getenv("PYSPARK_DRIVER_PYTHON")
import pyspark
from delta import configure_spark_with_delta_pip, DeltaTable
import json

# Load the configuration JSON file
with open('/usr/local/spark/conf/spark-defaults.json', 'r') as f:
    config = json.load(f)

# Initialize the Spark session builder
builder = pyspark.sql.SparkSession.builder.appName("MyApp1").config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog").config("spark.pyspark.python", PYSPARK_PYTHON)\
    .config("spark.pyspark.driver.python", PYSPARK_DRIVER_PYTHON)

# Read the packages from the text file
packages = []
with open('/usr/local/spark/conf/packages.txt', 'r') as file:
    # Read each line and strip newlines or extra spaces
    packages = [line.strip() for line in file if line.strip()]

# # Add packages to the Spark session configuration
builder.config("spark.jars.packages", ",".join(packages))

# Apply the configurations from the JSON file to the Spark session
for key, value in config.items():
    builder.config(key, value)

# Configure Spark with Delta Lake (if needed)
spark = configure_spark_with_delta_pip(builder).getOrCreate()
# Now you can use the Spark session
spark

In [2]:
dfpd=pd.read_csv('/home/jovyan/Notebooks/Config_Stock.csv')
dfpath=spark.createDataFrame(dfpd)

In [3]:

trgt_path_processed = dfpath.filter(col("DataFeedName") == "Stock_Delta_Path").select('Path').collect()[0][0]
trgt_path_csv = dfpath.filter(col("DataFeedName") == "Stock_CSV_Path").select('Path').collect()[0][0]

In [4]:
def f_roe(financials_info, balance_sheet):
    net_income = financials_info.loc["Net Income"].iloc[0] if "Net Income" in financials_info.index else None
    # Fetch Shareholders' Equity from balance sheet
    total_equity = balance_sheet.loc["Stockholders Equity"].iloc[0] if "Stockholders Equity" in balance_sheet.index else None 
    return (net_income / total_equity) * 100 if type(net_income) == float and type(total_equity) == float else 0.00

In [5]:
def f_roce(financials_info, balance_sheet):
    ebit = financials_info.loc["Operating Income"].iloc[0] if "Operating Income" in financials_info.index else 0

    # # Get Total Assets and Current Liabilities from balance sheet
    total_assets = balance_sheet.loc["Total Assets"].iloc[0] if "Total Assets" in balance_sheet.index else 0
    current_liabilities = balance_sheet.loc["Current Liabilities"].iloc[0] if "Current Liabilities" in balance_sheet.index else 0

    # Calculate Capital Employed
    capital_employed = total_assets - current_liabilities

    return (ebit / capital_employed) * 100 if capital_employed != 0 else 0

In [6]:
def f_PEG(stock_info):
# Calculate PEG ratios
    trailing_pe = stock_info.get("trailingPE", None)
    forward_pe = stock_info.get("forwardPE", None)
    earnings_growth = stock_info.get("earningsGrowth", None)  # Provided as a decimal

    if earnings_growth is not None and earnings_growth > 0:
        
        trailing_peg = trailing_pe / (earnings_growth * 100) if trailing_pe else 0
        forward_peg = forward_pe / (earnings_growth * 100) if forward_pe else 0
        peg_t= trailing_peg if trailing_peg else "N/A"
        peg_f=forward_peg if forward_peg else "N/A"
    else:
        peg_f=peg_t="N/A"
    return peg_t,peg_f

In [7]:
def f_debt_to_equity(balance_sheet):
    total_liabilities = balance_sheet.loc["Total Liabilities Net Minority Interest"].iloc[0] if "Total Liabilities Net Minority Interest" in balance_sheet.index else 0
    shareholders_equity = balance_sheet.loc["Stockholders Equity"].iloc[0] if "Stockholders Equity" in balance_sheet.index else 0
    # Calculate Debt-to-Equity Ratio
    if shareholders_equity != 0:  # Avoid division by zero
        debt_to_equity_ratio = total_liabilities / shareholders_equity
    else:
        debt_to_equity_ratio = "N/A"
    return debt_to_equity_ratio

In [8]:
def f_sales_growth(income_statement):
    revenue = income_statement.loc["Total Revenue"] if "Total Revenue" in income_statement.index else {"0":"NA"}
    revenue = revenue.dropna() if isinstance(revenue, pd.Series) else revenue # Remove any periods with missing data
    # Ensure revenue has at least two periods to calculate growth
    if len(revenue) > 1:
        # Calculate sales growth between the latest two periods
        latest_growth = ((revenue.iloc[0] - revenue.iloc[1]) / revenue.iloc[1]) * 100 if revenue.iloc[1] != 0 else 0
        latest_period = revenue.index[0].strftime("%Y-%m-%d")
    else:
        # Handle cases where there isn't enough data
        latest_growth=0
        latest_period=0
    return latest_growth,latest_period

In [9]:
def f_MA(historical_data):
    # Calculate 50-day and 200-day moving averages
    if not historical_data.empty:
        historical_data["MA50"] = historical_data["Close"].rolling(window=50).mean()
        historical_data["MA200"] = historical_data["Close"].rolling(window=200).mean()
        # Return the latest MA50 and MA200
        latest_data = historical_data.iloc[-1]
        ma50=latest_data["MA50"] if latest_data["MA50"] else 0
        ma200=latest_data["MA200"] if latest_data["MA200"] else 0
    else:
        # Return the latest MA50 and MA200
        latest_data = 0
        ma50=0
        ma200=0
    return ma50, ma200

In [10]:
def f_stock_data(l_tickers):
    ticker_data = []
    headers=["Ticker","Sector","Industry","52_week_high","ROE","ROCE","Trailing_PEG","Forward_PEG","Debt_to_Equity","Latest_Finanacial_Year","Sales_Growth","MA50","MA200"]
    for t in l_tickers:
        ticker = yf.Ticker(t + ('.BO' if t.isdigit() else '.NS'))
        stock_info = ticker.info
        balance_sheet = ticker.balance_sheet
        financials_info=ticker.financials
        income_statement=ticker.income_stmt
        historical_data = ticker.history(period="ytd")
        v_roe=f_roe(financials_info, balance_sheet)
        v_roce=f_roce(financials_info, balance_sheet)
        v_peg_t,v_peg_f=f_PEG(stock_info)
        v_debt_to_equity=f_debt_to_equity(balance_sheet)
        v_sales_growth,v_latest_period=f_sales_growth(income_statement)
        v_ma50,v_ma200=f_MA(historical_data)
        ticker_data.append([t, stock_info.get("sector", "N/A"), stock_info.get("industry", "N/A"), stock_info.get("fiftyTwoWeekHigh", None),v_roe,v_roce,v_peg_t,v_peg_f,v_debt_to_equity,v_latest_period,v_sales_growth,v_ma50,v_ma200])
    df_retun = pd.DataFrame(ticker_data, columns=headers)
    return(df_retun)

In [11]:
# URL to scrape
url_link=["https://www.google.com/finance/markets/gainers","https://www.google.com/finance/markets/losers"]
rows = []
headers=["Ticker","Stock_Name","CMP","Change","Change_Percentage"]
for url in url_link:
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the parent container
        parent_container = soup.find('ul', class_='sbnBtf')
        if parent_container:
            # Find all stock entries within the parent container
            stock_entries = parent_container.find_all('li')
            for stock in stock_entries:
                # Extract relevant details for each stock
                stock_ticker = stock.find('div', class_='COaKTb').text if stock.find('div', class_='COaKTb') else "N/A"
                stock_name = stock.find('div', class_='ZvmM7').text if stock.find('div', class_='ZvmM7') else "N/A"
                stock_price = stock.find('div', class_='YMlKec').text if stock.find('div', class_='YMlKec') else "N/A"
                stock_change = stock.find('div', class_='BAftM').text if stock.find('div', class_='BAftM') else "N/A"
                stock_percent = stock.find('div', class_='zWwE1').text if stock.find('div', class_='zWwE1') else "N/A"
                # Add extracted data to the list
                rows.append([stock_ticker,stock_name,stock_price,stock_change,stock_percent])
    
    # Convert to JSON string with readable characters
df_pd_today = pd.DataFrame(rows, columns=headers)


In [12]:
# Extract unique tickers as a Python list
l_tickers = df_pd_today["Ticker"].unique().tolist()
# Print the result
print(l_tickers)

['ITI', 'BTML', 'INNOVANA', 'THYROCARE', 'ASMS', 'BALAXI', 'SECURKLOUD', 'USK', 'NURECA', 'PTCIL', 'MANCREDIT', 'AVONMORE', 'KIOCL', 'MAHESHWARI', 'CINEVISTA', 'ZOTA', 'ASTEC', 'VASWANI', 'PRAXIS', 'DGCONTENT', 'RBZJEWEL', 'AARTECH', 'VARDMNPOLY', 'PIGL', 'VISASTEEL', 'SUNFLAG', 'ARSSINFRA', 'CCCL', 'BOHRAIND', 'COFFEEDAY', 'SETCO', 'MTEDUCARE', 'KAMOPAINTS', 'OMKARCHEM', 'PREMIER', 'ROLTA', 'BGLOBAL', 'JTLIND', 'AEGISLOG', 'FCONSUMER', 'BCG', 'FSC', 'INDOTHAI', 'MARSHALL', 'CREST', 'FILATFASH', 'EQUITASBNK', 'QUINTEGRA', 'CCHHL', 'GLOBE', 'SPCENET', 'SIKKO-RE', 'AVON-RE', 'JAICORPLTD', 'CORALFINAC', 'SHANTI', 'BAJAJHCARE', 'SIYSIL', 'KARMAENG', 'UNITEDTEA', 'ASHOKA', 'SADBHIN', 'DELTAMAGNT', 'PRAENG', 'SINCLAIR', 'VIDHIING', 'UMAEXPORTS', 'BIOFILCHEM', 'VAKRANGEE', 'PREMEXPLN', 'JINDALPHOT', 'VINEETLAB', 'EMBDL', 'HIMATSEIDE', 'STEL', 'IXIGO', 'STARTECK', 'YATRA', 'STYLEBAAZA', 'SHIVAMAUTO', 'UNIONBANK', 'ITDC', 'SURYAROSNI', 'MONARCH', 'SWANENERGY', 'RIIL', 'PDMJEPAPER', 'GINNIFILA',

In [13]:
# t=("750940")
# ticker = yf.Ticker(t + ".BO" if t.isdigit() else t + ".NS")
# stock_info = ticker.info
# balance_sheet = ticker.balance_sheet
# financials_info=ticker.financials
# income_statement=ticker.income_stmt
# revenue = income_statement.loc["Total Revenue"] if "Total Revenue" in income_statement.index else {"0":"NA"}
# revenue = revenue.dropna() if isinstance(revenue, pd.Series) else revenue # Remove any periods with missing data
# # Ensure revenue has at least two periods to calculate growth
# if len(revenue) > 1:
#     # Calculate sales growth between the latest two periods
#     latest_growth = ((revenue.iloc[0] - revenue.iloc[1]) / revenue.iloc[1]) * 100 if revenue.iloc[1] != 0 else 0
#     latest_period = revenue.index[0].strftime("%Y-%m-%d")
# else:
#     # Handle cases where there isn't enough data
#     latest_growth=0.0
#     latest_period=0.0
# print(len(revenue))
# print(latest_growth,latest_period)

In [14]:
df_stock_data=f_stock_data(l_tickers)


SIKKO-RE.NS: Period 'ytd' is invalid, must be one of ['1d', '5d']


$AVON-RE.NS: possibly delisted; no price data found  (period=ytd)


In [15]:
df_custom=df_pd_today.merge(df_stock_data, on='Ticker',how='left')
reorder_colms=["Ticker","Stock_Name","Sector","Industry","CMP","Change","Change_Percentage"]+[col for col in df_custom.columns if col not in ["Ticker","Stock_Name","Sector","Industry","CMP","Change","Change_Percentage"]]
df_spark=spark.createDataFrame(df_custom[reorder_colms])

In [16]:
df_master = df_spark.filter(
                            (col("ROE") >= 15) &
                            (col("ROCE") >= 15) &
                            (col("Debt_to_Equity") <= 1) &
                            (col("MA50") >= col("MA200"))
                            ).withColumn("ROE",coalesce(round(col("ROE"), 2),lit(0))) \
                            .withColumn("ROCE", coalesce(round(col("ROCE"), 2),lit(0))) \
                            .withColumn("Trailing_PEG", coalesce(round(col("Trailing_PEG"), 2),lit(0))) \
                            .withColumn("Forward_PEG", coalesce(round(col("Forward_PEG"), 2),lit(0))) \
                            .withColumn("Debt_to_Equity", coalesce(round(col("Debt_to_Equity"), 2),lit(0))) \
                            .withColumn("Sales_Growth", coalesce(round(col("Sales_Growth"), 2), lit(0))) \
                            .withColumn("MA50", coalesce(round(col("MA50"), 2), lit(0))) \
                            .withColumn("MA200", coalesce(round(col("MA200"), 2), lit(0))) \
                            .withColumn(
                                "Gainer_Looser",
                                when(
                                    regexp_replace(col("Change"), "₹", "").cast("float") < 0.0, "L"
                                ).otherwise("G")
                            ).withColumn('WatchOutFlag', lit(0))

In [17]:
if DeltaTable.isDeltaTable(spark, trgt_path_processed):
    df_read=spark.read.format('delta').load(trgt_path_processed)
    df_repeat=df_master.join(
        df_read.select("Ticker","UpdateTimestamp"), on='Ticker', how='inner')\
            .withColumn('RunTimeStamp',current_timestamp())\
            .withColumn("WatchOutFlag", when(
                col('RunTimeStamp') > date_add(col("UpdateTimestamp"), 1), (col('WatchOutFlag') + 1)).otherwise(col('WatchOutFlag'))) \
            .withColumn("UpdateTimestamp",  when(
                        col('RunTimeStamp') > date_add(col("UpdateTimestamp"), 1),
                        to_timestamp(date_format(current_date(), format="yyyy-MM-dd 10:30:00"))).otherwise(col('UpdateTimestamp')))\
            .drop('RunTimeStamp')
    df_output=df_master\
        .drop('WatchOutFlag')\
        .join(df_repeat.select('Ticker','WatchOutFlag',"UpdateTimestamp"), on='Ticker',how='left') \
        .withColumn('WatchOutFlag', coalesce(col("WatchOutFlag"), lit(0)).cast("int"))
        
else:
    df_output=df_master.withColumn('WatchOutFlag',lit(0))\
                .withColumn('UpdateTimestamp', date_format(current_timestamp(), format="yyyy-MM-dd 10:30:00").cast('timestamp'))
df_output.withColumn("PKSK", xxhash64(col("Ticker")).cast("string"))\
        .withColumn("RowSK", xxhash64(concat_ws("|", *[col(c) for c in df_output.columns])))\
        .createOrReplaceTempView('vw_source')

In [18]:
if DeltaTable.isDeltaTable(spark, trgt_path_processed):
    column_name = df_output.columns
    set_clause = ", ".join([f"target.{i} = source.{i}" for i in column_name])
    insert_clause=",".join(column_name)
    insert_values=",".join([f"source.{i}" for i in column_name])
    query = f"""MERGE INTO delta.`{trgt_path_processed}` AS target 
            USING vw_source AS source 
            ON target.PKSK = source.PKSK 
            AND target.RowSK <> source.RowSK 
            WHEN MATCHED THEN UPDATE SET {set_clause}
            WHEN NOT MATCHED THEN INSERT ({insert_clause}) VALUES ({insert_values})"""
    spark.sql(query)        
else :
    query=f"CREATE TABLE delta.`{trgt_path_processed}` USING DELTA AS SELECT * FROM vw_source"
    spark.sql(query)

print(query)

MERGE INTO delta.`/mnt/Stock_market_data/Processed_Parquet/` AS target 
            USING vw_source AS source 
            ON target.PKSK = source.PKSK 
            AND target.RowSK <> source.RowSK 
            WHEN MATCHED THEN UPDATE SET target.Ticker = source.Ticker, target.Stock_Name = source.Stock_Name, target.Sector = source.Sector, target.Industry = source.Industry, target.CMP = source.CMP, target.Change = source.Change, target.Change_Percentage = source.Change_Percentage, target.52_week_high = source.52_week_high, target.ROE = source.ROE, target.ROCE = source.ROCE, target.Trailing_PEG = source.Trailing_PEG, target.Forward_PEG = source.Forward_PEG, target.Debt_to_Equity = source.Debt_to_Equity, target.Latest_Finanacial_Year = source.Latest_Finanacial_Year, target.Sales_Growth = source.Sales_Growth, target.MA50 = source.MA50, target.MA200 = source.MA200, target.Gainer_Looser = source.Gainer_Looser, target.WatchOutFlag = source.WatchOutFlag, target.UpdateTimestamp = source.UpdateT

In [19]:
df_read=spark.read.format('delta').load(trgt_path_processed)

In [20]:
# Save the DataFrame to a CSV file
spark.read.format("delta").load(trgt_path_processed)\
    .coalesce(1).write.format("csv").option("header","true").mode("overwrite").save(trgt_path_csv)

In [21]:
trgt_copy_path = trgt_path_csv + "processed.csv"
files=os.listdir(trgt_path_csv)
selected_files = [file for file in files if file.startswith('part-00') and file.endswith('.csv')]
file=trgt_path_csv + selected_files[0]
print(selected_files)
shutil.copy(file, trgt_copy_path)

['part-00000-eb853d6f-4ece-44bf-8989-f4b0a7ddcecd-c000.csv']


'/mnt/Stock_market_data/Processed/processed.csv'

In [22]:
delete_log = [file for file in files if "processed.csv" != file ]
for file in delete_log :
    os.remove(trgt_path_csv + file)
    print(f"removed {trgt_path_csv + file}")

removed /mnt/Stock_market_data/Processed/._SUCCESS.crc
removed /mnt/Stock_market_data/Processed/_SUCCESS
removed /mnt/Stock_market_data/Processed/.part-00000-eb853d6f-4ece-44bf-8989-f4b0a7ddcecd-c000.csv.crc
removed /mnt/Stock_market_data/Processed/part-00000-eb853d6f-4ece-44bf-8989-f4b0a7ddcecd-c000.csv
