In [227]:
import yfinance as yf
import datetime as dt
import requests, json, pandas as pd
from bs4 import BeautifulSoup
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType
import pandas as pd
import os
import shutil

# Interpreter paths for the Spark workers and driver; os.getenv returns None
# when the corresponding environment variable is not set.
PYSPARK_PYTHON = os.getenv("PYSPARK_PYTHON")
PYSPARK_DRIVER_PYTHON = os.getenv("PYSPARK_DRIVER_PYTHON")
import pyspark
from delta import configure_spark_with_delta_pip, DeltaTable
import json

# Load the configuration JSON file
with open('/usr/local/spark/conf/spark-defaults.json', 'r') as f:
    config = json.load(f)

# Initialize the Spark session builder with the Delta Lake SQL extension and catalog
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Only pin the Python interpreters when the environment actually provides them,
# so a missing variable is never handed to Spark as an interpreter path.
if PYSPARK_PYTHON:
    builder = builder.config("spark.pyspark.python", PYSPARK_PYTHON)
if PYSPARK_DRIVER_PYTHON:
    builder = builder.config("spark.pyspark.driver.python", PYSPARK_DRIVER_PYTHON)

# Read the packages from the text file (one Maven coordinate per non-blank line)
packages = []
with open('/usr/local/spark/conf/packages.txt', 'r') as file:
    # Read each line and strip newlines or extra spaces
    packages = [line.strip() for line in file if line.strip()]

# Apply the configurations from the JSON file to the Spark session
for key, value in config.items():
    builder = builder.config(key, value)

# configure_spark_with_delta_pip sets "spark.jars.packages" itself, which would
# overwrite a value configured on the builder beforehand. Passing the extra
# coordinates through `extra_packages` keeps them alongside the Delta artifact.
spark = configure_spark_with_delta_pip(builder, extra_packages=packages).getOrCreate()
# Now you can use the Spark session
spark

In [228]:
def f_roe(financials_info, balance_sheet):
    """Return Return-on-Equity (in percent) for the most recent reporting period.

    Parameters
    ----------
    financials_info : pandas.DataFrame
        Frame indexed by line item; the "Net Income" row is read and its first
        column is taken as the latest period.
    balance_sheet : pandas.DataFrame
        Frame indexed by line item; the "Stockholders Equity" row is read the
        same way.

    Returns
    -------
    float
        ``net_income / equity * 100``, or ``0.00`` when either row is missing,
        empty, non-numeric, NaN, or when equity is zero.
    """
    if "Net Income" not in financials_info.index:
        return 0.00
    if "Stockholders Equity" not in balance_sheet.index:
        return 0.00

    income_row = financials_info.loc["Net Income"]
    equity_row = balance_sheet.loc["Stockholders Equity"]
    if len(income_row) == 0 or len(equity_row) == 0:
        return 0.00

    # pandas hands back numpy scalars, for which `type(x) == float` is False;
    # converting explicitly accepts any real-number type.
    try:
        net_income = float(income_row.iloc[0])
        total_equity = float(equity_row.iloc[0])
    except (TypeError, ValueError):
        return 0.00

    if pd.isna(net_income) or pd.isna(total_equity) or total_equity == 0:
        return 0.00
    return (net_income / total_equity) * 100

In [229]:
def f_roce(financials_info, balance_sheet):
    """Return Return-on-Capital-Employed (in percent) for the latest period.

    ROCE is computed as ``Operating Income / (Total Assets - Current
    Liabilities) * 100`` using the first column of each row.

    Parameters
    ----------
    financials_info : pandas.DataFrame
        Frame indexed by line item; the "Operating Income" row is read.
    balance_sheet : pandas.DataFrame
        Frame indexed by line item; the "Total Assets" and
        "Current Liabilities" rows are read.

    Returns
    -------
    number
        The ROCE percentage, or ``0`` when capital employed is zero. A missing
        row, an empty row, or a NaN value is treated as ``0`` for that input.
    """
    def _latest(frame, label):
        # Missing row, row without periods, and NaN all mean "no data" -> 0,
        # so NaN never leaks into the ratio.
        if label not in frame.index:
            return 0
        row = frame.loc[label]
        if len(row) == 0:
            return 0
        value = row.iloc[0]
        return 0 if pd.isna(value) else value

    ebit = _latest(financials_info, "Operating Income")

    # Get Total Assets and Current Liabilities from balance sheet
    total_assets = _latest(balance_sheet, "Total Assets")
    current_liabilities = _latest(balance_sheet, "Current Liabilities")

    # Calculate Capital Employed
    capital_employed = total_assets - current_liabilities

    return (ebit / capital_employed) * 100 if capital_employed != 0 else 0

In [230]:
def f_PEG(stock_info):
    """Compute trailing and forward PEG ratios from a stock-info mapping.

    Reads "trailingPE", "forwardPE" and "earningsGrowth" (a decimal fraction)
    via ``stock_info.get``. Each PEG is ``PE / (earningsGrowth * 100)``.

    Returns
    -------
    tuple
        ``(trailing_peg, forward_peg)``. An entry is the string ``"N/A"`` when
        growth is missing or not positive, when the PE is missing/falsy, or
        when the computed ratio itself is falsy.
    """
    trailing_pe = stock_info.get("trailingPE", None)
    forward_pe = stock_info.get("forwardPE", None)
    growth = stock_info.get("earningsGrowth", None)  # decimal fraction

    # Without a strictly positive growth figure no PEG can be derived.
    if growth is None or not growth > 0:
        return "N/A", "N/A"

    def _peg(pe):
        # Falsy PE (None / 0) and falsy ratio both collapse to the sentinel.
        if not pe:
            return "N/A"
        ratio = pe / (growth * 100)
        return ratio if ratio else "N/A"

    peg_t = _peg(trailing_pe)
    peg_f = _peg(forward_pe)
    return peg_t, peg_f

In [231]:
def f_debt_to_equity(balance_sheet):
    """Return total liabilities divided by stockholders' equity.

    Reads the first column of the "Total Liabilities Net Minority Interest"
    and "Stockholders Equity" rows; a missing row counts as 0.

    Returns
    -------
    number or str
        The ratio, or the string ``"N/A"`` when equity is 0 (including the
        case where the equity row is absent).
    """
    line_items = balance_sheet.index

    liabilities = 0
    if "Total Liabilities Net Minority Interest" in line_items:
        liabilities = balance_sheet.loc["Total Liabilities Net Minority Interest"].iloc[0]

    equity = 0
    if "Stockholders Equity" in line_items:
        equity = balance_sheet.loc["Stockholders Equity"].iloc[0]

    # Guard clause: dividing by zero equity is undefined.
    if equity == 0:
        return "N/A"
    return liabilities / equity

In [232]:
def f_sales_growth(income_statement):
    """Return the latest period-over-period revenue growth and its period label.

    Uses the "Total Revenue" row of ``income_statement``; periods with missing
    values are dropped, and the first two remaining entries are compared.

    Returns
    -------
    tuple
        ``(growth_percent, period)`` where ``period`` is the first index label
        formatted as ``"%Y-%m-%d"``. Growth is 0 when the previous period's
        revenue is 0. When fewer than two periods are available the result is
        ``(0, 0)``.
    """
    if "Total Revenue" in income_statement.index:
        revenue = income_statement.loc["Total Revenue"]
    else:
        # Single-entry placeholder: it has length 1, so it falls into the
        # "not enough data" path below.
        revenue = {"0":"NA"}

    if isinstance(revenue, pd.Series):
        revenue = revenue.dropna()  # discard periods without a figure

    # Two periods are needed to compute a growth rate.
    if not len(revenue) > 1:
        return 0, 0

    previous = revenue.iloc[1]
    if previous != 0:
        latest_growth = ((revenue.iloc[0] - previous) / previous) * 100
    else:
        latest_growth = 0
    latest_period = revenue.index[0].strftime("%Y-%m-%d")
    return latest_growth, latest_period

In [233]:
def f_MA(historical_data):
    """Return the latest 50- and 200-row moving averages of the "Close" column.

    Parameters
    ----------
    historical_data : pandas.DataFrame
        Price history with a "Close" column, oldest row first. The frame is
        only read; no columns are added to it.

    Returns
    -------
    tuple
        ``(ma50, ma200)`` taken from the last row. A value is NaN when fewer
        rows than the window size are available. ``(0, 0)`` for an empty frame.
    """
    if historical_data.empty:
        return 0, 0

    # Work on the Close series directly instead of writing MA50/MA200 columns
    # back into the caller's DataFrame.
    close = historical_data["Close"]
    ma50 = close.rolling(window=50).mean().iloc[-1]
    ma200 = close.rolling(window=200).mean().iloc[-1]
    return ma50, ma200

In [234]:
def f_stock_data(l_tickers):
    """Collect fundamental and technical metrics for each ticker via yfinance.

    Parameters
    ----------
    l_tickers : iterable of str
        Tickers written as ``"<symbol>.<exchange>"``. An exchange of ``"BSE"``
        maps to the ``.BO`` suffix; anything else (including a ticker with no
        ``"."`` at all) maps to ``.NS``.

    Returns
    -------
    str
        A JSON array (indent 4, non-ASCII preserved) with one object per
        ticker holding the original ticker string, the 52-week high, ROE,
        ROCE, trailing/forward PEG, debt-to-equity, latest financial period,
        sales growth, MA50 and MA200.
    """
    ticker_data = []
    for t in l_tickers:
        # partition never raises on a ticker without ".", unlike split(".")[1].
        symbol, _, exchange = t.partition('.')
        ticker = yf.Ticker(symbol + ('.BO' if exchange == 'BSE' else '.NS'))
        stock_info = ticker.info
        balance_sheet = ticker.balance_sheet
        financials_info = ticker.financials
        income_statement = ticker.income_stmt
        try:
            historical_data = ticker.history(period="ytd")
        except Exception as e:
            print(e)
            # Fall back to an empty frame so this ticker neither crashes on an
            # unbound name nor silently reuses the previous ticker's history.
            historical_data = pd.DataFrame()
        v_roe = f_roe(financials_info, balance_sheet)
        v_roce = f_roce(financials_info, balance_sheet)
        v_peg_t, v_peg_f = f_PEG(stock_info)
        v_debt_to_equity = f_debt_to_equity(balance_sheet)
        v_sales_growth, v_latest_period = f_sales_growth(income_statement)
        v_ma50, v_ma200 = f_MA(historical_data)
        ticker_data.append(
            {
                "Ticker":t,
                "52-week_high":stock_info.get("fiftyTwoWeekHigh", None),
                "ROE":v_roe,
                "ROCE":v_roce,
                "Trailing_PEG":v_peg_t,
                "Forward_PEG":v_peg_f,
                "Debt-to-Equity":v_debt_to_equity,
                "Latest_Finanacial_Year":v_latest_period,
                "Sales_Growth":v_sales_growth,
                "MA50":v_ma50,
                "MA200":v_ma200
            }
        )

    json_data = json.dumps(ticker_data, indent=4, ensure_ascii=False)
    return(json_data)

In [250]:
# URL to scrape
url_link=["https://www.google.com/finance/markets/gainers"]
all_stock_data = []


def _div_text(node, css_class):
    """Return the text of the first <div> with `css_class` under `node`, or "N/A"."""
    match = node.find('div', class_=css_class)
    return match.text if match else "N/A"


for url in url_link:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Surface failures instead of silently producing an empty result.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the parent container
    parent_container = soup.find('ul', class_='sbnBtf')
    if not parent_container:
        continue

    # Find all stock entries within the parent container
    stock_entries = parent_container.find_all('li')
    for stock in stock_entries:
        # Extract relevant details for each stock
        # NOTE(review): the class-to-field mapping follows the variable names
        # used here; confirm against the live page markup.
        stock_ticker = _div_text(stock, 'COaKTb')
        stock_name = _div_text(stock, 'ZvmM7')
        stock_price = _div_text(stock, 'YMlKec')

        # Keys now match the values they hold (previously the ticker was
        # stored under "name", the name under "price" and the price under
        # "change").
        all_stock_data.append({
            "ticker": stock_ticker,
            "name": stock_name,
            "price": stock_price
        })

# NOTE(review): later cells read `df_pd_today`, but no live code in this cell
# creates it; the commented-out lines below reference `rows`, `headers` and
# `tickers`, which are not defined in the visible notebook.
    # Convert to JSON string with readable characters
# df_pd_today = pd.DataFrame(rows, columns=headers)
# df_pd_today["Ticker"]=tickers
# df_pd_today = df_pd_today[['Ticker'] + [col for col in df_pd_today.columns if col != 'Ticker']]


In [236]:
# Distinct ticker symbols, in order of first appearance, as a plain Python list
l_tickers = pd.unique(df_pd_today["Ticker"]).tolist()
# Show which tickers will be looked up
print(l_tickers)

In [237]:
# t=("750940")
# ticker = yf.Ticker(t + ".BO" if t.isdigit() else t + ".NS")
# stock_info = ticker.info
# balance_sheet = ticker.balance_sheet
# financials_info=ticker.financials
# income_statement=ticker.income_stmt
# revenue = income_statement.loc["Total Revenue"] if "Total Revenue" in income_statement.index else {"0":"NA"}
# revenue = revenue.dropna() if isinstance(revenue, pd.Series) else revenue # Remove any periods with missing data
# # Ensure revenue has at least two periods to calculate growth
# if len(revenue) > 1:
#     # Calculate sales growth between the latest two periods
#     latest_growth = ((revenue.iloc[0] - revenue.iloc[1]) / revenue.iloc[1]) * 100 if revenue.iloc[1] != 0 else 0
#     latest_period = revenue.index[0].strftime("%Y-%m-%d")
# else:
#     # Handle cases where there isn't enough data
#     latest_growth=0.0
#     latest_period=0.0
# print(len(revenue))
# print(latest_growth,latest_period)

In [238]:
# Fetch the metrics for every ticker; f_stock_data returns them as a JSON string.
stock_data=f_stock_data(l_tickers)



In [239]:
# Convert JSON string to Python object (list of dictionaries)
function_data = json.loads(stock_data)

# Convert list of dictionaries to a pandas DataFrame
df_pd = pd.DataFrame(function_data)
df_merged = df_pd_today.merge(df_pd, on='Ticker', how='left')

# The metric helpers return the string "N/A" (or leave NaN) next to real
# numbers, which gives mixed-type object columns that Spark cannot infer a
# single type for. Coerce the metrics to numeric ("N/A" -> NaN) and render the
# period label (a date string, or 0 when unavailable) uniformly as text.
numeric_metric_cols = [
    "52-week_high", "ROE", "ROCE", "Trailing_PEG", "Forward_PEG",
    "Debt-to-Equity", "Sales_Growth", "MA50", "MA200",
]
coerced_cols = {
    metric: pd.to_numeric(df_merged[metric], errors="coerce")
    for metric in numeric_metric_cols
    if metric in df_merged.columns
}
if "Latest_Finanacial_Year" in df_merged.columns:
    coerced_cols["Latest_Finanacial_Year"] = df_merged["Latest_Finanacial_Year"].astype(str)

df_custom = df_merged.assign(**coerced_cols)
df_spark=spark.createDataFrame(df_custom)

In [246]:
# Keep only stocks passing the quality screen (ROE and ROCE of at least 15,
# debt-to-equity of at most 1, MA50 at or above MA200), round the metrics to
# two decimals and tag each row as gainer ("G") or loser ("L").
# Column names use the underscore spelling emitted by f_stock_data
# ("Trailing_PEG", "Forward_PEG", "Sales_Growth"); the spaced spelling does not
# match those keys.
df_master = (
    df_spark.filter(
        (col("ROE") >= 15) &
        (col("ROCE") >= 15) &
        (col("Debt-to-Equity") <= 1) &
        (col('MA50') >= col('MA200'))
    )
    .withColumn("ROE", round(col('ROE'), 2))
    .withColumn("ROCE", round(col('ROCE'), 2))
    .withColumn("Trailing_PEG", round(col("Trailing_PEG"), 2))
    .withColumn("Forward_PEG", round(col("Forward_PEG"), 2))
    .withColumn("Debt-to-Equity", round(col("Debt-to-Equity"), 2))
    .withColumn("Sales_Growth", round(col("Sales_Growth"), 2))
    .withColumn("MA50", round(col("MA50"), 2))
    .withColumn("MA200", round(col("MA200"), 2))
    .withColumn(
        "Gainer/Looser",
        when(
            # Strip the currency sign and thousands separators before casting;
            # a value such as "-1,234.5" would otherwise cast to null and be
            # labelled as a gainer.
            regexp_replace(col('change'), "[₹,]", "").cast('float') < 0.0, "L"
        ).otherwise("G"))
)

In [44]:
# df_master.write.format('delta').save("/mnt/Stock_market_date/Processed_Parquet")

In [247]:
# Load the persisted Delta table from disk and render it in the notebook
df_read = spark.read.load("/mnt/Stock_market_date/Processed_Parquet", format='delta')
display(df_read)

In [None]:
# NOTE(review): `test` is not defined in any visible cell, so this cell fails on
# a fresh kernel; presumably it is a yf.Ticker object — confirm and define it.
# Despite its name, `news` holds the `.info` attribute; `x2` holds `.news`.
news=test.info
x2=test.news
print(x2)

In [None]:


# Look up 'longName' in `news` (assigned from `test.info` in the previous cell),
# falling back to 'NA' when the key is absent.
x=news.get('longName', 'NA')
print(x)