# Piotroski Strategy Data Mining

### Get Data

In [1]:
import requests 
import pandas as pd
from bs4 import BeautifulSoup 
import json
def get_data(URL, timeout=30):
    """Download a stock page and return its embedded ``stockFinancialData``.

    The page carries its data as a JSON document inside the
    ``<script id="__NEXT_DATA__">`` tag; this function parses that document
    and returns the value stored under
    ``props -> pageProps -> stockFinancialData``.

    Args:
        URL: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``stockFinancialData`` value from the page, or an empty list when
        the download or the parsing fails (the error is printed, not raised).
    """
    stockFinancialData = []
    try:
        # requests applies no timeout by default; without one a single
        # stalled connection would block the whole scraping loop forever.
        r = requests.get(URL, timeout=timeout)
        soup = BeautifulSoup(r.content, 'html5lib')
        script = soup.find('script', attrs = {'id':'__NEXT_DATA__'})
        if script is None or not script.string:
            raise ValueError("__NEXT_DATA__ script tag not found in {}".format(URL))
        # Read the tag's text directly instead of slicing str(script), which
        # depends on how the opening tag happens to be serialised.
        json_object = json.loads(script.string)
        stockFinancialData = json_object['props']['pageProps']["stockFinancialData"]
    except Exception as e:
        # Best effort: report the problem and fall through to the empty list.
        print(e)
    return stockFinancialData

### Data Structuring

In [2]:
def data_structuring(stockFinancialData):
    """Split raw financial reports into per-period statement dictionaries.

    Each report is a dict with a ``"Type"`` and a ``"stockFinancialMap"``
    holding up to three statements: ``"CAS"`` (cash flow), ``"BAL"``
    (balance sheet) and ``"INC"`` (income). Every statement is a list of
    ``{"key": ..., "value": ...}`` items, which is flattened to ``{key: value}``.

    Reports whose ``"Type"`` is ``"Annual"`` are indexed by ``"FiscalYear"``;
    all other reports are treated as quarterly and indexed by ``"EndDate"``.

    * Annual: if any of the three statements cannot be read, a message is
      printed and the year is left with empty dictionaries.
    * Quarterly: the income statement is stored whenever it is readable;
      cash flow and balance sheet are stored only when both are readable.

    Args:
        stockFinancialData: Iterable of report dictionaries.

    Returns:
        Tuple ``(annualCashFlow, annualBalance, annualIncome,
        quarterlyCashFlow, quarterlyBalance, quarterlyIncome)``; each element
        maps a period label to a ``{key: value}`` dictionary.
    """
    def _flatten(items):
        # [{"key": k, "value": v}, ...] -> {k: v, ...}
        return {item["key"]: item["value"] for item in items}

    annualCashFlow = {}
    annualBalance = {}
    annualIncome = {}
    quarterlyCashFlow = {}
    quarterlyBalance = {}
    quarterlyIncome = {}
    for report in stockFinancialData:
        # Checking if the report is Annual or not
        if report["Type"] == "Annual":
            fiscalYear = report["FiscalYear"]
            annualCashFlow[fiscalYear] = {}
            annualBalance[fiscalYear] = {}
            annualIncome[fiscalYear] = {}
            try:
                statements = report["stockFinancialMap"]
                # Look up all three statements before storing anything so a
                # missing one skips the whole year.
                cashFlow = statements["CAS"]
                balanceSheet = statements["BAL"]
                incomeStatement = statements["INC"]
                annualCashFlow[fiscalYear].update(_flatten(cashFlow))
                annualBalance[fiscalYear].update(_flatten(balanceSheet))
                annualIncome[fiscalYear].update(_flatten(incomeStatement))
            except Exception as e:
                print("Skipping this year for stock",e)
        else:
            endDate = report["EndDate"]
            quarterlyCashFlow[endDate] = {}
            quarterlyBalance[endDate] = {}
            quarterlyIncome[endDate] = {}
            try:
                statements = report["stockFinancialMap"]
                quarterlyIncome[endDate] = _flatten(statements["INC"])
            except Exception:
                # No readable income statement for this period. Leave the
                # period empty rather than reusing a statement left over
                # from a previously processed report.
                continue
            try:
                cashFlow = _flatten(statements["CAS"])
                balanceSheet = _flatten(statements["BAL"])
            except Exception:
                # Cash flow / balance sheet unavailable for this period;
                # keep the income figures only.
                continue
            quarterlyCashFlow[endDate] = cashFlow
            quarterlyBalance[endDate] = balanceSheet
    return annualCashFlow,annualBalance,annualIncome,quarterlyCashFlow,quarterlyBalance,quarterlyIncome
            



In [3]:
# Characters stripped out of column labels before they are used as table columns.
characters_to_replace = ['/', '(', ')', '-']


def change_column_names(s, chars, replacement='_'):
    """Return ``s`` with every occurrence of each entry of ``chars`` replaced.

    Entries are applied one after another, in the order given.

    Args:
        s: The string to clean.
        chars: Iterable of substrings to substitute.
        replacement: Text inserted in place of each match (default ``'_'``).

    Returns:
        The cleaned string.
    """
    sanitized = s
    for unwanted in chars:
        sanitized = sanitized.replace(unwanted, replacement)
    return sanitized

### Generate DataFrames

In [4]:
from sqlalchemy import create_engine,inspect
# Database credentials are read from creds.json (key "sql_creds") so they are
# not hard-coded in the notebook. The context manager closes the file handle.
with open("creds.json") as creds_file:
    sql_creds = json.load(creds_file)
user = sql_creds["sql_creds"]["user"]
passw = sql_creds["sql_creds"]["passw"]
host = sql_creds["sql_creds"]["host"]
port = 3306
database = 'stock_fundamentals'
# Create an engine instance
# NOTE(review): user/passw are interpolated verbatim; credentials containing
# characters such as '@' or '/' would need URL-escaping — confirm.
engine = create_engine(f'mysql+pymysql://{user}:{passw}@{host}:{port}/{database}')

def generate_dataframes(annual,quarterly,stockName):
    """Build one table for a stock from its annual and quarterly statements.

    Each input maps a period label to a ``{item: value}`` dictionary; after
    transposing, every period becomes one row. Rows are tagged
    ``"Annual"`` or ``"Quarter"``, stacked, labelled with ``stockName`` and
    the period label is exposed as the ``Year_Date`` column. Column names
    are cleaned with ``change_column_names``.

    Args:
        annual: Mapping of period label -> statement items (annual reports).
        quarterly: Mapping of period label -> statement items (other reports).
        stockName: Value written to the ``stockName`` column of every row.

    Returns:
        The combined DataFrame.
    """
    tagged_frames = []
    for records, period_type in ((annual, "Annual"), (quarterly, "Quarter")):
        frame = pd.DataFrame(records).transpose()
        frame["Annual/Quarter"] = period_type
        tagged_frames.append(frame)

    stacked = pd.concat(tagged_frames, axis=0)
    stacked["stockName"] = stockName
    # The period labels live in the index; turn them into a regular column.
    stacked = stacked.reset_index()
    stacked.columns = [
        change_column_names(label, characters_to_replace)
        for label in list(stacked.columns)
    ]
    return stacked.rename(columns={'index': 'Year_Date'})

### Preventing Duplication

In [5]:
def check_if_data_already_exists_in_dataframe(existing_df,new_df,tableName):
    """Combine stored and freshly scraped rows, keeping one row per stock and period.

    Both frames are padded to the same set of columns (missing ones filled
    with ``pd.NA``), stacked with ``existing_df`` first, and de-duplicated on
    ``Year_Date`` within each ``stockName``. Because the first occurrence is
    kept, a row already present in ``existing_df`` wins over a new one.

    The input frames are not modified. Output columns follow a stable order:
    the columns of ``existing_df`` followed by any columns only ``new_df`` has.

    Args:
        existing_df: Rows already stored.
        new_df: Newly collected rows.
        tableName: Unused; kept for call compatibility.

    Returns:
        The de-duplicated DataFrame, grouped by stock in order of first
        appearance. An empty DataFrame is returned when there are no rows.
    """
    frames = [existing_df, new_df]
    # dict.fromkeys keeps first-seen order, unlike a set, so the column
    # layout is the same on every run.
    all_columns = list(dict.fromkeys(col for df in frames for col in df.columns))

    aligned = []
    for df in frames:
        padded = df.copy()  # never add columns to the caller's frame
        for col in all_columns:
            if col not in padded.columns:
                padded[col] = pd.NA
        aligned.append(padded[all_columns])

    combined_df = pd.concat(aligned, ignore_index=True)

    # Collect the per-stock pieces and concatenate once instead of growing a
    # frame inside the loop.
    per_stock = [
        combined_df[combined_df["stockName"] == stock].drop_duplicates(subset=['Year_Date'])
        for stock in combined_df['stockName'].unique()
    ]
    if not per_stock:
        return pd.DataFrame()
    return pd.concat(per_stock, ignore_index=True)

### Merge DataFrames

In [6]:
def merge_dataframes(dataframes,existing_df,tableName,flag):
    """Stack per-stock frames into one table, optionally merging stored rows.

    Every frame is padded to the union of all columns (missing ones filled
    with ``pd.NA``) and the frames are concatenated. Neither the list nor the
    frames passed in are modified. Columns appear in first-seen order.

    Args:
        dataframes: List of DataFrames to stack; may be empty.
        existing_df: Rows already stored; only used when ``flag`` is true.
        tableName: Forwarded to the de-duplication helper.
        flag: When true, the stacked result is merged with ``existing_df``
            via ``check_if_data_already_exists_in_dataframe``.

    Returns:
        The combined DataFrame (an empty DataFrame when ``dataframes`` is
        empty and ``flag`` is false).
    """
    # First-seen order instead of set order keeps the layout reproducible.
    all_columns = list(dict.fromkeys(col for df in dataframes for col in df.columns))

    aligned = []
    for df in dataframes:
        padded = df.copy()  # leave the caller's frame as it was
        for col in all_columns:
            if col not in padded.columns:
                padded[col] = pd.NA
        aligned.append(padded[all_columns])

    # pd.concat raises on an empty list, e.g. when every download failed.
    combined_df = pd.concat(aligned, ignore_index=True) if aligned else pd.DataFrame()
    if flag:
        combined_df = check_if_data_already_exists_in_dataframe(existing_df,combined_df,tableName)
    return combined_df

### Store Data

In [7]:
def get_tables_from_database():
    """Return the table names reported by an inspector on the module-level ``engine``."""
    return inspect(engine).get_table_names()

In [8]:
def store_data(df,tableName,flag):
    """Write ``df`` to the database table ``tableName``, replacing its contents.

    Args:
        df: DataFrame to store; its index is not written.
        tableName: Name of the destination table.
        flag: Accepted for call compatibility. Both the "table exists" and
            "table missing" cases perform the same replace, so the value
            does not change what is written.
    """
    df.to_sql(tableName, con=engine, if_exists='replace', index=False)
        

### Collect Data

In [9]:
from tqdm import tqdm
# Page link and display name of every stock to scrape.
livemintDetails = pd.read_csv("LiveMint.csv")[["stockLink","stockName"]]
stockLinks = list(livemintDetails["stockLink"])
stockNames = list(livemintDetails["stockName"])

db_tables = get_tables_from_database()

# Per-stock frames, collected per statement type and merged after the loop.
df_dict = {"cashFlowDf":[],"balanceSheetDf":[],"incomeSheetDf":[]}
failed_stocks = []
for i in tqdm(range(0,len(stockNames))):
    try:
        stockFinancialData = get_data(stockLinks[i])
        annualCashFlow,annualBalance,annualIncome,quarterlyCashFlow,quarterlyBalance,quarterlyIncome = data_structuring(stockFinancialData)
        cashFlowDf = generate_dataframes(annualCashFlow,quarterlyCashFlow,stockNames[i])
        balanceSheetDf = generate_dataframes(annualBalance,quarterlyBalance,stockNames[i])
        incomeSheetDf = generate_dataframes(annualIncome,quarterlyIncome,stockNames[i])
        df_dict["cashFlowDf"].append(cashFlowDf)
        df_dict["balanceSheetDf"].append(balanceSheetDf)
        df_dict["incomeSheetDf"].append(incomeSheetDf)
    except Exception as e:
        # Keep going, but remember what was skipped instead of hiding it.
        failed_stocks.append((stockNames[i], repr(e)))

if failed_stocks:
    print("Skipped {} of {} stocks; first failures: {}".format(
        len(failed_stocks), len(stockNames), failed_stocks[:5]))

# (database table, key in df_dict) for each statement type.
table_sources = (
    ("cashflow", "cashFlowDf"),
    ("balancesheet", "balanceSheetDf"),
    ("incomesheet", "incomeSheetDf"),
)
# Check each table on its own: the database may hold other tables while one
# of these three is still missing, in which case reading it would fail.
table_exists = {tableName: tableName in db_tables for tableName, _ in table_sources}
flag = any(table_exists.values())

merged_frames = {}
for tableName, source_key in table_sources:
    if table_exists[tableName]:
        existing_rows = pd.read_sql('SELECT * FROM {}'.format(tableName), con=engine)
        merged_frames[tableName] = merge_dataframes(df_dict[source_key],existing_rows,tableName,True)
    else:
        merged_frames[tableName] = merge_dataframes(df_dict[source_key],[],tableName,False)

cashFlowDf = merged_frames["cashflow"]
balanceSheetDf = merged_frames["balancesheet"]
incomeSheetDf = merged_frames["incomesheet"]

# Write only after every merge has succeeded.
store_data(cashFlowDf,"cashflow",table_exists["cashflow"])
store_data(balanceSheetDf,"balancesheet",table_exists["balancesheet"])
store_data(incomeSheetDf,"incomesheet",table_exists["incomesheet"])


  1%|▏         | 27/1862 [00:42<47:37,  1.56s/it]  

Skipping this year for stock 'CAS'


 25%|██▍       | 461/1862 [12:17<39:35,  1.70s/it]  

Skipping this year for stock 'CAS'


 34%|███▎      | 624/1862 [15:50<32:01,  1.55s/it]

Skipping this year for stock 'CAS'


 34%|███▎      | 627/1862 [15:55<32:44,  1.59s/it]

Skipping this year for stock 'CAS'


 43%|████▎     | 801/1862 [19:56<23:23,  1.32s/it]  

Skipping this year for stock 'CAS'


 46%|████▌     | 849/1862 [20:57<21:56,  1.30s/it]

Skipping this year for stock 'CAS'


 63%|██████▎   | 1164/1862 [28:04<15:30,  1.33s/it]

Skipping this year for stock 'BAL'


 63%|██████▎   | 1167/1862 [28:07<14:06,  1.22s/it]

Skipping this year for stock 'CAS'


 65%|██████▍   | 1205/1862 [28:56<13:44,  1.25s/it]

Skipping this year for stock 'CAS'


 70%|██████▉   | 1303/1862 [31:09<12:58,  1.39s/it]

Skipping this year for stock 'CAS'


 71%|███████   | 1318/1862 [31:28<10:40,  1.18s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 72%|███████▏  | 1335/1862 [31:47<09:43,  1.11s/it]

Skipping this year for stock 'CAS'


 73%|███████▎  | 1362/1862 [32:24<10:35,  1.27s/it]

Skipping this year for stock 'CAS'


 73%|███████▎  | 1364/1862 [32:29<18:07,  2.18s/it]

Skipping this year for stock 'CAS'


 74%|███████▍  | 1374/1862 [32:42<11:20,  1.39s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 76%|███████▋  | 1420/1862 [33:51<10:11,  1.38s/it]

Skipping this year for stock 'CAS'


 77%|███████▋  | 1429/1862 [34:09<18:37,  2.58s/it]

Skipping this year for stock 'CAS'


 81%|████████  | 1509/1862 [36:17<13:52,  2.36s/it]

Skipping this year for stock 'CAS'


 82%|████████▏ | 1524/1862 [36:44<08:40,  1.54s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 83%|████████▎ | 1545/1862 [37:16<10:41,  2.02s/it]

Skipping this year for stock 'CAS'


 84%|████████▎ | 1559/1862 [37:38<09:48,  1.94s/it]

Skipping this year for stock 'CAS'


 85%|████████▌ | 1584/1862 [38:18<09:47,  2.11s/it]

Skipping this year for stock 'CAS'


 85%|████████▌ | 1588/1862 [38:23<06:07,  1.34s/it]

Skipping this year for stock 'BAL'


 85%|████████▌ | 1591/1862 [38:28<08:16,  1.83s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 86%|████████▌ | 1595/1862 [38:41<12:47,  2.87s/it]

Skipping this year for stock 'CAS'


 86%|████████▌ | 1604/1862 [38:57<07:57,  1.85s/it]

Skipping this year for stock 'CAS'


 86%|████████▋ | 1610/1862 [39:13<09:04,  2.16s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 87%|████████▋ | 1617/1862 [39:29<12:38,  3.10s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 87%|████████▋ | 1618/1862 [39:31<10:15,  2.52s/it]

Skipping this year for stock 'CAS'


 88%|████████▊ | 1631/1862 [40:03<05:56,  1.54s/it]

Skipping this year for stock 'CAS'


 88%|████████▊ | 1637/1862 [40:12<05:40,  1.51s/it]

Skipping this year for stock 'CAS'


 90%|█████████ | 1676/1862 [41:05<04:01,  1.30s/it]

Skipping this year for stock 'CAS'


 90%|█████████ | 1677/1862 [41:06<03:53,  1.26s/it]

Skipping this year for stock 'CAS'


 91%|█████████ | 1691/1862 [41:26<04:29,  1.57s/it]

Skipping this year for stock 'CAS'


 92%|█████████▏| 1706/1862 [41:44<03:16,  1.26s/it]

Skipping this year for stock 'CAS'


 92%|█████████▏| 1714/1862 [42:02<05:06,  2.07s/it]

Skipping this year for stock 'BAL'


 93%|█████████▎| 1732/1862 [42:28<03:32,  1.63s/it]

Skipping this year for stock 'CAS'


 94%|█████████▍| 1751/1862 [42:58<02:49,  1.52s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 95%|█████████▍| 1768/1862 [43:17<01:59,  1.28s/it]

Skipping this year for stock 'CAS'


 97%|█████████▋| 1800/1862 [44:00<01:39,  1.61s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'


 97%|█████████▋| 1808/1862 [44:10<01:06,  1.23s/it]

Skipping this year for stock 'CAS'


 98%|█████████▊| 1831/1862 [44:39<00:39,  1.26s/it]

Skipping this year for stock 'CAS'


 99%|█████████▉| 1840/1862 [44:52<00:35,  1.63s/it]

Skipping this year for stock 'CAS'


 99%|█████████▉| 1848/1862 [45:03<00:16,  1.21s/it]

Skipping this year for stock 'CAS'


100%|█████████▉| 1854/1862 [45:15<00:14,  1.77s/it]

Skipping this year for stock 'BAL'


100%|██████████| 1862/1862 [45:30<00:00,  1.47s/it]

Skipping this year for stock 'CAS'
Skipping this year for stock 'CAS'





In [153]:



# Function to replace characters in a string


# Replace characters in each string in the list



In [107]:
# Preview the first rows of the link/name table loaded from LiveMint.csv.
livemintDetails.head()

Unnamed: 0,stockLink,stockName
0,https://www.livemint.com/market/market-stats/s...,State Bank Of India
1,https://www.livemint.com/market/market-stats/s...,ICICI Prudential Life Insurance Company
2,https://www.livemint.com/market/market-stats/s...,Reliance Industries
3,https://www.livemint.com/market/market-stats/s...,ICICI Bank
4,https://www.livemint.com/market/market-stats/s...,HDFC Bank


### Get Link and Names of Stocks

In [13]:
!pip install html5lib



In [10]:
# Disabled scraper that builds LiveMint.csv, the file read by the
# "Collect Data" cell. For each numeric id in the range it requests a stock
# page and records the canonical link, the name from the <h1> inside the
# "stockName" div, and a symbol taken from the first <script> in <head>.
# Uncomment the whole cell to regenerate the CSV.
# NOTE(review): the range starts at 3016 and the name is trimmed by 12
# characters; the reason for both values is not stated here — confirm.

# import requests 
# from bs4 import BeautifulSoup 
# import pandas as pd

# liveMintStockDetails = {"stockLink":[],"stockName":[],"stockSymbol":[]}
# for i in range(3016,5000):
#     try:
#         print(f'\rProgress: {i}/{5000}', end='', flush=True)
#         URL = "https://www.livemint.com/market/market-stats/stocks-bajaj-finance-share-price-nse-bse-s000{}".format(i) 
#         r = requests.get(URL) 
#         soup = BeautifulSoup(r.content, 'html5lib')
#         stockName_div = str(soup.find('div', attrs = {'class':'stockName'}).find('h1'))
#         stockLink_div = str(soup.find('link', attrs = {'rel':'canonical'}))
#         sub1 = "<h1>"
#         sub2 = "</h1>"
#         idx1 = stockName_div.find(sub1)
#         idx2 = stockName_div.find(sub2)
#         stockName = stockName_div[idx1 + len(sub1): idx2][:-12]
#         # print(i,stockName)
#         liveMintStockDetails["stockName"].append(stockName)
#         sub1 = '<link href="'
#         sub2 = 'rel="canonical"/>'
#         idx1 = stockLink_div.find(sub1)
#         idx2 = stockLink_div.find(sub2)
#         stockLink = stockLink_div[idx1 + len(sub1): idx2][:-2]
#         liveMintStockDetails["stockLink"].append(stockLink)
#         head = soup.head.find_all('script')[0]
#         modified_string = str(head).split(":")[-1].split("}")[0].replace('"',"")
#         liveMintStockDetails["stockSymbol"].append(modified_string)
#     except Exception as e:
#         print("Error")
# liveMintDf = pd.DataFrame(liveMintStockDetails)
# liveMintDf.to_csv("LiveMint.csv")

In [31]:
# Display the scraped link/name/symbol table.
# NOTE(review): `liveMintDf` is only assigned inside the scraper cell above,
# which is fully commented out, so this cell fails on a fresh kernel.
liveMintDf

Unnamed: 0,stockLink,stockName,stockSymbol
0,https://www.livemint.com/market/market-stats/s...,State Bank Of India,SBIN
1,https://www.livemint.com/market/market-stats/s...,ICICI Prudential Life Insurance Company,ICICIPRULI
2,https://www.livemint.com/market/market-stats/s...,Reliance Industries,RELIANCE
3,https://www.livemint.com/market/market-stats/s...,ICICI Bank,ICICIBANK
4,https://www.livemint.com/market/market-stats/s...,HDFC Bank,HDFCBANK
5,https://www.livemint.com/market/market-stats/s...,Yes Bank,YESBANK
6,https://www.livemint.com/market/market-stats/s...,Tata Motors,TATAMOTORS
7,https://www.livemint.com/market/market-stats/s...,Indusind Bank,INDUSINDBK
8,https://www.livemint.com/market/market-stats/s...,Indian Railway Catering &amp; Tourism Corporation,IRCTC
9,https://www.livemint.com/market/market-stats/s...,Larsen &amp; Toubro,LT


### Get Board Meeting Dates

In [12]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd
import json

In [13]:
# Fields copied from every board-meeting entry; also the column names of the
# resulting table.
board_meeting_fields = ['tickerId', 'companyName', 'remarks', 'boardMeetDate', 'purpose']
board_meeting_dict = {field: [] for field in board_meeting_fields}

# On the first run the table does not exist yet; start from an empty frame
# instead of failing on the SELECT.
if "boardmeetdate" in get_tables_from_database():
    existing_data = pd.read_sql('SELECT * FROM {}'.format("boardmeetdate"), con=engine)
else:
    existing_data = pd.DataFrame(columns=board_meeting_fields)

failed_company_ids = []
for i in tqdm(range(3016,5000)):
    try:
        URL = "https://www.livemint.com/tata-consultancy-services/board-meetings/companyid-s000{}".format(i)
        # A timeout keeps one stalled request from blocking the whole run.
        r = requests.get(URL, timeout=30)
        soup = BeautifulSoup(r.content, 'html5lib')
        script = soup.find('script', attrs = {'id':'__NEXT_DATA__'})
        line_str = str(script)
        first_line = '<script id="__NEXT_DATA__" type="application/json">'
        second_line = "</script>"
        idx1 = line_str.find(first_line)
        idx2 = line_str.find(second_line)
        result = line_str[idx1 + len(first_line): idx2]
        json_object = json.loads(result)
        page_props = json_object['props']['pageProps']
        board_meetings_schedule = page_props["additionalStockData"]["boardMeetings"]
        # Read every field of every entry before storing anything. Appending
        # field by field would leave the lists with different lengths if one
        # key were missing, and the DataFrame could no longer be built.
        rows = [
            [schedule[field] for field in board_meeting_fields]
            for schedule in board_meetings_schedule
        ]
    except Exception as e:
        failed_company_ids.append(i)
        continue
    for row in rows:
        for field, value in zip(board_meeting_fields, row):
            board_meeting_dict[field].append(value)

if failed_company_ids:
    print("No board-meeting data for {} of {} company ids".format(
        len(failed_company_ids), len(range(3016,5000))))

# Freshly scraped rows come first, so keep="first" prefers them over the
# rows already stored.
combined_df = pd.concat([pd.DataFrame(board_meeting_dict),existing_data], ignore_index=True)
combined_df = combined_df.drop_duplicates(subset=["companyName","boardMeetDate","purpose"],keep="first")
store_data(combined_df,"boardmeetdate",True)

100%|██████████| 1984/1984 [30:38<00:00,  1.08it/s] 


Unnamed: 0,tickerId,companyName,remarks,boardMeetDate,purpose
0,S0003016,State Bank Of India,"Inter alia, to seek approval for raising of Lo...",2024-06-19,Others
1,S0003016,State Bank Of India,,2024-05-09,Audited Results & Final Dividend
2,S0003016,State Bank Of India,,2024-02-03,Quarterly Results
3,S0003016,State Bank Of India,,2023-11-04,Quarterly Results
4,S0003016,State Bank Of India,,2023-08-04,Quarterly Results
...,...,...,...,...,...
78,S0003020,HDFC Bank,,2021-10-16,Audited Results
79,S0003020,HDFC Bank,,2021-07-17,Quarterly Results
80,S0003020,HDFC Bank,,2021-06-18,Dividend
81,S0003020,HDFC Bank,,2021-04-17,Audited & Quarterly Results


In [5]:
# Export the rows collected in `board_meeting_dict` to CSV; the DataFrame
# index is written as the first column.
# NOTE(review): the file name is spelled "diclosure" — confirm whether other
# code expects that spelling before renaming it.
pd.DataFrame(board_meeting_dict).to_csv("diclosure_dates_2.csv")