<a href="https://colab.research.google.com/github/Zozz98/Finance-Economics/blob/main/sandbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import yfinance as yf
import pandas_datareader as pdr
import datetime as dt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# GET DATA FROM WIKIPEDIA

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp500_list = pd.read_html(url)
sp500_list = sp500_list[0]
sp500_list = sp500_list.drop(columns=['GICS Sub-Industry','Headquarters Location', 'Date added','CIK','Founded'], axis=1)
sp500_list.rename(columns={'GICS Sector':'Sector'},inplace=True)

# LOAD CLOSE PRICES FROM WIKIPEDIA DATA

In [3]:
end = dt.datetime.now()
start = end - dt.timedelta(days=252)

ticker_list = [i for i in sp500_list['Symbol']]

for i in range(len(ticker_list)):
    if ticker_list[i] == 'BRK.B':
        ticker_list[i] = 'BRK-B'
    elif ticker_list[i] == 'BF.B':
        ticker_list[i] = 'BF-B'

In [4]:
stocks = yf.download(ticker_list, start=start, end=end)['Close']

[*********************100%%**********************]  503 of 503 completed


# LOG RETURN AND SET COLUMNS

In [5]:
log_return = np.log(stocks / stocks.shift(1)).replace(np.nan, 0)

annual_return = log_return.mean() * 252
annual_volatility = log_return.std() * np.sqrt(252)

stock_data = pd.DataFrame()
stock_data['Symbol'] = ticker_list
stock_data['Security'] = sp500_list['Security']
stock_data['Sector'] = sp500_list['Sector']
stock_data['Return'] = list(round(annual_return, 3))
stock_data['Volatility'] = list(round(annual_volatility, 3))
stock_data['Ratio'] = list(round((annual_return / annual_volatility), 3))

# VOLATILITY-RETURN SCATTER PLOT

In [6]:
px.scatter(stock_data,
           x='Volatility',
           y='Return',
           hover_data=['Sector'],
           hover_name='Symbol',
           title='S&P 500 stocks by Volatility/Return',
           color='Ratio',
           width=750,
           height=500,
           template='plotly_dark'
           ).update_layout(title_x=0.5)

# FIND NUMBER OF CLUSTERS WITH ELBOW-METHOD

In [39]:
X = stock_data[['Return', 'Volatility']]
inertia_list = []
for k in range(1,16):
    kmeans = KMeans(n_clusters=k, n_init='auto')
    kmeans.fit(X)
    inertia_list.append(kmeans.inertia_)

px.line(inertia_list,
        title='Elbow Curve',
        width=750,
        height=500,
        labels={
            "index":"Number of Clusters",
            "value":"Sum of Squared Error"
        },
        template='plotly_dark'
        ).update_layout(showlegend=False, title_x=0.5)

# KMEAN MODEL FIT

In [43]:
kmeans = KMeans(n_clusters=4, n_init='auto').fit(X)
labels = kmeans.labels_
stock_data['Cluster Label'] = labels

# PLOT STOCKS BY CLUSTERS

In [44]:
px.scatter(X,
           x='Volatility',
           y='Return',
           color=labels,
           title='KMeans Clusters',
           hover_name=stock_data['Symbol'],
           width=750,
           height=500,
           template='plotly_dark'
           ).update_layout(title_x=0.5)

In [41]:
silhouette_score(X, labels)

0.4254472495149587

In [46]:
silhouette = []

for k in range(2,16):
    kmeans = KMeans(n_clusters=k, n_init='auto')
    kmeans.fit(X)
    silhouette.append(silhouette_score(X, kmeans.labels_))

In [47]:
px.line(silhouette,
        title='Silhouette Curve',
        width=750,
        height=500,
        labels={
            "index":"Number of Clusters",
            "value":"Sum of Squared Error"
        },
        template='plotly_dark'
        ).update_layout(showlegend=False, title_x=0.5)

In [None]:
dividends = []

for i in ticker_list:
    info = yf.Ticker(i).info
    div = info.get('trailingAnnualDividendYield')
    dividends.append(div)

In [None]:
for (i,j) in zip(ticker_list, dividends):
    print(f'Ticker: {i}, DividendRate: {j}')

Ticker: MMM, DividendRate: 0.05918448
Ticker: AOS, DividendRate: 0.01756998
Ticker: ABT, DividendRate: 0.01980198
Ticker: ABBV, DividendRate: 0.038001817
Ticker: ACN, DividendRate: 0.014182601
Ticker: ATVI, DividendRate: 0.021542814
Ticker: ADM, DividendRate: 0.021467356
Ticker: ADBE, DividendRate: 0.0
Ticker: ADP, DividendRate: 0.019370753
Ticker: AES, DividendRate: 0.038407497
Ticker: AFL, DividendRate: 0.021320853
Ticker: A, DividendRate: 0.007853403
Ticker: ABNB, DividendRate: 0.0
Ticker: APD, DividendRate: 0.02250192
Ticker: AKAM, DividendRate: 0.0
Ticker: ALK, DividendRate: 0.0
Ticker: ALB, DividendRate: 0.008754543
Ticker: ARE, DividendRate: 0.04309117
Ticker: ALGN, DividendRate: 0.0
Ticker: ALLE, DividendRate: 0.01635292
Ticker: LNT, DividendRate: 0.033722937
Ticker: ALL, DividendRate: 0.03134006
Ticker: GOOGL, DividendRate: 0.0
Ticker: GOOG, DividendRate: 0.0
Ticker: MO, DividendRate: 0.086735874
Ticker: AMZN, DividendRate: 0.0
Ticker: AMCR, DividendRate: 0.05280173
Ticker: AM

In [None]:
stock_data['Dividend'] = dividends

In [None]:
no_dividend_shares = stock_data.loc[stock_data['Dividend'] == 0].count()
dividend_shares = stock_data.loc[stock_data['Dividend'] > 0].count()

print(f'Number of shares does not have Dividend: {no_dividend_shares[1]}')
print(f'Number of shares does have Dividend: {dividend_shares[1]}')

Number of shares does not have Dividend: 102
Number of shares does have Dividend: 401


In [None]:
stock_data_dividend = stock_data.loc[stock_data['Dividend'] > 0]

In [None]:
px.scatter(stock_data_dividend,
           x='Volatility',
           y='Return',
           hover_data=['Sector'],
           hover_name='Symbol',
           title='S&P 500 dividend stocks by Volatility/Return',
           color='Ratio',
           width=750,
           height=500,
           template='plotly_dark'
           ).update_layout(title_x=0.5)