# **STOCK MARKET CLUSTERING**

In this project we will extract stock market data from Yahoo Finance. We will find similarities among various companies using their stock market prices and then group them into clusters using the K-means algorithm.

Note that this is an unsupervised machine learning problem, so we will use an unsupervised machine learning technique: the K-means algorithm.

NB: `pandas_datareader` extracts data from various internet sources into a DataFrame. Currently the following sources are supported:


*   Yahoo! Finance
*   Google Finance
*  St. Louis FED (FRED)
* Kenneth French's data library
* World Bank
* Google Analytics



In [None]:
!pip install pandas_datareader

In [None]:
!pip install plotly

In [None]:
from pandas_datareader import data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import warnings
# Silence every warning for the rest of the session so that library
# deprecation/future warnings do not clutter the notebook output.
warnings.simplefilter('ignore')

In [None]:
# Display name -> ticker symbol for every company in the study.
# The keys are used as plot titles and as the company names in the result
# tables; the values are the symbols requested from Yahoo Finance.
companies_dict = {
    'Amazon': 'AMZN',
    'Apple': 'AAPL',
    'Walgreen': 'WBA',
    'Northrop Grumman': 'NOC',
    'Boeing': 'BA',
    'Lockheed Martin': 'LMT',
    'McDonalds': 'MCD',
    'Intel': 'INTC',
    'Raytheon': 'RTX',
    'IBM': 'IBM',
    'Texas Instruments': 'TXN',
    'MasterCard': 'MA',
    'Microsoft': 'MSFT',
    'General Electric': 'GE',
    'Tesla': 'TSLA',
    'American Express': 'AXP',
    'Pepsi': 'PEP',
    'Coca Cola': 'KO',
    'Johnson & Johnson': 'JNJ',
    'Toyota': 'TM',
    'Honda': 'HMC',
    'L3Harris': 'LHX',
    'General Dynamics': 'GD',
    'Exxon': 'XOM',
    'Chevron': 'CVX',
    'Valero Energy': 'VLO',
    'Ford': 'F',
    'Bank of America': 'BAC',
}

In [None]:
!pip install yfinance

In [None]:
# Download daily price data from Yahoo Finance for every ticker in
# companies_dict, for the window between startdate and enddate.
from pandas_datareader import data as pdr  # kept so the name `pdr` stays available
import yfinance as yf
from datetime import datetime

startdate = datetime(2018, 7, 1)
enddate = datetime(2023, 7, 1)

# Call yfinance directly instead of patching pandas_datareader through
# yf.pdr_override(): the override only redirected pdr.get_data_yahoo to
# yf.download, and it is deprecated upstream (NOTE(review): believed to be
# absent from recent yfinance releases - confirm against the installed version).
# auto_adjust is pinned so 'Open'/'Close' stay the raw quoted prices whatever
# default the installed yfinance version uses.
df = yf.download(
    list(companies_dict.values()),
    start=startdate,
    end=enddate,
    auto_adjust=False,
)

In [None]:
# Show the downloaded price table.
df

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Total number of missing values across the whole table.
df.isnull().sum().sum()

In [None]:
# Opening and closing prices as plain arrays, transposed so that each row
# corresponds to one column (ticker) of the price table.
stock_open = np.transpose(np.array(df['Open']))
stock_close = np.transpose(np.array(df['Close']))

In [None]:
## Daily movement = closing price minus opening price, for every company and day.
## A positive value means the stock closed above its open; a negative value, below.
movements = np.subtract(stock_close, stock_open)

In [None]:
## Net movement per company: the daily close-minus-open values summed along each row
sum_of_movement = movements.sum(axis=1)

In [None]:
# Print the summed movement next to the ticker that heads the matching column.
for ticker, total_change in zip(df['High'].columns, sum_of_movement):
    print("Company:{}, Change:{}".format(ticker, total_change))

In [None]:
# Opening-price history of Amazon and Apple, side by side in one figure.
plt.figure(figsize=(25, 10))

for position, (company, ticker) in enumerate([("Amazon", "AMZN"), ("Apple", "AAPL")], start=1):
    plt.subplot(1, 2, position)
    plt.title(company, fontsize=20)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=20)
    plt.xlabel('Date', fontsize=15)
    plt.ylabel('Opening Price', fontsize=15)
    plt.plot(df['Open'][ticker])



In [None]:
## Opening-price trend of every company, one figure per company
opening_prices = df['Open']
for company, ticker in companies_dict.items():
    plt.figure(figsize=(25, 10))
    plt.title(company)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=20)
    plt.xlabel('Date', fontsize=15)
    plt.ylabel('Opening Price', fontsize=15)
    plt.plot(opening_prices[ticker])

In [None]:
# Overlay each company's opening and closing price series in a single figure.
for company, ticker in companies_dict.items():
    plt.figure(figsize=(20, 10))
    plt.title(company + " Opening vs Closing Price")
    plt.xlabel('Date')
    plt.ylabel('Price')
    # The price-table column name doubles as the legend label.
    for price_kind in ('Open', 'Close'):
        plt.plot(df[price_kind][ticker], label=price_kind)
    plt.legend(loc='upper left')

In [None]:
# Close-minus-open movement kept as a DataFrame, so rows and columns keep
# the labels of the price table.
movements_1 = df['Close'].sub(df['Open'])

In [None]:
# Show the close-minus-open movement table.
movements_1

In [None]:
# Close-minus-open movement over time, one figure per company.
for company, ticker in companies_dict.items():
    plt.figure(figsize=(20, 18))
    plt.title(company + " Movement")
    plt.xlabel('Date')
    plt.ylabel('Movement')
    plt.plot(movements_1[ticker])

In [None]:
## Traded volume over time, one figure per company
volumes = df['Volume']
for company, ticker in companies_dict.items():
    plt.figure(figsize=(20, 10))
    plt.title(company)
    plt.xlabel("Date")
    plt.ylabel("Volume")
    plt.plot(volumes[ticker])

## Normalization
The daily price movements are on very different scales from one company to another, so we need to normalize the data to make the companies comparable in further analysis.

In [None]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
# Normalizer rescales every *row* (sample) to unit norm. movements_1 holds one
# row per date and one column per company, so feeding it in directly would
# normalize each day across companies and leave the per-company scale
# differences untouched. Transpose first so that each company's series is the
# sample being scaled (the same orientation the clustering pipeline receives
# via `movements`), then transpose back so the result keeps the
# dates x companies layout of movements_1.
norm_movements = normalizer.fit_transform(movements_1.T).T

In [None]:
# Show the normalized movement values.
norm_movements

In [None]:
# Wrap the normalized values in a DataFrame that reuses the row and column
# labels of movements_1.
norm_movements_df = pd.DataFrame(
    data=norm_movements,
    index=movements_1.index,
    columns=movements_1.columns,
)

In [None]:
# Show the labelled table of normalized movements.
norm_movements_df

In [None]:
## Normalized movement over time, one figure per company
for company, ticker in companies_dict.items():
    plt.figure(figsize=(20, 18))
    plt.title(company + " Movement")
    plt.xlabel('Date')
    plt.ylabel('Movement')
    plt.plot(norm_movements_df[ticker])

## Making a Pipeline

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

In [None]:
# Scaler used as the first step of the pipeline built below.
normalizer=Normalizer()

In [None]:
# K-means model with 5 clusters; random_state is fixed so that repeated runs
# start from the same initialization.
kmeans=KMeans(n_clusters=5, max_iter=1000, random_state=42)

In [None]:
## Making a pipeline combining our normalizer and KMeans
pipeline=make_pipeline(normalizer,kmeans)

In [None]:
## Fit Pipeline to daily stock movements created earlier
pipeline.fit(movements)

In [None]:
# Cluster label for each row of `movements`.
predictions=pipeline.predict(movements)

In [None]:
# Show the predicted cluster labels.
predictions

The companies and their respective clusters are displayed below.

In [None]:
# The rows of `movements` follow the ticker order of the downloaded price
# table (df['Open'].columns), which need not match the insertion order of
# companies_dict. Look every ticker up by name so that each cluster label is
# paired with the company it was actually predicted for; a ticker without a
# known display name falls back to the ticker itself.
ticker_to_company = {ticker: company for company, ticker in companies_dict.items()}
row_companies = [ticker_to_company.get(ticker, ticker) for ticker in df['Open'].columns]
df_1 = pd.DataFrame({"Cluster": predictions, 'Companies': row_companies})

In [None]:
# Show the cluster assigned to each company.
df_1

## The Elbow Method

In [None]:
# Normalized copy of the movement matrix, used as input for the elbow analysis.
norm_data = normalizer.fit(movements).transform(movements)

In [None]:
# Fit K-means for k = 1..10 and record the inertia of each fitted model.
lst = []
for k in range(1, 11):
    kmodel = KMeans(n_clusters=k, n_init=15, max_iter=500).fit(norm_data)
    lst.append(kmodel.inertia_)

In [None]:
# Show the inertia recorded for each number of clusters.
lst

In [None]:
# Elbow curve: inertia against the number of clusters tried (1 through 10).
cluster_counts = range(1, 11)
plt.plot(cluster_counts, lst, marker='o')

The elbow plot suggests that 5 clusters is the best choice here, so we don't need to change our model.

In [None]:
# Companies assigned to cluster 0.
df_1[df_1['Cluster']==0]

In [None]:
# Companies assigned to cluster 1.
df_1[df_1['Cluster']==1]

In [None]:
# Companies assigned to cluster 2.
df_1[df_1['Cluster']==2]

In [None]:
# Companies assigned to cluster 3.
df_1[df_1['Cluster']==3]

In [None]:
# Companies assigned to cluster 4.
df_1[df_1['Cluster']==4]

## PCA

In [None]:
from sklearn.decomposition import PCA
# Define a normalizer
normalizer = Normalizer()
# PCA step that keeps 2 components (note: despite its name, `reduced_data`
# holds the PCA estimator here, not transformed data)
reduced_data = PCA(n_components=2)
# Create Kmeans model
kmeans = KMeans(n_clusters=5, max_iter=1000)
# Make a pipeline chaining normalizer, pca and kmeans
pipeline = make_pipeline(normalizer, reduced_data, kmeans)
# Fit pipeline to daily stock movements
pipeline.fit(movements)
# Prediction: one cluster label per row of `movements`
predictions = pipeline.predict(movements)
# The rows of `movements` follow the ticker order of the downloaded price
# table (df['Open'].columns), which need not match the insertion order of
# companies_dict. Map each ticker back to its display name so that every label
# is paired with the company it was actually predicted for; a ticker without a
# known display name falls back to the ticker itself.
ticker_to_company = {ticker: company for company, ticker in companies_dict.items()}
row_companies = [ticker_to_company.get(ticker, ticker) for ticker in df['Open'].columns]
# Create dataframe to store companies and predicted labels, ordered by label
df2 = pd.DataFrame({'labels': predictions, 'companies': row_companies}).sort_values(by=['labels'], axis=0)

In [None]:
# Show the companies with their predicted labels, sorted by label.
df2

In [None]:
# Plot the K-means decision regions in the 2-D PCA space together with the
# projected companies and the cluster centroids.
# Assumes `pipeline` is the fitted normalizer -> PCA -> KMeans pipeline and
# `kmeans` is its final step, as built in the PCA cell.

# Project the movements with the SAME fitted normalizer and PCA steps that feed
# `kmeans`. Fitting a second, independent PCA could place the points in a
# slightly different coordinate system from the one the centroids and decision
# regions live in.
reduced_data = pipeline[:-1].transform(movements)
# Define step size of mesh
h = 0.01
# Bounds of the mesh: the data range padded by 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in the mesh using our trained model, then
# reshape them back onto the grid for the color plot
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Define color plot
cmap = plt.cm.Paired
# Plotting figure (created directly; clearing beforehand would only emit an
# extra empty figure)
plt.figure(figsize=(10, 10))
plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()), cmap=cmap, aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5)
# Plot the centroid of each cluster as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)
plt.title('K-Means clustering on stock market movements (PCA-Reduced data)')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()