<a href="https://colab.research.google.com/github/amovar18/machinelearningproject/blob/master/stock_market_clustering_alogrithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys
import sklearn
import pandas as pd
import numpy as np
from pandas_datareader import data
import matplotlib.pyplot as plt
from datetime import datetime

In [0]:
# Instruments to download: display name -> ticker symbol.
companies_dict = {
    "Amazon": "AMZN",
    "Apple": "AAPL",
    "Walgreen": "WBA",
    "Northgrop Gruman": "NOC",
    "Boeing": "BA",
    "Lockheed Martin": "LMT",
    "McDonalds": "MCD",
    "IBM": "IBM",
    "Intel": "INTC",
    "Navistar": "NAV",
    "Texas Instruments": "TXN",
    "Microsoft": "MSFT",
    "Mastercard": "MA",
    "General Electrics": "GE",
    "Pepsi": "PEP",
    "Coca cola": "KO",
    "Sony": "SNE",
    "Misubishi": "MSBHY",
    "Exxon": "XOM",
    "Ford": "F",
    "Valero Energy": "VLO",
    "Bank of America": "BAC",
    "Chevron": "CVX",
    "Jhonson & Jhonson": "JNJ",
    "American Express": "AXP",
}


def _by_ticker(entry):
    """Sort key: the ticker symbol of a (name, ticker) pair."""
    return entry[1]


# List of (name, ticker) pairs ordered alphabetically by ticker symbol.
companies = list(companies_dict.items())
companies.sort(key=_by_ticker)
print(companies)

In [0]:
# Name of the online provider handed to pandas_datareader.
data_source = "yahoo"
# Date window requested from the provider.
start_date = datetime(year=2014, month=12, day=31)
end_date = datetime(year=2017, month=12, day=31)
# Request every ticker of companies_dict in a single call (network access required).
tickers_requested = list(companies_dict.values())
panel_data = data.DataReader(
    tickers_requested,
    data_source,
    start=start_date,
    end=end_date,
)
# Show the axes of the downloaded object so its layout can be inspected.
print(panel_data.axes)

In [0]:
#data for stock movement: closing and opening prices
#The later cells drop the column labels (np.array) and label row i with
#companies[i], so the columns are selected here explicitly in the order of
#`companies` instead of relying on whatever order the reader returned.
#A ticker missing from the download raises a KeyError here rather than
#silently shifting the labels.
#NOTE(review): assumes panel_data['Close'] / ['Open'] have one column per ticker - confirm against panel_data.axes
tickers=[ticker for _,ticker in companies]
stock_close=panel_data['Close'][tickers]
stock_open=panel_data['Open'][tickers]

In [0]:
# Prepare the price data for the daily movement (gain/loss) computation:
# convert both tables to NumPy arrays and transpose them.
# NOTE: this cell overwrites its own inputs, so running it a second time
# transposes the arrays back again.
stock_close = np.array(stock_close).transpose()
stock_open = np.array(stock_open).transpose()
# Dimensions of the transposed close-price array, printed one per line.
row, col = stock_close.shape
print(row, col, sep="\n")

In [0]:
#calculate daily stock movement (close minus open) for every row at once
#A single vectorised subtraction replaces the per-row Python loop; the result
#is forced to float64, matching the np.zeros buffer the loop used to fill.
assert stock_close.shape==stock_open.shape==(row,col),"close/open price arrays must have identical shapes"
movements=np.asarray(np.subtract(stock_close,stock_open),dtype=float)

In [0]:
# Print, for each company, its name and the sum of its row of `movements`.
for idx, (company_name, _ticker) in enumerate(companies):
  total_change = sum(movements[idx])
  print("companies: {},change:{}".format(company_name, total_change))

In [0]:
#visualize stock movement of the first two companies side by side
#The bare `plt.clf` (no parentheses) was a no-op and is dropped; plt.subplots
#creates a fresh figure with a shared y-axis so both series are comparable.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(18,8),sharey=True)
for ax,(name,ticker),series in zip((ax1,ax2),companies[:2],movements[:2]):
  ax.plot(series)
  #companies holds (name, ticker) tuples; format them instead of passing the tuple itself
  ax.set_title("{} ({})".format(name,ticker))
  ax.set_xlabel("day index")
ax1.set_ylabel("close - open")

plt.show()


In [0]:
# Rescale the movement data with scikit-learn's Normalizer.
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
new = normalizer.fit_transform(movements)

# Summary statistics of the normalized array: minimum, maximum, mean.
for statistic in (new.min(), new.max(), new.mean()):
  print(statistic)

In [0]:
#visualize normalized stock movement of the first two companies side by side
#The bare `plt.clf` (no parentheses) was a no-op and is dropped; plt.subplots
#creates a fresh figure with a shared y-axis so both series are comparable.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(18,8),sharey=True)
for ax,(name,ticker),series in zip((ax1,ax2),companies[:2],new[:2]):
  ax.plot(series)
  #companies holds (name, ticker) tuples; format them instead of passing the tuple itself
  ax.set_title("{} ({})".format(name,ticker))
  ax.set_xlabel("day index")
ax1.set_ylabel("normalized close - open")

plt.show()


In [0]:
#import necessary libs
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
#create normalizer
normalizer=Normalizer()
#create kmeans cluster model
#random_state pins the centroid initialisation so a fresh run reproduces the
#same clusters; n_init is spelled out because its default differs between
#scikit-learn releases
kmeans=KMeans(n_clusters=12,max_iter=1000,n_init=10,random_state=0)
#create a pipeline: normalize first, then cluster
pipeline=make_pipeline(normalizer,kmeans)

In [0]:
#fit pipeline to daily stock movements
pipeline.fit(movements)
#read the inertia from the pipeline's own KMeans step (make_pipeline names it
#"kmeans") rather than from the global `kmeans`, which a later cell rebinds
#to a different model
print(pipeline.named_steps["kmeans"].inertia_)

In [0]:
# Assign every row of `movements` to a cluster using the fitted pipeline.
labels = pipeline.predict(movements)
# Pair each label with the company entry at the same position and show the
# table ordered by cluster label.
dataframe = pd.DataFrame(dict(labels=labels, companies=companies))
sorted_by_cluster = dataframe.sort_values(by='labels')
print(sorted_by_cluster)

In [0]:
from sklearn.decomposition import PCA
#project the normalized movements onto their first two principal components
reduced=PCA(n_components=2).fit_transform(new)
#cluster the 2-D projection
#NOTE: this rebinds `kmeans` and `labels`, replacing the pipeline-based values from the earlier cells
#random_state makes the clustering reproducible; n_init is explicit because
#its default differs between scikit-learn releases
kmeans=KMeans(n_clusters=10,n_init=10,random_state=0)
#fit_predict fits the model and returns the label of every fitted sample
labels=kmeans.fit_predict(reduced)

#align labels and companies
new_label=pd.DataFrame({"labels":labels,"companies":companies})
print(kmeans.inertia_)
print(new_label.sort_values('labels'))

In [0]:
#step size of the mesh
h=0.01
#plot limits: range of the PCA-reduced data padded by 1 on every side
xmin,xmax=reduced[:,0].min()-1,reduced[:,0].max()+1
ymin,ymax=reduced[:,1].min()-1,reduced[:,1].max()+1
xx,yy=np.meshgrid(np.arange(xmin,xmax,h),np.arange(ymin,ymax,h))
#obtain a cluster label for every mesh point using the fitted model
z=kmeans.predict(np.c_[xx.ravel(),yy.ravel()])
#reshape the labels back to the mesh so they can be drawn as an image
z=z.reshape(xx.shape)
cmap=plt.cm.Paired
#plot data
#A single explicit figure is created; the former plt.clf() right before
#plt.figure() only produced an extra empty figure.
fig,ax=plt.subplots(figsize=(10,10))
ax.imshow(z,interpolation='nearest',extent=(xx.min(),xx.max(),yy.min(),yy.max()),cmap=cmap,aspect='auto',origin='lower')
ax.plot(reduced[:,0],reduced[:,1],'k.',markersize=5)
#plot centroids
centroids=kmeans.cluster_centers_
ax.scatter(centroids[:,0],centroids[:,1],marker='x',s=169,linewidths=3,color='w',zorder=10)
ax.set_title("Kmeans clustering on Stock market movements,PCA reduced data")
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
ax.set_xlim(xmin,xmax)
ax.set_ylim(ymin,ymax)
plt.show()