In [43]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.express as px
import sqlalchemy
from sqlalchemy import create_engine, func
from config import db_password

In [62]:
# Create engine and import SQL database

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/group_project"

engine = create_engine(db_string) 

df = pd.read_sql('provisional_database', con = engine)

df


Unnamed: 0,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote
0,2021,WMT,Walmart Inc.,Retail Trade,2300000,660520661,2.871829e+02,88360892,3.841778e+01,825859804,3.590695e+02,172196312,7.486796e+01,743101160,3.230875e+02,0
1,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000,980173304,7.551412e+02,705567297,5.435804e+02,894557137,6.891812e+02,77381297,5.961579e+01,82905570,6.387178e+01,1
2,2021,ACN,Accenture plc,Technology Services,624000,11874548,1.902972e+01,135634647,2.173632e+02,906664141,1.452987e+03,290468786,4.654948e+02,577047230,9.247552e+02,0
3,2021,UPS,"United Parcel Service, Inc.",Transportation,543000,471977569,8.692036e+02,267986240,4.935290e+02,602474640,1.109530e+03,174914891,3.221269e+02,209622062,3.860443e+02,0
4,2021,HD,"Home Depot, Inc. (The)",Retail Trade,504800,17804705,3.527081e+01,263857298,5.226967e+02,998245511,1.977507e+03,42049250,8.329883e+01,116114910,2.300216e+02,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,2021,FRT,Federal Realty Investment Trust,Finance,307,276298835,8.999962e+05,380831822,1.240495e+06,89733225,2.922906e+05,836538368,2.724881e+06,532057165,1.733085e+06,1
494,2021,PEAK,"Healthpeak Properties, Inc.",Finance,217,103844190,4.785447e+05,891775038,4.109562e+06,484213025,2.231396e+06,669440056,3.084977e+06,683590735,3.150188e+06,1
495,2021,O,Realty Income Corporation,Finance,210,717209057,3.415281e+06,59652794,2.840609e+05,544687195,2.593749e+06,104288784,4.966133e+05,932769499,4.441760e+06,0
496,2021,HST,Host Hotels,Finance,163,54576295,3.348239e+05,967390772,5.934913e+06,70187780,4.305999e+05,424928860,2.606926e+06,781929374,4.797113e+06,0


In [63]:
# Create X

X =  df.drop(columns = ["Year", "Ticker", "Company", "Sector", "netIncome", "grossProfit", "operatingIncome", "totalRevenue", "totalOperatingExpenses"])

print(X.shape)


(498, 7)


In [64]:
# Standardize the data

X_scaled = StandardScaler().fit_transform(X)

X_scaled[:5]


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name']. An error will be raised in 1.2.


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name']. An error will be raised in 1.2.



array([[16.12298173, -0.31749945, -0.23353273, -0.2138882 , -0.28817935,
        -0.18000532, -0.93390602],
       [ 8.92381009, -0.31564317, -0.23216682, -0.21307349, -0.28822881,
        -0.18043532,  1.07077156],
       [ 4.08125351, -0.31856315, -0.23304888, -0.21118844, -0.28691263,
        -0.17900725, -0.93390602],
       [ 3.49928454, -0.31519071, -0.23230215, -0.21203608, -0.28737754,
        -0.17990089, -0.93390602],
       [ 3.22482511, -0.31849872, -0.23222329, -0.20989393, -0.28815201,
        -0.1801597 , -0.93390602]])

In [65]:
# Creating PCA Model

pca = PCA(n_components = 4, random_state = 0)

X_pca = pca.fit_transform(X_scaled)

X_pca

array([[ -1.83997778,   0.49335949,  15.6225594 ,  -3.33659236],
       [ -1.31227766,   1.80543219,   8.56217601,  -1.47468378],
       [ -0.8448756 ,  -0.58224273,   3.97904635,  -0.93484705],
       ...,
       [ 12.36920851,  -1.41808238,   0.7087327 ,   4.0899682 ],
       [ 14.78363687,   1.73989261,  -0.66731103, -10.05632994],
       [ 23.5946911 ,  -2.68516366,   5.05479476,  12.85274801]])

In [66]:
# Creating PCA Dataframe

pca_df = pd.DataFrame(data = X_pca, index = X.index, columns = ["PC1", "PC2", "PC3", "PC4"])

print(pca_df.shape)

pca_df.head(10)

(498, 4)


Unnamed: 0,PC1,PC2,PC3,PC4
0,-1.839978,0.493359,15.622559,-3.336592
1,-1.312278,1.805432,8.562176,-1.474684
2,-0.844876,-0.582243,3.979046,-0.934847
3,-0.796347,-0.633952,3.415711,-0.820133
4,-0.774501,-0.658734,3.15085,-0.764064
5,-0.817147,1.271464,2.774723,-0.283901
6,-0.782811,1.234922,2.386473,-0.200662
7,-0.699925,-0.736079,2.304169,-0.593084
8,-0.68977,-0.741217,2.250744,-0.580422
9,-0.748354,1.204551,2.045411,-0.134625


In [67]:
# Create an elbow curve to find the best value for K.

inertia = []

k = list(range(1,11))

for i in k:
    
    km = KMeans(n_clusters=i, random_state=0)
    
    km.fit(pca_df)
    
    inertia.append(km.inertia_)

elbow_data = {"k": k, "inertia": inertia}

df_elbow = pd.DataFrame(elbow_data)
    
# Plot
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.



In [68]:
# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=1)

# Fit the model
model.fit(pca_df)

# Predict clusters
predictions = model.predict(pca_df)

predictions

array([0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0,
       2, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
       0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 2,
       0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2,
       0, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2,
       2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2,
       0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0,
       0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2,
       2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0,

In [69]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = df.join(pca_df, how='inner')

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 


#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df['Class'] = predictions

# Print the shape of the clustered_df
print(clustered_df.shape)

clustered_df.head(10)

(498, 21)


Unnamed: 0,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,...,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote,PC1,PC2,PC3,PC4,Class
0,2021,WMT,Walmart Inc.,Retail Trade,2300000,660520661,287.182896,88360892,38.417779,825859804,...,172196312,74.867962,743101160,323.087461,0,-1.839978,0.493359,15.622559,-3.336592,0
1,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000,980173304,755.14122,705567297,543.580352,894557137,...,77381297,59.615791,82905570,63.87178,1,-1.312278,1.805432,8.562176,-1.474684,2
2,2021,ACN,Accenture plc,Technology Services,624000,11874548,19.029724,135634647,217.363216,906664141,...,290468786,465.494849,577047230,924.755176,0,-0.844876,-0.582243,3.979046,-0.934847,0
3,2021,UPS,"United Parcel Service, Inc.",Transportation,543000,471977569,869.203626,267986240,493.528987,602474640,...,174914891,322.126871,209622062,386.044313,0,-0.796347,-0.633952,3.415711,-0.820133,0
4,2021,HD,"Home Depot, Inc. (The)",Retail Trade,504800,17804705,35.27081,263857298,522.696708,998245511,...,42049250,83.298831,116114910,230.021613,0,-0.774501,-0.658734,3.15085,-0.764064,0
5,2021,KR,Kroger Company (The),Retail Trade,465000,112167809,241.221095,517021281,1111.873723,205357103,...,636805094,1369.47332,5206709,11.197224,1,-0.817147,1.271464,2.774723,-0.283901,2
6,2021,TGT,Target Corporation,Retail Trade,409000,123330629,301.54188,14266661,34.881812,243393166,...,666331808,1629.173125,810939821,1982.737949,1,-0.782811,1.234922,2.386473,-0.200662,2
7,2021,SBUX,Starbucks Corporation,Consumer Services,383000,34768125,90.778394,653198618,1705.47942,516956907,...,370143349,966.431721,265598008,693.467384,0,-0.699925,-0.736079,2.304169,-0.593084,0
8,2021,IBM,International Business Machines Corporation,Technology Services,375300,884601024,2357.050424,475644073,1267.370298,749077634,...,517978409,1380.17162,804566273,2143.795025,0,-0.68977,-0.741217,2.250744,-0.580422,0
9,2021,BRK.B,Berkshire Hathaway Inc. New,Finance,360000,416925498,1158.126383,732130382,2033.695506,361644305,...,940812557,2613.368214,703498296,1954.161933,1,-0.748354,1.204551,2.045411,-0.134625,2


In [71]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df, 
    x="PC1", 
    y="PC2", 
    z="PC3", 
    color="Class", 
    symbol="Class",
    #size="TotalCoinsMined",
    hover_name="numEmployees",
    hover_data=["numEmployees"],
    width=800)

fig.update_layout(legend=dict(x=0,y=1))

fig.show()