In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.express as px
import sqlalchemy
from sqlalchemy import create_engine, func
from config import db_password

In [28]:
# Create engine and import SQL database

db_password = 'tricarico12'

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/group_project"

engine = create_engine(db_string) 

df = pd.read_sql('final_database', con = engine)

df


Unnamed: 0,index,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote
0,0,2021,WMT,Walmart Inc.,Retail Trade,2300000.0,1.367300e+10,5.944783e+03,1.437540e+11,6.250174e+04,2.594200e+10,1.127913e+04,5.727540e+11,2.490235e+05,5.468120e+11,2.377443e+05,0
1,1,2020,WMT,Walmart Inc.,Retail Trade,2070000.0,1.351000e+10,6.526570e+03,1.388360e+11,6.707053e+04,2.694800e+10,1.301836e+04,5.591510e+11,2.701213e+05,5.322030e+11,2.571029e+05,1
2,2,2019,WMT,Walmart Inc.,Retail Trade,1863000.0,1.488100e+10,7.987654e+03,1.293590e+11,6.943586e+04,2.146800e+10,1.152335e+04,5.239640e+11,2.812475e+05,5.024960e+11,2.697241e+05,0
3,3,2018,WMT,Walmart Inc.,Retail Trade,1676700.0,6.670000e+09,3.978052e+03,1.291040e+11,7.699887e+04,2.195700e+10,1.309537e+04,5.144050e+11,3.067961e+05,4.924480e+11,2.937007e+05,0
4,4,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000.0,3.336400e+10,2.570416e+04,1.974780e+11,1.521402e+05,2.487900e+10,1.916718e+04,4.698220e+11,3.619584e+05,4.449430e+11,3.427912e+05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1927,1927,2018,HST,Host Hotels,Finance,119.0,1.087000e+09,9.147753e+06,1.601000e+09,1.347337e+07,8.130000e+08,6.841879e+06,5.554000e+09,4.674022e+07,4.741000e+09,3.989834e+07,0
1928,1928,2021,PNW,Pinnacle West Capital Corporation,Utilities,88.0,6.187200e+08,7.030909e+06,1.697217e+09,1.928656e+07,9.178510e+08,1.043012e+07,3.803835e+09,4.322540e+07,2.885984e+09,3.279527e+07,0
1929,1929,2020,PNW,Pinnacle West Capital Corporation,Utilities,79.0,5.505590e+08,6.951503e+06,1.634653e+09,2.063956e+07,8.444930e+08,1.066279e+07,3.586982e+09,4.529018e+07,2.742489e+09,3.462739e+07,0
1930,1930,2019,PNW,Pinnacle West Capital Corporation,Utilities,71.0,5.383200e+08,7.552189e+06,1.487356e+09,2.086639e+07,6.949490e+08,9.749565e+06,3.471209e+09,4.869822e+07,2.776260e+09,3.894865e+07,0


In [29]:
# Create X

X =  df.drop(columns = ["Year", "Ticker", "Company", "Sector", "netIncome", "grossProfit", "operatingIncome", "totalRevenue", "totalOperatingExpenses", "mostly_remote"])

print(X.shape)


(1932, 7)


In [30]:
# Standardize the data

X_scaled = StandardScaler().fit_transform(X)

X_scaled[:5]


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name']. An error will be raised in 1.2.


Feature names only support names that are all strings. Got feature names with dtypes: ['quoted_name']. An error will be raised in 1.2.



array([[-1.73115453, 18.4935165 , -0.25601673, -0.34736478, -0.30714274,
        -0.29646192, -0.27538494],
       [-1.72936152, 16.60433349, -0.25508002, -0.34391264, -0.30470864,
        -0.28947964, -0.26750265],
       [-1.72756851, 14.90406879, -0.2527276 , -0.34212542, -0.30680094,
        -0.28579744, -0.26236363],
       [-1.72577549, 13.37383055, -0.25918326, -0.33641088, -0.30460086,
        -0.27734215, -0.25260097],
       [-1.72398248, 10.26324966, -0.22420316, -0.27963475, -0.29610319,
        -0.25908626, -0.23261261]])

In [31]:
# Creating PCA Model

pca = PCA(n_components = 3, random_state = 0)

X_pca = pca.fit_transform(X_scaled)

X_pca

array([[-2.38496585, 14.66647212, 11.00647486],
       [-2.23024044, 13.27174762,  9.75918391],
       [-2.09484893, 12.01569592,  8.63723598],
       ...,
       [30.7220664 ,  4.07435269, -1.68023886],
       [31.89332488,  4.3447856 , -2.11385871],
       [37.69712754,  5.3640739 , -2.5843253 ]])

In [32]:
# Creating PCA Dataframe

pca_df = pd.DataFrame(data = X_pca, index = X.index, columns = ["PC1", "PC2", "PC3"])

print(pca_df.shape)

pca_df.head(10)

(1932, 3)


Unnamed: 0,PC1,PC2,PC3
0,-2.384966,14.666472,11.006475
1,-2.23024,13.271748,9.759184
2,-2.094849,12.015696,8.637236
3,-1.968273,10.885896,7.626092
4,-1.668594,8.597475,5.572142
5,-1.608834,7.804746,4.872481
6,-1.570302,7.088165,4.245838
7,-1.514595,6.447165,3.679172
8,-1.391163,4.476218,1.94998
9,-1.35319,4.096186,1.613145


In [33]:
# Create an elbow curve to find the best value for K.

inertia = []

k = list(range(1,11))

for i in k:
    
    km = KMeans(n_clusters=i, random_state=0)
    
    km.fit(pca_df)
    
    inertia.append(km.inertia_)

elbow_data = {"k": k, "inertia": inertia}

df_elbow = pd.DataFrame(elbow_data)
    
# Plot
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [34]:
# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=1)

# Fit the model
model.fit(pca_df)

# Predict clusters
predictions = model.predict(pca_df)

predictions

array([2, 2, 2, ..., 1, 1, 1])

In [38]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = df.join(pca_df, how='inner')

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 


#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df['Class'] = predictions

# Print the shape of the clustered_df
print(clustered_df.shape)

clustered_df

(1932, 21)


Unnamed: 0,index,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,...,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote,PC1,PC2,PC3,Class
0,0,2021,WMT,Walmart Inc.,Retail Trade,2300000.0,1.367300e+10,5.944783e+03,1.437540e+11,6.250174e+04,...,1.127913e+04,5.727540e+11,2.490235e+05,5.468120e+11,2.377443e+05,0,-2.384966,14.666472,11.006475,2
1,1,2020,WMT,Walmart Inc.,Retail Trade,2070000.0,1.351000e+10,6.526570e+03,1.388360e+11,6.707053e+04,...,1.301836e+04,5.591510e+11,2.701213e+05,5.322030e+11,2.571029e+05,1,-2.230240,13.271748,9.759184,2
2,2,2019,WMT,Walmart Inc.,Retail Trade,1863000.0,1.488100e+10,7.987654e+03,1.293590e+11,6.943586e+04,...,1.152335e+04,5.239640e+11,2.812475e+05,5.024960e+11,2.697241e+05,0,-2.094849,12.015696,8.637236,2
3,3,2018,WMT,Walmart Inc.,Retail Trade,1676700.0,6.670000e+09,3.978052e+03,1.291040e+11,7.699887e+04,...,1.309537e+04,5.144050e+11,3.067961e+05,4.924480e+11,2.937007e+05,0,-1.968273,10.885896,7.626092,2
4,4,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000.0,3.336400e+10,2.570416e+04,1.974780e+11,1.521402e+05,...,1.916718e+04,4.698220e+11,3.619584e+05,4.449430e+11,3.427912e+05,0,-1.668594,8.597475,5.572142,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1927,1927,2018,HST,Host Hotels,Finance,119.0,1.087000e+09,9.147753e+06,1.601000e+09,1.347337e+07,...,6.841879e+06,5.554000e+09,4.674022e+07,4.741000e+09,3.989834e+07,0,28.460083,3.940912,-2.293452,1
1928,1928,2021,PNW,Pinnacle West Capital Corporation,Utilities,88.0,6.187200e+08,7.030909e+06,1.697217e+09,1.928656e+07,...,1.043012e+07,3.803835e+09,4.322540e+07,2.885984e+09,3.279527e+07,0,29.534423,3.871850,-1.508284,1
1929,1929,2020,PNW,Pinnacle West Capital Corporation,Utilities,79.0,5.505590e+08,6.951503e+06,1.634653e+09,2.063956e+07,...,1.066279e+07,3.586982e+09,4.529018e+07,2.742489e+09,3.462739e+07,0,30.722066,4.074353,-1.680239,1
1930,1930,2019,PNW,Pinnacle West Capital Corporation,Utilities,71.0,5.383200e+08,7.552189e+06,1.487356e+09,2.086639e+07,...,9.749565e+06,3.471209e+09,4.869822e+07,2.776260e+09,3.894865e+07,0,31.893325,4.344786,-2.113859,1


In [37]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df, 
    x="PC1", 
    y="Year", 
    z="PC2", 
    color="Class", 
    symbol="Class",
    #size="TotalCoinsMined",
    hover_name="Company",
    hover_data=["Year"],
    width=800)

fig.update_layout(legend=dict(x=0,y=1))

fig.show()