# Test Case

To better understand how to use unsupervised machine learning algorithms, we will run a test case that comes from the final project presented by our TA Soyoung An and Yisurai Du for BMI 6018 Intro to Programming.

The project title is: 
### Relationship between amount of 911 calls and characteristics of townships in Montgomery County in Pennsylvania

The main goal of the project is to understand the factors associated with 911 calls in Montgomery County, Pennsylvania.

There are 4 predictors included:

Levels of education (high - low)
employment (yes - no)
race (White - others)
income (quantitative)

The data that we will use is a set already collected and prepared. The data is also normalized by the population size for each township (total of 67 townships)

In [None]:
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra)
library(dendextend)
library(psych)


# Read the 911-call dataset and use the `twp` column as row labels
calls <- read.csv("911calls.csv")
rownames(calls) <- calls$twp



In [None]:
# Drop the first column (presumably `twp`, whose values are already stored as
# row names -- confirm against the CSV layout), then preview the result
calls <- calls[, -1]
head(calls)

## First, let's see how correlated the variables are with each other, and especially with 911 calls

In [None]:
# Pairwise correlation matrix of all columns, rounded to four decimals
round(cor(calls), digits = 4)

# Matrix of pairwise panels for the same columns (psych::pairs.panels)
pairs.panels(calls)

In [None]:
# Pairwise distances between rows of the standardized data, shown as a heatmap.
# The data is standardized locally with scale(): `scaled_df` is only created in
# a later cell, so referring to it here fails when the notebook is run from top
# to bottom.
distance_calls <- get_dist(scale(calls))
fviz_dist(
  distance_calls,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)


Let's start by examining the data using PCA to see whether we can use principal components (PCs) to reduce the number of dimensions.

In [None]:
#### PCA ####
# Variance of each column on its original scale
apply(calls, MARGIN = 2, FUN = var)

# Standardize each column with scale() and preview the result
scaled_df <- apply(calls, MARGIN = 2, FUN = scale)
head(scaled_df)

# Label the rows of the scaled matrix with the row names of `calls`
rownames(scaled_df) <- rownames(calls)
glimpse(scaled_df)

In [None]:
# Correlation matrix of the standardized columns, rounded to four decimals
round(cor(scaled_df), digits = 4)

# Matrix of pairwise panels for the standardized columns
pairs.panels(scaled_df)

In [None]:
## Should we eliminate this outlier?

In [None]:
# Manual PCA: eigen-decomposition of the covariance matrix of the standardized
# data. The `arrests.*` names are kept unchanged because a later cell reads
# `arrests.eigen`.
arrests.cov <- cov(scaled_df)
arrests.eigen <- eigen(arrests.cov)
str(arrests.eigen)

# Loadings of the first four components (outer parentheses print the value)
(phi <- arrests.eigen$vectors[, 1:4])

# Eigenvectors are only determined up to sign; work with the flipped orientation
phi <- -phi
# Take the variable labels from the data itself instead of a hard-coded list,
# so they cannot get out of sync with the column order of `scaled_df`
row.names(phi) <- colnames(scaled_df)
colnames(phi) <- c("PC1", "PC2", "PC3", "PC4")
phi

# Scores: project the standardized observations onto each loading vector
PC1 <- as.matrix(scaled_df) %*% phi[, 1]
PC2 <- as.matrix(scaled_df) %*% phi[, 2]
PC3 <- as.matrix(scaled_df) %*% phi[, 3]
PC4 <- as.matrix(scaled_df) %*% phi[, 4]

# Create data frame with Principal Components scores.
# `State` holds the row names of `scaled_df`; the column name is kept as-is
# because other code may refer to it.
PC <- data.frame(State = row.names(scaled_df), PC1, PC2, PC3, PC4)
head(PC)

# Scatter plot of the first two components with reference lines through zero.
# The title previously said "USArrests Data", which is not the dataset used
# here; it now matches the title of the k-means plot.
ggplot(PC, aes(PC1, PC2)) +
  modelr::geom_ref_line(h = 0) +
  modelr::geom_ref_line(v = 0) +
  geom_point() +
  #geom_text(aes(label = State), size = 3) +
  xlab("First Principal Component") +
  ylab("Second Principal Component") +
  ggtitle("First Two Principal Components of 911 call rates Montgomery County Pennsylvania")


In [None]:
# Share of the total variance carried by each eigenvalue
PVE <- prop.table(arrests.eigen$values)
round(PVE, digits = 2)

In [None]:
# PCA with prcomp(). The scaling argument is spelled `scale.`; the previous
# `scale = TRUE` only worked through partial argument matching.
pca_result <- prcomp(scaled_df, scale. = TRUE)
names(pca_result)

In [None]:
# Flip the sign of the loadings together with the scores. Negating only `x`
# mirrors the observations but not the variable arrows, so the biplot would
# show them pointing in inconsistent directions.
# NOTE: this cell negates in place, so running it twice restores the original
# orientation.
pca_result$rotation <- -pca_result$rotation
pca_result$x <- -pca_result$x
head(pca_result$x)

# Biplot of observations (scores) and variables (loadings)
biplot(pca_result, scale = 0, cex = 0.5)

## Let's use the k-means technique to split the data based on similarities across variables

In [None]:
### K-Means ###

# kmeans() starts from random centers, so the numbering of the clusters is
# arbitrary from run to run. Later cells select clusters by their number
# (1, 3, 4), so fix the seed to keep those selections stable across runs.
# NOTE(review): after fixing the seed, re-check which cluster numbers
# correspond to the "high", "Low" and "mid" groups used further down.
set.seed(123)

# Four clusters, best of 25 random starts (the object keeps its name `k3`)
k3 <- kmeans(scaled_df, centers = 4, nstart = 25)
k3$cluster

# Attach the cluster assignment to the PC scores as a factor
PC$Kmeans <- as.factor(k3$cluster)

# One subset of the scores per cluster
clusters_one <- PC[PC$Kmeans == 1, ]
clusters_two <- PC[PC$Kmeans == 2, ]
clusters_three <- PC[PC$Kmeans == 3, ]
clusters_four <- PC[PC$Kmeans == 4, ]




In [None]:
# First two principal components, points coloured by k-means cluster,
# with reference lines through zero on both axes
ggplot(PC, aes(x = PC1, y = PC2, color = Kmeans)) +
  modelr::geom_ref_line(h = 0) +
  modelr::geom_ref_line(v = 0) +
  geom_point() +
  #geom_text(aes(label = State), size = 3) +
  labs(
    x = "First Principal Component",
    y = "Second Principal Component",
    title = "First Two Principal Components of 911 call rates Montgomery County Pennsylvania"
  )

In [None]:
# Append the k-means labels to the standardized data. cbind() stores the factor
# as its integer codes; the resulting unnamed column is referred to as `V6`.
scaled_df_K <- as.data.frame(cbind(scaled_df, PC$Kmeans))

# "high" group: rows of cluster 1, keeping the first five columns
high <- scaled_df_K[scaled_df_K$V6 == 1, 1:5]
round(cor(high), digits = 4)
pairs.panels(high)



In [None]:
# "Low" group: rows of cluster 3, keeping the first five columns
Low <- scaled_df_K[scaled_df_K$V6 == 3, 1:5]
round(cor(Low), digits = 4)
pairs.panels(Low)

In [None]:
# "mid" group: rows of cluster 4, keeping the first five columns
mid <- scaled_df_K[scaled_df_K$V6 == 4, 1:5]
round(cor(mid), digits = 4)
pairs.panels(mid)

## Finally, let's compare the k-means approach to the hierarchical clustering approach

In [None]:
## Hierarchical Clustering

# Euclidean dissimilarities between the rows of the standardized data
d <- dist(scaled_df, method = "euclidean")

# Agglomerative clustering with complete linkage
hc1 <- hclust(d, method = "complete")

# Draw the dendrogram
plot(hc1, cex = 0.6, hang = -1)

In [None]:
# Linkage methods to compare; each element is named after itself so the
# result of map_dbl() is labelled by method
m <- c("average", "single", "complete", "ward")
names(m) <- m

# Agglomerative coefficient of agnes() on `scaled_df` for linkage method `x`
ac <- function(x) {
  fit <- agnes(scaled_df, method = x)
  fit$ac
}

# One coefficient per method
map_dbl(m, ac)

In [None]:
# Agglomerative clustering with the "ward" method, then its dendrogram
hc3 <- agnes(scaled_df, method = "ward")
pltree(
  hc3,
  cex = 0.6,
  hang = -1,
  main = "Dendrogram of agnes"
)

In [None]:
# Redraw the dendrogram and outline five clusters on it.
# rect.hclust() is written for `hclust` trees and reads their sorted `height`
# component; `hc3` is an agnes object, so convert it with as.hclust() first.
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
rect.hclust(as.hclust(hc3), k = 5)

In [None]:
# Row labels of the "Low" group
rownames(Low)

In [None]:
# Row labels of the "high" group
rownames(high)

In [None]:
# Row labels of the "mid" group
rownames(mid)

In [None]:
# Plot the k-means clusters. The label column `V6` is passed separately as
# `cluster`, so it is excluded from `data`; otherwise the label itself would be
# treated as one of the variables being plotted.
fviz_cluster(
  list(
    data = scaled_df_K[, setdiff(names(scaled_df_K), "V6")],
    cluster = scaled_df_K$V6
  )
)

In [None]:
# Within-cluster sum of squares ("wss") across numbers of clusters, using
# hcut() as the clustering function. The argument is spelled out as
# `FUNcluster`; the previous `FUN =` only worked through partial matching.
fviz_nbclust(scaled_df, FUNcluster = hcut, method = "wss")