## Clustering

### Clustering with k-means

Check out this cool demo site: <https://www.naftaliharris.com/blog/visualizing-k-means-clustering/>

In [1]:
# Read the Simple English Wikipedia articles (the CSV is on the course website)
import pandas as pd

DATA_FILE = 'simplewiki.csv'
df = pd.read_csv(DATA_FILE)
df

Unnamed: 0,title,text
0,April,April is the fourth month of the year in the J...
1,August,August (Aug.) is the eighth month of the year ...
2,Art,Art is a creative activity and technical skill...
3,A,A or a is the first letter of the English alph...
4,Air,Air is the Earth's atmosphere. Air is a mixtur...
...,...,...
10139,Depleted uranium,Depleted uranium is what is left over after ur...
10140,Commuting,Commuting is the act of travelling from home t...
10141,Anne Redpath,Anne Redpath (1895 – 1965) was a Scottish arti...
10142,CD-RW,A CD-RW (which stands for Compact Disc ReWrita...


In [None]:
# Normalize the article text: lowercase it, then replace every character
# that is not a-z or a space with a space
text2 = (
    df.text
    .str.lower()
    .str.replace(r'[^a-z ]', ' ', regex=True)
)
text2

In [None]:
# Store the normalized text next to the original as a new column
df.loc[:, 'text2'] = text2

In [None]:
# Pool every article into one long string, then keep only the tokens that
# are longer than two characters
all_text = ' '.join(df.text2)
words = [token for token in all_text.split(' ') if len(token) > 2]
print(len(words))

In [None]:
from collections import Counter

# Tally how often each kept word occurs across all articles
word_counts = Counter()
word_counts.update(words)
word_counts.most_common(50)  # First 30 or so are stopwords

In [None]:
# Take the 100 most frequent words and skip the first 30 of them
top_hundred = word_counts.most_common(100)
interesting_words = top_hundred[30:]
interesting_words

In [None]:
# Keep just the word from each (word, count) pair.
# The list is rebuilt from word_counts rather than from interesting_words
# itself so that re-running this cell is harmless: indexing the already
# converted strings with [0] would silently reduce every word to its first
# letter.
interesting_words = [word for word, _count in word_counts.most_common(100)[30:]]
interesting_words

In [None]:
# One feature column name per interesting word, e.g. 'prop_' + word
features = ['prop_' + term for term in interesting_words]
features

In [None]:
# Create every feature column up front, filled with a float placeholder
for feature_name in features:
    df[feature_name] = 0.0
df

In [None]:
# Proportion of each article's words that are a given interesting word.
# Articles are tokenized so that only whole words match (a substring count
# would also hit e.g. "one" inside "money"), the denominator is the number of
# words rather than characters, and an article with no tokens gets 0.0
# instead of raising ZeroDivisionError.
proportion_rows = []
for article in df.text2:
    tokens = article.split()
    token_counts = Counter(tokens)  # Counter is imported in the word-count cell above
    total = len(tokens)
    proportion_rows.append(
        [token_counts[word] / total if total else 0.0 for word in interesting_words])

# Align on df.index rather than assuming the row labels are 0..n-1
proportions = pd.DataFrame(proportion_rows, index=df.index, columns=interesting_words)
for word in interesting_words:
    df['prop_' + word] = proportions[word]

In [None]:
# Proportions (rather than raw counts) are needed because otherwise articles
# would cluster by length.
# They also keep every feature on a similar, small scale for the distance calculation.
df

In [None]:
from sklearn import cluster

# n_init=10 matches the other KMeans calls in this notebook, and random_state
# makes the cluster assignment reproducible across kernel restarts.
model = cluster.KMeans(2, n_init=10, random_state=0)
model.fit(df[features])
clusters = model.predict(df[features])
# model.fit_predict(df[features]) would do both steps in a single call

# Cluster label (0 or 1) of the first 100 articles
clusters[:100]

In [None]:
# How can we visualize such high-dimensional clustering data?
# One approach: find which words differentiate the clusters most.
# The labels are wrapped in a pandas Series once so that each feature column
# can be correlated with them; the absolute value is taken because only the
# strength of the relationship matters, not its sign.
cluster_labels = pd.Series(clusters)
corrs = pd.Series(
    [abs(df[name].corr(cluster_labels)) for name in features],
    index=features,
)
corrs.sort_values(ascending=False)

In [None]:
# Show the articles that were assigned to cluster 1.
# `clusters` has one entry per row of df, so the comparison works as a boolean row mask.
in_cluster_one = clusters == 1
df.loc[in_cluster_one]

In [None]:
# Explore k: record distortion (inertia) and silhouette score for k = 1..10
from sklearn import metrics
import matplotlib.pyplot as plt

distortions = []
silhouettes = []
for k in range(1, 11):
    print('K =', k)
    # random_state makes the scores reproducible across kernel restarts
    m = cluster.KMeans(k, n_init=10, random_state=0)
    # Use a separate name for these labels: assigning to `clusters` here would
    # overwrite the k=2 labels that other cells index and compare against.
    k_labels = m.fit_predict(df[features])
    distortions.append(m.inertia_)
    if k > 1:  # Silhouette is not defined for fewer than 2 clusters
        silhouettes.append(metrics.silhouette_score(df[features], k_labels))

print('Distortions:', distortions)
print('Silhouettes:', silhouettes)

In [None]:
# Plot both scores against k; there is no clear best K
#   -- maybe 3 or 7
#   -- maybe much more!

# The distortions are plotted unscaled: the data are already on a 0-1 scale
# and the distortion comes out similar, so dividing by max(distortions)
# (pd.Series(distortions) / max(distortions)) is not actually needed here.
fig, ax = plt.subplots(dpi=300, figsize=(8, 4))
ax.set_xlabel('Number of clusters (k)')
ax.plot(range(1, 11), distortions, marker='s', label='Distortion')
# Silhouette scores start at k=2, so their x values start at 2
ax.plot(range(2, 11), silhouettes, label='Silhouette')
ax.legend()

### Hierarchical clustering

In [None]:
# Ward linkage is the default: at each step it merges the two clusters whose
# union gives the smallest increase in within-cluster variance (SSE).
model = cluster.AgglomerativeClustering(2).fit(df[features])
# There is no `predict` method; hierarchical clustering does not lend itself
# well to prediction. To cluster new data points, the model has to be re-fit
# on the combined dataset (old + new data).
model.labels_

In [None]:
# random_state makes the comparison reproducible across kernel restarts
kmeansmodel = cluster.KMeans(2, n_init=10, random_state=0)
kmeansmodel.fit(df[features])
# k-means can "predict": new data points are assigned to the cluster with the nearest centroid.
kmeanscluster = kmeansmodel.predict(df[features])

# Ward linkage makes very similar clusters here.
# Which cluster gets called 0 and which 1 is arbitrary in both models, so the
# raw correlation can come out negative for near-identical clusterings; the
# absolute value measures agreement regardless of label numbering.
abs(pd.Series(model.labels_).corr(pd.Series(kmeanscluster)))

In [None]:
# But complete linkage (max dist) does not
model = cluster.AgglomerativeClustering(2, linkage='complete')
model.fit(df[features])
# Compare against the 2-cluster k-means labels (`kmeanscluster`). The global
# `clusters` is reassigned inside the "Explore k" loop and ends up holding
# the k=10 labels, which are not comparable to a 2-cluster solution.
# abs() because the 0/1 numbering of clusters is arbitrary in both models.
abs(pd.Series(model.labels_).corr(pd.Series(kmeanscluster)))

#### Plotting a dendrogram (scikit-learn example)

<https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html>

## Self-supervised learning / Language Modeling

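Self-supervised learning is another type of unsupervised learning: the labels are taken from the data itself. Here, each word in the text serves as the label for the five words that precede it.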

In [None]:
# Split on any run of whitespace. Splitting on a single ' ' would produce an
# empty-string "word" for every extra space left behind when punctuation was
# replaced, and those would later be encoded as unknown words.
all_words = all_text.split()
len(all_words)  # Too many!

In [None]:
common_words = word_counts.most_common(500)

# Rank the words: map each common word to its position in the frequency
# ordering (0 = most frequent). Each entry of common_words is a
# (word, term frequency) pair; only the word is needed as the key.
word_num = {word: rank for rank, (word, _frequency) in enumerate(common_words)}
word_num

In [None]:
# Encode the first 200,000 tokens as integers: a common word becomes its
# rank from word_num, and any other token becomes -1
encoded_data = [word_num.get(token, -1) for token in all_words[:200_000]]
encoded_data

In [None]:
# Build one training instance per position: the five preceding encoded words
# are the features and the encoded word at that position is the target
word_features = ['word1', 'word2', 'word3', 'word4', 'word5']
target_positions = range(5, len(encoded_data))
instances = [encoded_data[pos - 5:pos] for pos in target_positions]
y = [encoded_data[pos] for pos in target_positions]

encoded_df = pd.DataFrame(data=instances, columns=word_features)
encoded_df['y'] = y
encoded_df

In [None]:
# `metrics` is imported here as well so this cell does not rely on the
# import made in the "Explore k" cell.
from sklearn import ensemble, metrics, model_selection

# Fixed seeds make the split, the fitted forest, and therefore the reported
# kappa reproducible across kernel restarts.
# NOTE(review): consecutive instances share four of their five context words,
# so a shuffled split puts near-duplicate contexts in both train and test;
# consider shuffle=False for a stricter evaluation.
train_X, test_X, train_y, test_y = model_selection.train_test_split(
    encoded_df[word_features], encoded_df.y, test_size=.2, random_state=0)

model = ensemble.RandomForestClassifier(100, min_samples_leaf=5, random_state=0)
model.fit(train_X, train_y)  # About <2 minutes
preds = model.predict(test_X)
print('Kappa:', metrics.cohen_kappa_score(test_y, preds))