Part 1

In [None]:
import json
import matplotlib.pyplot as plt
import time

# Time the whole load-and-plot step; the elapsed seconds are printed at the end.
t0 = time.time()

# `tweets` stays bound at notebook scope because later cells iterate over it.
with open('drive/MyDrive/tweets.json', 'r') as f:
  tweets = json.load(f)

# Longitude goes on the horizontal axis and latitude on the vertical axis.
X = [tweet['lng'] for tweet in tweets]
Y = [tweet['lat'] for tweet in tweets]

# The label is attached to the scatter artist itself. Passing a bare string to
# plt.legend() makes matplotlib treat it as a sequence of labels (one per
# character), so the intended text never reaches the legend.
plt.scatter(X, Y, s=5, c='black', label="California Tweets")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend()
plt.show()

t_compute = time.time() - t0
print(t_compute)

Part 2

In [None]:
!pip install haversine
from haversine import haversine

# Reference location, ordered (lat, lng) like the tweet points built below.
my_point = (34.212730341331145, -118.2859001567625)

# Search radius, in the same unit requested from haversine (metres).
radius = 50000

# Count every tweet whose haversine distance from my_point is within radius.
counter = sum(
  1
  for tweet in tweets
  if haversine(my_point, (tweet['lat'], tweet['lng']), unit='m') <= radius
)

print(counter)

Part 3

In [None]:
# Setting the minimum lat & lng as origin
from haversine import haversine

# Origin of the local coordinate system: the smallest latitude (x0) and the
# smallest longitude (y0) seen in the data, with 200 as the starting upper bound.
x0 = min([200] + [tweet['lat'] for tweet in tweets])
y0 = min([200] + [tweet['lng'] for tweet in tweets])

# Converting lat & lng data to x & y
# x is the haversine distance (metres) from x0 to the tweet's latitude, measured
# at longitude 0; y is the distance from y0 to the tweet's longitude, measured
# at latitude 0.
# NOTE(review): the longitude leg is measured along the equator, so east-west
# distances are presumably overstated at the data's real latitudes -- confirm
# whether that distortion is acceptable for the clustering below.
converted_tweets = [
  [
    haversine((x0, 0), (tweet['lat'], 0), unit='m'),
    haversine((0, y0), (0, tweet['lng']), unit='m'),
  ]
  for tweet in tweets
]

In [None]:
# KMeans
import time
from sklearn.cluster import KMeans

# Baseline KMeans with 100 clusters on the converted (x, y) points.
# random_state pins the k-means++ initialisation so the labels, inertia and the
# comparison plot derived from this model are reproducible between runs.
k_means = KMeans(init="k-means++", n_clusters=100, n_init=10, random_state=42)

# t_batch is the wall-clock fit time in seconds; the comparison plot shows it.
t0 = time.time()
k_means.fit(converted_tweets)
t_batch = time.time() - t0

In [None]:
# Optimum k for KMeans
# Sweeps n_clusters from 100 while k < 10000 with a growing step and prints one
# line per fit: "<k> <fit seconds> <inertia>".
#
# The model is bound to a sweep-only name on purpose: the comparison plot later
# in the notebook reads `k_means` and expects the 100-cluster model fitted in
# the previous cell, so this loop must not rebind that name.
step = 100
k = 100
while k < 10000:
  k_means_sweep = KMeans(init="k-means++", n_clusters=k, n_init=10)
  t0 = time.time()
  k_means_sweep.fit(converted_tweets)
  t = time.time() - t0
  print("%d %.2f %d" % (k, t, k_means_sweep.inertia_))
  # step becomes a float after the first update; k is truncated back to int.
  k = int(k + step)
  step = (step + 100) * 1.1

In [None]:
# Mini Batch KMeans
from sklearn.cluster import MiniBatchKMeans

# Baseline MiniBatchKMeans with 100 clusters on the converted (x, y) points.
# random_state pins both the initialisation and the mini-batch sampling so the
# result is reproducible between runs.
mbk = MiniBatchKMeans(
  init="k-means++",
  n_clusters=100,
  batch_size=10000,
  n_init=10,
  max_no_improvement=100,
  verbose=0,
  random_state=42,
)

# t_mini_batch is the wall-clock fit time in seconds; the comparison plot shows it.
t0 = time.time()
mbk.fit(converted_tweets)
t_mini_batch = time.time() - t0

In [None]:
# Optimum k for Mini Batch KMeans
# Sweeps n_clusters from 100 while k < 10000 with a growing step and prints one
# line per fit: "<k> <fit seconds> <inertia>".
#
# The model is bound to a sweep-only name on purpose: the comparison plot later
# in the notebook reads `mbk` and expects the 100-cluster model fitted in the
# previous cell, so this loop must not rebind that name.
step = 100
k = 100
while k < 10000:
  mbk_sweep = MiniBatchKMeans(
    init="k-means++",
    n_clusters=k,
    batch_size=10000,
    n_init=10,
    max_no_improvement=100,
    verbose=0,
  )
  t0 = time.time()
  mbk_sweep.fit(converted_tweets)
  t = time.time() - t0
  print("%d %.2f %d" % (k, t, mbk_sweep.inertia_))
  # step becomes a float after the first update; k is truncated back to int.
  k = int(k + step)
  step = (step + 100) * 1.1

In [None]:
from sklearn.metrics.pairwise import pairwise_distances_argmin
import matplotlib.pyplot as plt
import numpy as np
import random

def plot_cluster_panel(ax, points, labels, centers, colors, title, train_time):
  """Draw one clustering result on `ax`.

  ax: matplotlib axes to draw on.
  points: 2-D array of coordinates, one row per sample (columns 0 and 1 are plotted).
  labels: per-sample cluster index into `centers`.
  centers: cluster centre coordinates, one row per cluster.
  colors: one colour per cluster, in the same order as `centers`.
  title: axes title.
  train_time: fit duration in seconds, printed in the lower-left corner.
  """
  for cluster_index, col in zip(range(len(centers)), colors):
    my_members = labels == cluster_index
    cluster_center = centers[cluster_index]
    # Members first, then the centre on top with a black edge.
    ax.plot(
      points[my_members, 0],
      points[my_members, 1],
      "w",
      markerfacecolor=col,
      marker="."
    )
    ax.plot(
      cluster_center[0],
      cluster_center[1],
      "o",
      markerfacecolor=col,
      markeredgecolor="k",
      markersize=6,
    )
  ax.set_title(title)
  ax.set_xticks(())
  ax.set_yticks(())
  # Axes-relative position: the data are in metres, so a fixed data-space
  # coordinate would not land anywhere meaningful.
  ax.text(0.02, 0.02, "train time: %.2fs" % (train_time), transform=ax.transAxes)


# Pair every KMeans centre with its closest MiniBatchKMeans centre so that the
# same cluster index (and colour) refers to matching clusters in both models.
k_means_cluster_centers = k_means.cluster_centers_
order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)
mbk_means_cluster_centers = mbk.cluster_centers_[order]

# Label every point by its nearest (re-ordered) centre.
k_means_labels = pairwise_distances_argmin(converted_tweets, k_means_cluster_centers)
mbk_means_labels = pairwise_distances_argmin(converted_tweets, mbk_means_cluster_centers)

# Derive the cluster count from the fitted model instead of hard-coding 100.
n_clusters = len(k_means_cluster_centers)

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
# One random hex colour per cluster.
colors = ["#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
  for _ in range(n_clusters)]

# The rebinding is kept on purpose: the DBSCAN cell below indexes
# `converted_tweets` with boolean masks and therefore needs an ndarray.
converted_tweets = np.array(converted_tweets)

# KMeans
ax = fig.add_subplot(1, 3, 1)
plot_cluster_panel(
  ax, converted_tweets, k_means_labels, k_means_cluster_centers,
  colors, "KMeans", t_batch
)

# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
plot_cluster_panel(
  ax, converted_tweets, mbk_means_labels, mbk_means_cluster_centers,
  colors, "MiniBatchKMeans", t_mini_batch
)

# A point is "different" exactly when the two models give it different labels.
# (Seeding the mask with `labels == 4` would mark all of cluster 4 as different.)
different = k_means_labels != mbk_means_labels
identic = np.logical_not(different)

ax = fig.add_subplot(1, 3, 3)
ax.plot(
  converted_tweets[identic, 0],
  converted_tweets[identic, 1],
  "w",
  markerfacecolor="#bbbbbb",
  marker="."
)
ax.plot(
  converted_tweets[different, 0],
  converted_tweets[different, 1],
  "w",
  markerfacecolor="m",
  marker="."
)
ax.set_title("Difference")
ax.set_xticks(())
ax.set_yticks(())

plt.show()

In [None]:
# DBScan
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

centers = 100  # not read anywhere in this cell

# Boolean-mask indexing below needs an ndarray; converting locally means this
# cell does not rely on an earlier cell having rebound `converted_tweets`.
points = np.asarray(converted_tweets)

# eps is expressed in the units of the converted coordinates (metres).
db = DBSCAN(eps=1000, min_samples=100).fit(points)
labels = db.labels_

# True for every sample DBSCAN reports as a core sample.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present (label -1).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

# Sorted so that the colour given to each label is the same on every run
# (iteration order of a plain set is not something to rely on).
unique_labels = sorted(set(labels))
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]

for k, col in zip(unique_labels, colors):
  if k == -1:
    # Noise is always drawn in black, replacing its Spectral colour.
    col = [0, 0, 0, 1]

  class_member_mask = labels == k

  # Non-core members: small markers, drawn underneath.
  xy = points[class_member_mask & ~core_samples_mask]
  plt.plot(
    xy[:, 0],
    xy[:, 1],
    "o",
    markerfacecolor=tuple(col),
    markeredgecolor="k",
    markersize=6,
    zorder=0
  )

  # Core members: large markers, drawn on top.
  xy = points[class_member_mask & core_samples_mask]
  plt.plot(
    xy[:, 0],
    xy[:, 1],
    "o",
    markerfacecolor=tuple(col),
    markeredgecolor="k",
    markersize=14,
    zorder=5
  )

plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.show()

In [None]:
centers = 100

# Sweep eps from 10 to 2000 in steps of 10 and print one line per fit:
# "<eps> <seconds> <clusters> <noise points>".
# The timed span covers the fit and the cluster / noise bookkeeping.
eps = 10
while eps <= 2000:
  t0 = time.time()
  db = DBSCAN(eps=eps, min_samples=100).fit(converted_tweets)
  labels = db.labels_

  # Flag the samples DBSCAN reports as core samples.
  core_samples_mask = np.zeros_like(labels, dtype=bool)
  core_samples_mask[db.core_sample_indices_] = True

  # Label -1 marks noise and is not counted as a cluster.
  distinct_labels = set(labels)
  n_clusters_ = len(distinct_labels) - int(-1 in distinct_labels)
  n_noise_ = int((labels == -1).sum())
  t_exe = time.time() - t0

  print("%d %.2f %d %d" % (eps, t_exe, n_clusters_, n_noise_))
  eps += 10

Part 4

In [None]:
import numpy as np
import random

# KMeans fit time and inertia as a function of sample size.
# Prints one line per fit: "<sample size> <fit seconds> <inertia>".

# Build the sampling pool once; rebuilding the full list inside the loop is
# loop-invariant work.
sample_pool = np.array(converted_tweets).tolist()

# random.sample raises ValueError when asked for more items than exist, so the
# sweep stops at the data size if fewer than 100000 points are available.
max_no = min(100000, len(sample_pool))

no = 100
step = 100
while no <= max_no:
  samples = random.sample(sample_pool, no)
  k_means = KMeans(init="k-means++", n_clusters=100, n_init=10)
  t0 = time.time()
  k_means.fit(samples)
  t = time.time() - t0
  print("%d %.2f %d" % (no, t, k_means.inertia_))
  # step becomes a float after the first update; no is truncated back to int.
  no = int(no + step)
  step = (step + 100) * 1.05

In [None]:
import numpy as np
import random

# MiniBatchKMeans fit time and inertia as a function of sample size.
# Prints one line per fit: "<sample size> <fit seconds> <inertia>".

# Build the sampling pool once; rebuilding the full list inside the loop is
# loop-invariant work.
sample_pool = np.array(converted_tweets).tolist()

# random.sample raises ValueError when asked for more items than exist, so the
# sweep stops at the data size if fewer than 100000 points are available.
max_no = min(100000, len(sample_pool))

no = 100
step = 100
while no <= max_no:
  samples = random.sample(sample_pool, no)
  mbk = MiniBatchKMeans(
    init="k-means++",
    n_clusters=100,
    # The batch size scales with the sample: one tenth of it.
    batch_size=int(no / 10),
    n_init=10,
    max_no_improvement=100,
    verbose=0,
  )
  t0 = time.time()
  mbk.fit(samples)
  t = time.time() - t0
  print("%d %.2f %d" % (no, t, mbk.inertia_))
  # step becomes a float after the first update; no is truncated back to int.
  no = int(no + step)
  step = (step + 100) * 1.05

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN

# DBSCAN run time, cluster count and noise count as a function of sample size.
# Prints one line per fit: "<sample size> <seconds> <clusters> <noise points>".

# Build the sampling pool once; rebuilding the full list inside the loop is
# loop-invariant work.
sample_pool = np.array(converted_tweets).tolist()

# random.sample raises ValueError when asked for more items than exist, so the
# sweep stops at the data size if fewer than 100000 points are available.
max_no = min(100000, len(sample_pool))

no = 100
step = 100

while no <= max_no:
  samples = random.sample(sample_pool, no)
  t0 = time.time()
  db = DBSCAN(eps=320, min_samples=100).fit(samples)
  core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
  core_samples_mask[db.core_sample_indices_] = True
  labels = db.labels_
  t_exe = time.time() - t0

  # Number of clusters in labels, ignoring noise if present (label -1).
  n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  n_noise_ = list(labels).count(-1)
  print("%d %.2f %d %d" % (no, t_exe, n_clusters_, n_noise_))

  # step becomes a float after the first update; no is truncated back to int.
  no = int(no + step)
  step = (step + 100) * 1.05

Part 5

In [None]:
# Mini Batch KMeans
from sklearn.cluster import MiniBatchKMeans
import math

# Refit MiniBatchKMeans on the full converted data set; the cells below read
# mbk.cluster_centers_ and mbk.labels_ from this model.
# random_state pins the initialisation and mini-batch sampling so the selected
# clusters (and the word counts derived from them) are reproducible.
mbk = MiniBatchKMeans(
  init="k-means++",
  n_clusters=100,
  batch_size=10000,
  n_init=10,
  max_no_improvement=100,
  verbose=0,
  random_state=42,
)
t0 = time.time()
mbk.fit(converted_tweets)
t_mini_batch = time.time() - t0

def get_distance(point1, point2):
  """Return the Euclidean distance between two 2-D points.

  Each point is any indexable whose items 0 and 1 are its two coordinates.
  """
  delta_first = point1[0] - point2[0]
  delta_second = point1[1] - point2[1]
  return math.sqrt(delta_first ** 2 + delta_second ** 2)

cluster_centers = []
my_point = [34.212730341331145, -118.2859001567625]

# mbk was fitted on converted_tweets, so its cluster centres are in the local
# metre-based x/y system, not in degrees. my_point (lat, lng) has to go through
# the same conversion (same origin x0 / y0, same haversine legs) before a
# Euclidean distance between the two means anything.
# NOTE(review): haversine is unsigned, so this assumes my_point lies at or
# above the minimum latitude/longitude of the data -- confirm for other points.
my_point_xy = [
  haversine((x0, 0), (my_point[0], 0), unit='m'),
  haversine((0, y0), (0, my_point[1]), unit='m'),
]

# Calc distances from my_point: one [cluster index, distance] pair per centre.
for index, cluster_center in enumerate(mbk.cluster_centers_):
  cluster_centers.append([index, get_distance(my_point_xy, cluster_center)])

In [None]:
# Order the [cluster index, distance] pairs in place, nearest first.
cluster_centers.sort(key=lambda pair: pair[1])

# Keep the five nearest clusters and pull out just their indexes.
sliced_clusters = cluster_centers[:5]
sliced_clusters_indexes = [pair[0] for pair in sliced_clusters]

# Keep every tweet whose MiniBatchKMeans label is one of those clusters
# (mbk.labels_ is aligned with tweets by position).
cluster_tweets = [
  tweet
  for position, tweet in enumerate(tweets)
  if mbk.labels_[position] in sliced_clusters_indexes
]

In [None]:
# Word frequencies over the text of the selected tweets.
redundant_chars = ['.', ',', ';', '!', '?', '\n', '\r', '"', '\'', '#', '-', '(', ')']

# Punctuation is deleted; line breaks become spaces so that words on adjacent
# lines are not glued together.
line_breaks = ('\n', '\r')
char_translation = {
  ord(char): (' ' if char in line_breaks else None)
  for char in redundant_chars
}

# word -> occurrences. A dict lookup replaces the list search + index() pair,
# which was quadratic in the number of distinct words.
word_totals = {}
for tweet in cluster_tweets:
  text = tweet['text'].lower().translate(char_translation)
  # split() without an argument drops the empty strings that runs of spaces
  # would otherwise contribute as "words".
  for word in text.split():
    word_totals[word] = word_totals.get(word, 0) + 1

# Same result shapes as before: the distinct words in first-seen order, and
# [word, count] pairs sorted by count, most frequent first (ties keep
# first-seen order because the sort is stable).
used_words = list(word_totals)
words_count = [[word, count] for word, count in word_totals.items()]
words_count.sort(key=lambda x: x[1], reverse=True)
print(words_count)