In [1]:
import string
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import umap

In [2]:
# Let long text cells display up to 1000 characters before pandas truncates them.
pd.options.display.max_colwidth = 1000

In [3]:
# Load the CSV into a DataFrame, then display its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='out.csv')
df.shape

(3049, 4)

# Data cleaning

In [4]:
# Collapse every run of whitespace (newlines, tabs, spaces) into one space and
# turn missing content into an empty string.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as literal
# text by default, which would silently leave the whitespace untouched. The
# single raw-string \s+ pass also covers the earlier separate [\n\t ] substitution.
df['content'] = df['content'].str.replace(r'\s+', ' ', regex=True).fillna('')

In [5]:
# Preview the first five rows after cleaning.
df.head(n=5)

Unnamed: 0,tag,depth,content,height
0,body,1,"Singapore (singapore) Bangkok, Thailand (thailand) Malaysia (malaysia) Food Travel Nightlife Videos Marketing F&B Resources Food and Beverage Consultation Food & Restaurant Marketing System: Create a Profitable restaurant How To Start A Food Blog F&B Resources About Careers Guest Post Contact Bangkok, Thailand Malaysia Food Travel Nightlife Videos Contact Us Food 17 Cheap Budget Buffets in Singapore $20 and Below Last Updated: January 23, 2018 Written by Marshall Too Shares Affordable Buffets Written By Marshall Too Categories Affordable Buffets Shares 16.1kSHARESShareTweetFree Restaurant Voucher As with every cheap budget buffet list article, there is always a catch. The prices here mostly do not include GST/service charge, so the total fee per pax might/will go above twenty dollars depending on the number of people you bring.This article works best to let you know exactly when and where to go to stretch your dollar. Stop wasting your money on overpriced buffets that just aren’t ...",13
1,div,1,,1
2,,1,,0
3,,1,,0
4,,1,,0


# Feature engineering

In [6]:
# Number of characters in each row's cleaned text.
content_lengths = df['content'].str.len()
df['length'] = content_lengths

In [7]:
# 1 when the text contains '#', digits, a dash (with up to three whitespace
# characters on either side) and more digits, e.g. "#01-23"; 0 otherwise.
unit_no_mask = df['content'].str.contains(r'#\d+\s{0,3}-\s{0,3}\d+')
df['has_unit_no'] = unit_no_mask.astype(int)

In [8]:
# 1 when the text contains "ingapore" or a capital "S", optional whitespace,
# and then six digits; 0 otherwise.
postal_code_mask = df['content'].str.contains(r'(?:ingapore|S)\s*\d{6}')
df['has_postal_code'] = postal_code_mask.astype(int)

In [9]:
# 1 when the text contains "addr"/"Addr" (optionally followed by "ess" and a
# colon) anywhere in the string; 0 otherwise.
address_label_mask = df['content'].str.contains(r'[Aa]ddr(?:ess)?:?')
df['has_address_label'] = address_label_mask.astype(int)

In [10]:
# Text with all ASCII punctuation and digits removed, whitespace runs collapsed
# to a single space, and leading/trailing whitespace trimmed.
# regex=True is explicit because pandas >= 2.0 would otherwise look for the
# literal two-character text "\s+" and never collapse anything.
_strip_punct_and_digits = str.maketrans('', '', string.punctuation + string.digits)
df['content_stripped'] = (
    df['content']
    .str.translate(_strip_punct_and_digits)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [11]:
# Punctuation-free, whitespace-normalised text shared by both columns below.
# regex=True is explicit because pandas >= 2.0 treats patterns as literal text
# by default.
words_only = (
    df['content']
    .str.translate(str.maketrans('', '', string.punctuation))
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Debug view: word boundaries made visible as '#'.
df['n_words_debug'] = words_only.str.replace(' ', '#', regex=False)

# Count words by splitting on whitespace instead of counting spaces and adding 1.
# Counting spaces cannot tell an empty string from a single word (both contain
# zero spaces), so one-word rows used to be reported as 0 words.
# Splitting gives '' -> 0, 'Marshall' -> 1, 'Marshall Too' -> 2.
df['n_words'] = words_only.str.split().str.len()

# Preprocessing

In [12]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [13]:
# Column groups, one per kind of treatment.
numeric_columns = ['height', 'depth', 'length', 'n_words']
flag_columns = ['has_unit_no', 'has_postal_code', 'has_address_label']

# Missing tags become the literal category 'NO_TAG' before one-hot encoding.
tag_encoder = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='NO_TAG'),
    OneHotEncoder(),
)
text_vectorizer = TfidfVectorizer(stop_words='english')

preprocess = make_column_transformer(
    # Standardise the numeric features.
    (StandardScaler(), numeric_columns),
    # The 0/1 flags go through unchanged.
    ('passthrough', flag_columns),
    (tag_encoder, ['tag']),
    # Note: the text column is given as a bare name, not a one-element list.
    (text_vectorizer, 'content_stripped'),
)

In [14]:
# Fit every transformer on df and build the combined feature matrix in one step.
X = preprocess.fit_transform(df)
# Bare expression so the notebook displays the matrix summary.
X

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


<3049x846 sparse matrix of type '<class 'numpy.float64'>'
	with 22578 stored elements in Compressed Sparse Row format>

# Clustering

In [24]:
from sklearn.cluster import KMeans

In [25]:
# Two-cluster K-means with a fixed seed.
N_CLUSTERS = 2
KMEANS_SEED = 17
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_SEED)

In [27]:
# Fit on the preprocessed matrix, then display how many rows received a label.
kmeans.fit(X)
kmeans.labels_.shape[0]

3049

In [15]:
# Project the preprocessed feature matrix down to UMAP's default low-dimensional
# embedding for plotting.
# The seed (the same value KMeans uses) makes the layout reproducible between runs.
reducer = umap.UMAP(random_state=17)
# fit_transform needs the data to embed; calling it with no argument raises TypeError.
embedding = reducer.fit_transform(X)

  n_components


In [21]:
import bokeh.plotting as bk
from bokeh.models import ColumnDataSource, HoverTool

# Direct subsequent bokeh output to the notebook.
bk.output_notebook()

In [47]:
# One colour per point: 'red' for a truthy (non-zero) cluster label, 'blue' otherwise.
point_colors = []
for label in kmeans.labels_:
    point_colors.append('red' if label else 'blue')

# Plot data: embedding coordinates, the text shown on hover, and the colours.
source = ColumnDataSource(data=dict(
    x=embedding[:, 0],
    y=embedding[:, 1],
    content=df['content'],
    colors=point_colors,
))

p = bk.figure(toolbar_location=None)
p.scatter(
    x='x',
    y='y',
    radius=0.5,
    fill_color='colors',
    fill_alpha=0.4,
    line_color=None,
    source=source,
)

# Hovering over a point shows its row index and the element's text.
hover = HoverTool(tooltips=[("index", "$index"), ("content", "@content")])
p.add_tools(hover)

bk.show(p)

In [44]:
# Look up the text of the row labelled 1401 (a point picked out on the plot).
df.loc[1401, 'content']

'Marshall Too'