In [12]:
import urllib3
from bs4 import BeautifulSoup
import csv
import wikipedia
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import stop_words

In [13]:
base_data = pd.read_csv("base_wiki_data.csv")
detroit_data = pd.read_csv("detroit_wiki_data.csv")

In [14]:
def train_model(detroit_data):
    X = detroit_data["summary"]
    
    TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

    cluster_pipeline = Pipeline([
        ('vectorize', TfidfVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                      ngram_range=(1, 2), 
                                      stop_words=stop_words.ENGLISH_STOP_WORDS)), 
        ('cluster', KMeans(n_clusters=7, random_state=22))
    ])

    cluster_pipeline.fit(X)
    
    return(cluster_pipeline)

In [15]:
def predict_cluster(attraction_row, cluster_pipeline):
    X = attraction_row["summary"]
    
    return(cluster_pipeline.predict(X))

In [16]:
detroit_data_w_clusters = detroit_data.copy()

cluster_pipeline = train_model(detroit_data)
detroit_data_w_clusters['cluster'] = predict_cluster(detroit_data, cluster_pipeline)

sample_attraction = base_data.sample(1)
print('random attraction:', sample_attraction['attraction'].values)

selected_cluster = predict_cluster(sample_attraction, cluster_pipeline)
detroit_data_w_clusters.loc[detroit_data_w_clusters['cluster']==selected_cluster[0], 'attraction']

random attraction: ['"Magic Kingdom, Walt Disney World"']


3      Charles H. Wright Museum of African American H...
6                                          Comerica Park
11                                    Detroit RiverFront
13                                       Belle Isle Park
16                                  Detroit People Mover
18                                           Mexicantown
21                          DNR Outdoor Adventure Center
22                                       Joe Louis Arena
25                                           Cobo Center
27                                      Detroit Downtown
28                                 GM Renaissance Center
29                                 The Spirit of Detroit
30                                  Little Caesars Arena
31                                Detroit-Windsor Tunnel
32                                  The Fillmore Detroit
33                                   The Redford Theatre
35             William G. Milliken State Park and Harbor
37                             

In [17]:
recommendations['image'].sample(1)

NameError: name 'recommendations' is not defined

In [18]:
cluster_pipeline.predict(sample_attraction['summary'])

array([2], dtype=int32)

In [19]:
sample_attraction['summary']

5    In Disney theme parks, the utilidor system is ...
Name: summary, dtype: object

In [20]:
base_data_w_clusters = base_data.copy()
base_data_w_clusters['cluster'] = predict_cluster(base_data_w_clusters, cluster_pipeline)

predict_cluster(base_data, cluster_pipeline)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5,
       5, 2, 2, 2, 2, 1, 4, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 5, 2, 2, 2, 5, 2,
       2, 2, 2, 2, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 4, 2, 2, 1, 2, 2, 2, 2, 2, 5, 2], dtype=int32)

In [21]:
base_data_w_clusters.loc[base_data_w_clusters['cluster'] == 4, 'attraction']

10           Balboa Park
28        Villa Montalvo
64         Tiger Stadium
100      Multnomah Falls
125              Lahaina
160    Graceland Mansion
Name: attraction, dtype: object

# With Loop

In [22]:
base_data = pd.read_csv("base_wiki_data.csv")
detroit_data = pd.read_csv("detroit_wiki_data.csv")

In [23]:
recommendations = detroit_data.copy()

while len(recommendations) > 15:
    cluster_model = train_model(recommendations)
    
    recommendations_w_clusters = recommendations
    recommendations_w_clusters['cluster'] = predict_cluster(recommendations, cluster_model) 
    
    random_location = recommendations_w_clusters.sample(1) # This would happen in java
    
    recommendations = recommendations_w_clusters.loc[recommendations_w_clusters['cluster'] == random_location['cluster'].values[0], 
                                                     ['attraction', 'url', 'summary', 'image']].copy()
    len(recommendations)

In [26]:
recommendations['image'].sample(1).values[0]

'https://upload.wikimedia.org/wikipedia/commons/b/b5/BagleyMemorialFountainDetroit.jpg'