In [2]:
import urllib3
from bs4 import BeautifulSoup
import csv
import wikipedia
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import stop_words

In [3]:
# Read both CSV files (paths are relative to the working directory).
# Later cells use the `summary` and `attraction` columns of these frames.
base_data, detroit_data = (
    pd.read_csv(csv_name)
    for csv_name in ("base_wiki_data.csv", "detroit_wiki_data.csv")
)

In [4]:
def train_model(detroit_data, n_clusters=7, random_state=22):
    """Fit a TF-IDF + k-means pipeline on the ``summary`` column of a frame.

    Parameters
    ----------
    detroit_data : pandas.DataFrame
        Frame with a ``summary`` column holding the text documents to cluster.
    n_clusters : int, default 7
        Number of k-means clusters requested. It is capped at the number of
        rows, because KMeans cannot fit more clusters than samples.
    random_state : int, default 22
        Seed handed to KMeans so repeated fits on the same data agree.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, with steps named ``vectorize`` and ``cluster``.

    Raises
    ------
    ValueError
        If ``detroit_data`` has no rows.
    """
    X = detroit_data["summary"]
    if len(X) == 0:
        raise ValueError("train_model needs at least one summary to cluster")

    # Runs of letters/digits that are immediately followed by whitespace.
    # Because of the look-ahead, a word directly followed by punctuation or by
    # the end of the text is not emitted as a token.
    TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

    cluster_pipeline = Pipeline([
        ('vectorize', TfidfVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                      ngram_range=(1, 2),
                                      # 'english' selects scikit-learn's built-in
                                      # stop-word list, so this function does not
                                      # need the deprecated `stop_words` module.
                                      stop_words='english')),
        ('cluster', KMeans(n_clusters=min(n_clusters, len(X)),
                           random_state=random_state))
    ])

    cluster_pipeline.fit(X)

    return cluster_pipeline

In [5]:
def predict_cluster(attraction_row, cluster_pipeline):
    """Return the cluster label(s) a fitted pipeline assigns to summaries.

    Parameters
    ----------
    attraction_row : pandas.DataFrame or pandas.Series
        Object whose ``"summary"`` entry holds the text to classify. A
        DataFrame yields one label per row; a single row passed as a Series
        (where ``"summary"`` is one string) yields one label.
    cluster_pipeline : object
        Fitted pipeline exposing ``predict(documents)``.

    Returns
    -------
    Whatever ``cluster_pipeline.predict`` returns for the documents.
    """
    X = attraction_row["summary"]

    # A single row gives one bare string. The vectorizer expects an iterable
    # of documents, so wrap the string as a one-document list.
    if isinstance(X, str):
        X = [X]

    return cluster_pipeline.predict(X)

In [6]:
# Fit the clustering pipeline on the Detroit summaries, then store each
# Detroit row's predicted label on a copy so `detroit_data` stays untouched.
cluster_pipeline = train_model(detroit_data)
detroit_data_w_clusters = detroit_data.copy()
detroit_data_w_clusters['cluster'] = predict_cluster(detroit_data, cluster_pipeline)

# Draw one attraction from the base set. The draw is unseeded, so a different
# attraction can be picked on every run.
sample_attraction = base_data.sample(1)
print('random attraction:', sample_attraction['attraction'].values)

# Show the Detroit attractions that share the sampled attraction's cluster.
selected_cluster = predict_cluster(sample_attraction, cluster_pipeline)
in_selected_cluster = detroit_data_w_clusters['cluster'] == selected_cluster[0]
detroit_data_w_clusters.loc[in_selected_cluster, 'attraction']

random attraction: ['Albuquerque Sunrise Balloon Rides']


3      Charles H. Wright Museum of African American H...
11                                    Detroit RiverFront
17                             Dossin Great Lakes Museum
18                                           Mexicantown
21                          DNR Outdoor Adventure Center
24                    Anna Scripps Whitcomb Conservatory
27                                      Detroit Downtown
29                                 The Spirit of Detroit
31                                Detroit-Windsor Tunnel
32                                  The Fillmore Detroit
33                                   The Redford Theatre
38                                     Two James Spirits
39                               Michigan Science Center
40                                        Masonic Temple
41                                             The Z Lot
44                                   Sound Board Theater
45                              MGM Grand Detroit Casino
52                             

In [7]:
# Display the sampled row so all of its stored fields can be inspected.
sample_attraction

Unnamed: 0,attraction,url,summary,image
96,Albuquerque Sunrise Balloon Rides,https://en.wikipedia.org/wiki/Albuquerque_Inte...,The Albuquerque International Balloon Fiesta i...,https://upload.wikimedia.org/wikipedia/commons...


In [8]:
# Call the fitted pipeline directly on the sampled summary, without going
# through the predict_cluster wrapper.
cluster_pipeline.predict(sample_attraction['summary'])

array([4], dtype=int32)

In [9]:
# Display the summary text that was fed to the pipeline for the sampled row.
sample_attraction['summary']

96    The Albuquerque International Balloon Fiesta i...
Name: summary, dtype: object

In [10]:
# Predict a cluster label for every base attraction once, then reuse the
# result both for the new column and for the displayed array. The labels come
# from the `summary` column, which is the same in both frames.
base_clusters = predict_cluster(base_data, cluster_pipeline)

# assign() returns a new frame, so `base_data` is left without a 'cluster' column.
base_data_w_clusters = base_data.assign(cluster=base_clusters)

base_clusters

array([4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 0, 4, 4, 4, 3, 4, 4, 4, 3, 4,
       4, 4, 4, 4, 4, 0, 3, 4, 4, 4, 4, 4, 4, 2, 4, 3, 3, 4, 4, 4, 4, 2,
       4, 4, 0, 4, 3, 4, 4, 4, 4, 2, 2, 3, 2, 6, 2, 2, 2, 2, 2, 2, 2, 4,
       4, 4, 3, 4, 4, 2, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 3, 4, 4,
       4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4,
       4, 4, 3, 4, 3, 3, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 3, 4, 4, 4, 4, 3, 4, 4, 0, 4, 4, 4, 4, 4, 3, 4], dtype=int32)

In [11]:
# Names of the base attractions labelled as cluster 4. The cluster id is
# hard-coded here rather than taken from `selected_cluster`.
base_data_w_clusters['attraction'][base_data_w_clusters['cluster'].eq(4)]

0                                          Time Square
2                                        Union Station
3                                      Las Vegas Strip
4                               Grand Central Terminal
5                   "Magic Kingdom, Walt Disney World"
6                                    Disneyland Resort
7                                   Golden Gate Bridge
8                             Faneuil Hall Marketplace
9                                     Golden Gate Park
11                          "Epcot, Walt Disney World"
13        "Disney's Animal Kingdom, Walt Disney World"
14     "Disney's Hollywood Studios, Walt Disney World"
15                 Great Smoky Mountains National Park
17                                     Mackinac Bridge
18                                           Navy Pier
19                        "Pier 39, Fisherman's Wharf"
21                                Cranbrook Art Museum
22                             Grand Rapids Art Museum
23        

# With Loop: repeatedly re-cluster the Detroit attractions and keep one cluster until at most 15 remain

In [12]:
# Read the same two CSV files again, rebinding both names to freshly loaded frames.
base_data, detroit_data = (pd.read_csv("base_wiki_data.csv"),
                           pd.read_csv("detroit_wiki_data.csv"))

In [13]:
# Start from every Detroit attraction and narrow the set down: cluster the
# current candidates, pick one candidate at random, and keep only the rows in
# that candidate's cluster. Repeat until at most 15 rows remain.
recommendations = detroit_data.copy()

while len(recommendations) > 15:
    size_before = len(recommendations)
    cluster_model = train_model(recommendations)

    # assign() builds a new frame, so the 'cluster' column is not written back
    # onto `recommendations` through a shared reference.
    recommendations_w_clusters = recommendations.assign(
        cluster=predict_cluster(recommendations, cluster_model))

    random_location = recommendations_w_clusters.sample(1) # This would happen in java
    chosen_cluster = random_location['cluster'].values[0]

    recommendations = recommendations_w_clusters.loc[
        recommendations_w_clusters['cluster'] == chosen_cluster,
        ['attraction', 'url', 'summary', 'image']].copy()

    # A bare `len(...)` expression inside a loop displays nothing; print it so
    # the shrinking candidate count is visible.
    print(len(recommendations))

    # If every row landed in the chosen cluster (e.g. all summaries are
    # identical), another pass could not shrink the set either. Stop instead
    # of looping forever.
    if len(recommendations) == size_before:
        break

In [14]:
# Display the recommendation set left once the narrowing loop has finished.
recommendations

Unnamed: 0,attraction,url,summary,image
77,Woodlawn Cemetery,https://en.wikipedia.org/wiki/Woodlawn_Cemeter...,Woodlawn Cemetery is a cemetery located at 199...,https://upload.wikimedia.org/wikipedia/commons...
86,Elmwood Cemetary,https://en.wikipedia.org/wiki/Elmwood_Cemetery...,Elmwood Cemetery in Detroit is one of Michigan...,https://upload.wikimedia.org/wikipedia/commons...
87,PuppetART Theater,https://en.wikipedia.org/wiki/Elmwood_Cemetery...,Elmwood Cemetery in Detroit is one of Michigan...,https://upload.wikimedia.org/wikipedia/commons...
88,Dabl's Gallery,https://en.wikipedia.org/wiki/Elmwood_Cemetery...,Elmwood Cemetery in Detroit is one of Michigan...,https://upload.wikimedia.org/wikipedia/commons...
100,Mount Hazel Cemetery,https://en.wikipedia.org/wiki/M-102_(Michigan_...,M-102 is an east–west state trunkline highway ...,https://upload.wikimedia.org/wikipedia/commons...
