In [1]:
# Load Requirements
import urllib3
from bs4 import BeautifulSoup
import csv
import wikipedia
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import stop_words

# Get list of Detroit attractions from TripAdvisor

In [2]:
# urls for 6 pages of results
urls = ['https://www.tripadvisor.com/Attractions-g42139-Activities-Detroit_Michigan.html#FILTERED_LIST', 
        'https://www.tripadvisor.com/Attractions-g42139-Activities-oa30-Detroit_Michigan.html#FILTERED_LIST', 
        'https://www.tripadvisor.com/Attractions-g42139-Activities-oa60-Detroit_Michigan.html#FILTERED_LIST', 
        'https://www.tripadvisor.com/Attractions-g42139-Activities-oa90-Detroit_Michigan.html#FILTERED_LIST', 
        'https://www.tripadvisor.com/Attractions-g42139-Activities-oa120-Detroit_Michigan.html#FILTERED_LIST', 
        'https://www.tripadvisor.com/Attractions-g42139-Activities-oa150-Detroit_Michigan.html#FILTERED_LIST']

# identifies where important information is within text
breakpoints = ['#1 of 157 things to do in Detroit', 
               '#31 of 157 things to do in Detroit', 
               '#61 of 157 things to do in Detroit', 
               '#91 of 157 things to do in Detroit', 
               '#121 of 157 things to do in Detroit', 
               '#151 of 157 things to do near Detroit']

# create list of attractions
attractions = []
i = 0
for url in urls:
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data)
    text = soup.get_text().split('\n')
    text = text[(text.index(breakpoints[i])-30):text.index(' injektReviewsContent(); ')]
    i += 1
    text = [x for x in text if x != '']
    attractions.extend(text)
    
indices = [i for i, s in enumerate(attractions) if '#taplc' in s]
attractions = [attractions[i-1] for i in indices]
attractions = [x for x in attractions if '(' not in x and ')' not in x]

# save list of attractions as csv file
with open('detroit_attractions.csv', "w") as file:
    writer = csv.writer(file, lineterminator='\n')
    for attraction in attractions:
        writer.writerow([attraction])    



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


# Get text data of each Detroit attraction

In [3]:
# Specify pages with multiple wiki search results
disambiguation = {"Old St. Mary's Church, Detroit": 'St. Mary Roman Catholic Church (Detroit)', 
                  "Tiger Stadium Site, Detroit": "Tiger Stadium (Detroit)", 
                  "Elmwood Cemetary, Detroit": "Elmwood Cemetery (Detroit, Michigan)"}

# Create dataframe of data from wiki
wiki_data = pd.DataFrame(columns=['attraction', 'url', 'summary', 'image'])

for attraction in attractions:
    search = attraction + ", Detroit"
    try:
        page = wikipedia.page(search)
        print(search, ':', page.title)
    except:
        try:
            search = disambiguation[search]
            page = wikipedia.page(search)
            print(search, ':', page.title)
        except:
            print(search)
    wiki_data = wiki_data.append({'attraction': attraction, 
                                  'url': page.url, 
                                  'summary': page.summary, 
                                  'image': page.images[0]}, ignore_index=True)

# Include data from other sites where wiki did not return results
wiki_data = wiki_data.append(pd.read_csv('manual_entry.csv', encoding='windows-1252')).reindex()
wiki_data.drop_duplicates(inplace=True)
wiki_data.reset_index(drop=True, inplace=True)

# save wiki data
wiki_data.to_csv('detroit_wiki_data.csv', index=False)
wiki_data.to_json('detroit_wiki_data.json', orient='records')

Detroit Institute of Arts, Detroit : Detroit Institute of Arts
The Guardian Building, Detroit : Guardian Building
The Ford Piquette Avenue Plant, Detroit : Ford Piquette Avenue Plant
Charles H. Wright Museum of African American History, Detroit : Charles H. Wright Museum of African American History
Motown Museum, Detroit : Hitsville U.S.A.
Fisher Building, Detroit : Fisher Building
Comerica Park, Detroit : Comerica Park
Detroit Historical Museum, Detroit : Detroit Institute of Arts




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


St. Mary Roman Catholic Church (Detroit) : St. Mary Roman Catholic Church (Detroit)
Eastern Market, Detroit : Eastern Market, Detroit
Campus Martius Park, Detroit : Campus Martius Park
Detroit RiverFront, Detroit : Detroit River
Fox Theatre, Detroit : Fox Theatre (Detroit)
Belle Isle Park, Detroit : Belle Isle Park (Michigan)
Detroit Opera House, Detroit : Detroit Opera House
Ford Field, Detroit : Ford Field
Detroit People Mover, Detroit : Detroit People Mover
Dossin Great Lakes Museum, Detroit : Dossin Great Lakes Museum
Mexicantown, Detroit : Mexicantown, Detroit
Detroit Symphony Orchestra, Detroit : Detroit Symphony Orchestra
Fisher Theatre, Detroit : Fisher Building
DNR Outdoor Adventure Center, Detroit : Detroit Zoo
Joe Louis Arena, Detroit : Joe Louis Arena
Greektown, Detroit : Greektown, Detroit
Anna Scripps Whitcomb Conservatory, Detroit : Conservatory (greenhouse)
Cobo Center, Detroit : Cobo Center
The Heidelberg Project, Detroit : Heidelberg Project
Detroit Downtown, Detroit 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


# Get text data of each attraction in general list


In [4]:
base_attractions = pd.read_csv('base_attractions.csv')

In [5]:
# Specify pages with multiple wiki search results
disambiguation = {"Hangar One": "Hangar One (Mountain View, California)",
                  "Orchard Beach": "Orchard Beach (Bronx)",
                  "La Yola Beach": "La Jolla Shores",
                  "Hana": "Hana, Hawaii",
                  "Transamerica Payramid": "Transamerica Pyramid",
                  "Sea World": "SeaWorld",
                  "Broadway": "Broadway theatre"}

# Create dataframe of data from wiki
base_wiki_data = pd.DataFrame(columns=['attraction', 'url', 'summary', 'image'])
for attraction in base_attractions['Attraction']:
    search = attraction
    try:
        page = wikipedia.page(search)
    except:
        try:
            search = disambiguation[search]
            page = wikipedia.page(search)
        except:
            print(search)
    try:
        base_wiki_data = base_wiki_data.append({'attraction': attraction, 
                                                'url': page.url, 
                                                'summary': page.summary, 
                                                'image': page.images[0]}, ignore_index=True)
    except:
        base_wiki_data = base_wiki_data.append({'attraction': attraction, 
                                                'url': page.url, 
                                                'summary': page.summary}, ignore_index=True)
        
base_wiki_data.drop_duplicates(inplace=True)
base_wiki_data.reset_index(drop=True, inplace=True)

base_wiki_data.to_csv('base_wiki_data.csv', index=False)
base_wiki_data.to_json('base_wiki_data.json', orient='records')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Cape Water Tours and Taxi
Magic City
Burlington Brewery Tour
Greenbrier Beginner Falconry Experience
Saint Augustin


In [9]:
wiki_data.to_json('detroit_wiki_data.json', orient='index')
base_wiki_data.to_json('base_wiki_data.json', orient='index')