# Libraries

In [53]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from nltk.corpus import stopwords 
import string
from textblob import TextBlob

# Data

In [4]:
#import data on cities and attractions
df = pd.read_csv('Data.csv')

# Data Cleaning

In [5]:
#fix column names
#description -> desc
#reviewers -> revs
#tcity15cd -> towncity_id
#attraction -> attr
df.columns = df.columns.str.lower().str.replace('tcity15cd', 'towncity_id').str.replace('attraction', 'attr').str.replace('description', 'desc').str.replace('reviewers', 'revs').str.replace(' ', '_')

In [6]:
#drop null rows
df.drop(axis = 0, index = [109, 110, 111], inplace = True)

In [7]:
#removing trailing white space from town/city 
df['town/city'] = [x.rstrip() for x in df['town/city']]

In [8]:
#fill missing values in reviews
#df[['revs_1', 'revs_2', 'revs_3', 'revs_4', 'revs_5', 'revs_6', 'revs_7', 'revs_8', 'revs_9']] = df[['revs_1', 'revs_2', 'revs_3', 'revs_4', 'revs_5', 'revs_6', 'revs_7', 'revs_8', 'revs_9']].fillna(0)

#change review data type to int
#df[['revs_1', 'revs_2', 'revs_3', 'revs_4', 'revs_5', 'revs_6', 'revs_7', 'revs_8', 'revs_9']] = df[['revs_1', 'revs_2', 'revs_3', 'revs_4', 'revs_5', 'revs_6', 'revs_7', 'revs_8', 'revs_9']].astype('int64')

#create total reviews column
#df['total_revs'] = df.revs_1 + df.revs_2 + df.revs_3 + df.revs_4 + df.revs_5 + df.revs_6 + df.revs_7 + df.revs_7 + df.revs_9

In [9]:
#drop review columns since they will not be used
df.drop(columns = ['revs_1', 'revs_2', 'revs_3', 'revs_4', 'revs_5', 'revs_6', 'revs_7', 'revs_8', 'revs_9'], inplace=True)

#drop ratings columns since they will not be used
df.drop(columns = ['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5', 'rating_6', 'rating_7', 'rating_8', 'rating_9'], inplace=True)

In [10]:
#fix naming error
df.rename(columns = {'attr_6.1':'attr_7'}, inplace = True)

In [11]:
#set data type to string so the function will concat
df[['attr_1', 'desc_1', 'attr_2', 'desc_2', 'attr_3', 'desc_3', 'attr_4', 'desc_4', 'attr_5', 'desc_5', 'attr_6', 'desc_6', 'attr_7', 'desc_7', 'attr_8', 'desc_8', 'attr_9', 'desc_9']] = df[['attr_1', 'desc_1', 'attr_2', 'desc_2', 'attr_3', 'desc_3', 'attr_4', 'desc_4', 'attr_5', 'desc_5', 'attr_6', 'desc_6', 'attr_7', 'desc_7', 'attr_8', 'desc_8', 'attr_9', 'desc_9']].astype('str')

In [12]:
#change tags data type to string
df.tags = df.tags.astype('str')

In [13]:
#replace nulls in the descriptions and attractions
df[['attr_1', 'desc_1', 'attr_2', 'desc_2', 'attr_3', 'desc_3', 'attr_4', 'desc_4', 'attr_5', 'desc_5', 'attr_6', 'desc_6', 'attr_7', 'desc_7', 'attr_8', 'desc_8', 'attr_9', 'desc_9']] = df[['attr_1', 'desc_1', 'attr_2', 'desc_2', 'attr_3', 'desc_3', 'attr_4', 'desc_4', 'attr_5', 'desc_5', 'attr_6', 'desc_6', 'attr_7', 'desc_7', 'attr_8', 'desc_8', 'attr_9', 'desc_9']].fillna('none')

# Data Preparation

In [14]:
#create simple id column for use in model
df['simple_id'] = range(1,110)

In [57]:
#create stopwords
stpwrd = stopwords.words('english')
stpwrd.extend('none')
stpwrd.extend(string.punctuation)

In [25]:
# create keywords function
def get_keywords(df):
    '''Concats the strings from the attraction columns to form a list of keywords'''
    keywords = []
    for i in range(0, df.shape[0]):
        keywords.append(df['attr_1'][i] + ' ' + df['attr_2'][i] + ' ' + df['attr_3'][i] + ' ' + df['attr_4'][i] + ' ' + df['attr_5'][i] + ' ' + df['attr_6'][i] + ' ' + df['attr_7'][i] + ' ' + df['attr_8'][i] + ' ' + df['attr_9'][i] + ' ' + df['desc_1'][i] + ' ' + df['desc_2'][i] + ' ' + df['desc_3'][i] + ' ' + df['desc_4'][i] + ' ' + df['desc_5'][i] + ' ' + df['desc_6'][i] + ' ' + df['desc_7'][i] + ' ' + df['desc_8'][i] + ' ' + df['desc_9'][i])

    return keywords

## Testing new function with nlp

In [50]:
def get_keywords(df):
    '''Concats the strings from the attraction columns to form a list of keywords'''
    keywords = []
    
    for i in range(0, df.shape[0]):
        strings = df['attr_1'][i] + ' ' + df['attr_2'][i] + ' ' + df['attr_3'][i] + ' ' + df['attr_4'][i] + ' ' + df['attr_5'][i] + ' ' + df['attr_6'][i] + ' ' + df['attr_7'][i] + ' ' + df['attr_8'][i] + ' ' + df['attr_9'][i] + ' ' + df['desc_1'][i] + ' ' + df['desc_2'][i] + ' ' + df['desc_3'][i] + ' ' + df['desc_4'][i] + ' ' + df['desc_5'][i] + ' ' + df['desc_6'][i] + ' ' + df['desc_7'][i] + ' ' + df['desc_8'][i] + ' ' + df['desc_9'][i]
        strings = ''.join(str(word) for word in strings)
        strings = TextBlob(strings)
        strings = strings.words
        strings = strings.lemmatize()
        strings = strings.lower()

        keywords = list(strings)

    print(keywords)


In [51]:
get_keywords(df)

['york', 'minster', 'national', 'railway', 'museum', 'york', 'jorvik', 'viking', 'centre', 'shambles', 'clifford', "'s", 'tower', 'york', 'york', 'castle', 'museum', 'yorkshire', 'museum', 'york', 'city', 'walls', 'york', "'s", 'chocolate', 'story', 'iconic', 'gothic-style', 'medieval', 'cathedral', 'vintage', 'train', 'exhibition', 'activity', 'time', 'capsule-style', 'viking', 'attraction', 'historic', 'street', 'with', 'medieval', 'building', 'castle', 'keep', 'once', 'used', 'a', 'prison', 'mint', 'set', 'around', 'a', 'recreated', 'victorian', 'street', 'buried', 'treasure', 'natural', 'history', 'ruins', 'of', 'ancient', 'stone', 'structure', 'guided', 'tour', 'through', 'chocolate', 'history']


In [27]:
#create keywords column for use in model
df['keywords'] = get_keywords(df)