# Cleaning scraped WikiVoyage descriptions

In [1]:
import pickle
import re
import unicodedata

import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm as tqdm  # progress bar for x in tqdm(range(100))

In [2]:
# Load the pickled city table (scores, costs and the wiki link for each city).
# NOTE: pickle.load can execute arbitrary code -- only open files you created yourself.
with open("city_info_final.pkl", "rb") as city_info_file:
    city_info_final = pickle.load(city_info_file)

In [3]:
# Preview the first rows of the city table loaded above.
city_info_final.head()

Unnamed: 0,city,country,city_and_country,score,cost/month,fun,safety,quality_of_life,walkability,happiness,nightlife,wifi,friendly_to_foreigners,english,avg_trip_length_days,return_rate_percent,hotel_price_night,airbnb_price_night,visitors,link
0,buenos-aires,Argentina,buenos aires argentina,4.88,1026.0,Good,Okay,Good,Great,Good,Great,Okay,Good,Okay,25.0,16.0,34.0,24.0,"[/@krausefx, /@aczuleta, /@alexanderjoo, /@sil...",/wiki/Buenos_Aires
1,bangkok,Thailand,bangkok thailand,4.73,1522.0,Good,Good,Good,Great,Good,Great,Great,Great,Okay,7.0,18.0,31.0,51.0,"[/@remyp, /@manas, /@timrael, /@dimqen, /@dani...",/wiki/Bangkok
2,mexico-city,Mexico,mexico city mexico,4.72,1493.0,Good,Okay,Good,Great,Good,Good,Okay,Good,Okay,14.0,14.0,30.0,31.0,"[/@rohit, /@nadiaronquillo, /@evelienal, /@bri...",/wiki/Mexico_City/Santa_Fe
3,canggu,Indonesia,canggu indonesia,4.69,1389.0,Good,Great,Good,Okay,Okay,Good,Good,Great,Good,28.0,17.0,21.0,58.0,"[/@tris, /@joytravels, /@guar47, /@mariebriand...",/wiki/Canggu
4,chiang-mai,Thailand,chiang mai thailand,4.68,1126.0,Good,Great,Good,Great,Good,Okay,Good,Good,Okay,28.0,15.0,25.0,41.0,"[/@meedamian, /@zapperen, /@john, /@kymellis, ...",/wiki/Chiang_Mai


# DataFrame

In [4]:
# Load the pickled page texts scraped from the wiki (one entry per city).
# NOTE: pickle.load can execute arbitrary code -- only open files you created yourself.
with open("wiki_voyage_content.pkl", "rb") as content_file:
    wiki_voyage_content = pickle.load(content_file)

In [5]:
# convert to dataframe
# Wrap the scraped texts in a DataFrame and label its single column 'content'
# (the constructor would otherwise leave the default column label).
wiki_voyage_content_df = pd.DataFrame(wiki_voyage_content)
wiki_voyage_content_df.columns=['content']

In [6]:
# Preview the first rows of the scraped page texts.
wiki_voyage_content_df.head()

Unnamed: 0,content
0,\n Buenos Aires is the capital of Argentina.\n...
1,\n Bangkok (Thai: กรุงเทพฯ Krung Thep) is the ...
2,\n Santa Fe is in the western area of Mexico C...
3,"\nCanggu is a beach area in South Bali, north ..."
4,\n Chiang Mai (เชียงใหม่) is the hub of northe...


# Corpus

In [7]:
# `corpus` is an alias for the loaded page texts (same object, not a copy);
# the processing loop further down iterates over it one document at a time.
corpus = wiki_voyage_content

# NLP

In [7]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec, Phrases
from gensim.summarization import summarize
from gensim.models.fasttext import FastText
from gensim.summarization.textcleaner import clean_text_by_word, get_sentences
from gensim.models.phrases import Phraser
from gensim.parsing.preprocessing import STOPWORDS

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation


In [8]:
def keep_only_letters_and_numbers(doc):
    """Reduce ``doc`` to ASCII letters and digits separated by single spaces.

    The input is coerced with ``str()``. Accented letters are first folded to
    their base letter (NFD decomposition, combining marks dropped) so that
    e.g. "Café" becomes "Cafe" instead of being split into "Caf". Every
    remaining run of characters outside ``A-Za-z0-9`` is then replaced by one
    space, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    doc : object
        Text to clean; anything accepted by ``str()``.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    # Without this step the ASCII-only regex below turns accented letters into
    # spaces, fragmenting words before any later de-accenting can see them.
    decomposed = unicodedata.normalize('NFD', str(doc))
    unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub('[^A-Za-z0-9]+', ' ', unaccented).strip()

In [9]:
def process_doc(doc):
    """Tokenize ``doc`` with gensim's ``simple_preprocess``.

    The text is lower-cased and split into tokens, accent marks are removed
    (``deacc=True``), and tokens shorter than 2 or longer than 15 characters
    are ignored.

    Returns the resulting list of tokens.
    """
    text = str(doc)
    tokens = simple_preprocess(text, deacc=True, min_len=2, max_len=15)
    return tokens

In [10]:
def lemmatize(doc):
    """Return the lemmas of the tokens in ``doc`` joined by single spaces.

    A token whose lemma is the ``'-PRON-'`` placeholder contributes its
    lower-cased text (``token.lower_``) instead of the placeholder.

    Note that the result is a plain ``str``, not a token container.
    """
    return ' '.join(
        token.lower_ if token.lemma_ == '-PRON-' else token.lemma_
        for token in doc
    )

In [11]:
def remove_all_stopwords(doc, stopwords):
    """Return the tokens of ``doc`` that are not stop words.

    Parameters
    ----------
    doc : iterable
        Tokens to filter; consumed once, so a generator is fine.
    stopwords : container
        Anything supporting ``in`` membership tests.

    Returns
    -------
    list
        Surviving tokens in their original order, with any empty-list
        tokens (``[]``) dropped as well.
    """
    kept = []
    for candidate in doc:
        if candidate in stopwords:
            continue
        # remove empty list tokens
        if candidate != []:
            kept.append(candidate)
    return kept

## Pipeline

In [17]:
# Load the `en_core_web_lg` English model.
nlp = spacy.load('en_core_web_lg')

# Append the custom `lemmatize` function as a pipeline component. It returns a
# plain string rather than a Doc, so any component placed after it would
# receive a str -- keep it last.
nlp.add_pipe(lemmatize, name='lemmatize')

# Display the component names to confirm the registration.
nlp.pipe_names

['tagger', 'parser', 'ner', 'lemmatize']

In [18]:
# Stop-word list stored as CSV (the "github" list mentioned below).
stopwords = pd.read_csv('news_stopwords_1k.csv')

# Iterating a DataFrame (which is what list(stopwords) does) yields only its
# column labels, so on its own it never contributes the words stored in the
# rows. Flatten every non-null cell to a stripped string; the column labels are
# still included so nothing the previous list contained is lost.
csv_stopwords = list(stopwords) + [
    str(value).strip()
    for value in stopwords.values.ravel()
    if pd.notna(value)
]

# gensim, spacy, github stopwords
# Kept as a list because a later cell calls all_stopwords.extend(...).
all_stopwords = list(set(list(STOP_WORDS) + list(STOPWORDS) + csv_stopwords))

In [19]:
# add the names of the cities and countries to the stopwords list
# we dont want named entities affecting the recommendations too much
place_tokens = {
    token
    for place in city_info_final['city_and_country']
    for token in place.split()
}
# Only append tokens that are not already present, so re-running this cell
# does not keep growing the list with duplicates.
all_stopwords.extend(sorted(place_tokens.difference(all_stopwords)))

In [21]:
# process each doc
# Build the lookup once: membership tests against a set are O(1), whereas the
# stop-word list would be scanned from the start for every single token.
stopword_lookup = set(all_stopwords)

tokenized_corpus = []
for doc in tqdm(corpus):
    # 1. strip everything except letters/digits, 2. tokenize
    tokens = process_doc(keep_only_letters_and_numbers(doc))
    # 3. run each token through the spaCy pipeline (nlp.pipe treats every
    #    token as its own text), 4. drop stop words
    kept_tokens = remove_all_stopwords(nlp.pipe(tokens), stopword_lookup)
    # Each result stays wrapped in a one-element list: the next cell reads
    # tokenized_corpus[i][0].
    tokenized_corpus.append([kept_tokens])

In [38]:
# Every corpus entry is a one-element wrapper list; unwrap it so each document
# maps directly to its own list of tokens.
tokenized_corpus_final = [wrapped_doc[0] for wrapped_doc in tokenized_corpus]

In [None]:
# Save the cleaned token lists (a plain Python list with one entry per
# document, not a DataFrame).
with open('tokenized_corpus_final2.pkl', 'wb') as picklefile:
    pickle.dump(tokenized_corpus_final, picklefile)