# Cleaning Data

In [56]:
import numpy as np
import scipy 
import seaborn as sns
import pandas as pd
import patsy
import matplotlib
import matplotlib.pyplot as plt
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import requests
import random
import sys
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.style.use('fivethirtyeight')
from IPython.display import display
from bs4 import BeautifulSoup


In [132]:
# import raw data

# The raw CSVs all live in one directory; keep that location in a single
# constant so moving the project means editing one line instead of three.
RAW_DATA_DIR = '/Users/caitlinmowdy/Desktop/DSI-SF-2-caitlinmowdy/capstone-hostelworld/raw-data/'

users_df = pd.read_csv(RAW_DATA_DIR + 'raw_users.csv')
revs_df = pd.read_csv(RAW_DATA_DIR + 'raw_revs.csv')
hostels_df = pd.read_csv(RAW_DATA_DIR + 'raw_hostels.csv')

In [216]:
# Preview the first two rows of every frame, each preceded by its label.

for label, frame in (("users_df.head(2)", users_df),
                     ('revs_df.head(2)', revs_df),
                     ('hostels_df.head(2)', hostels_df)):
    display(label, frame.head(2))

'users_df.head(2)'

Unnamed: 0,user_id,num_revs,nationality,group,age
0,2204119,1,Spain,,
1,3765998,1,Northern Ireland,,


'revs_df.head(2)'

Unnamed: 0.1,Unnamed: 0,text,user_id,score,hostel,location,link,country,city
0,2,"Overall the hostel was great and nice, if a li...",2384221,8.0,Hostel Bell,"Prague, Czech Republic",http://www.hostelworld.com/hosteldetails.php/H...,Czech Republic,Prague
1,3,"There is free internet computer, near the metr...",2378241,9.7,Beau Site,"Brussels, Belgium",http://www.hostelworld.com/hosteldetails.php/B...,Belgium,Brussels


'hostels_df.head(2)'

Unnamed: 0,rating,desc,FREE,GENERAL,SERVICES,FOOD_DRINK,ENTERTAINMENT,POLICIES,hostel
0,8.4,"[Please confirm your check in time with us., ,...","[Linen Included , Free City Maps , Towels Incl...","[Security Lockers, Breakfast Not Included, Hot...","[Internet Access, Luggage Storage, ]","[Tea & Coffee Making Facilities, ]",[],"[No Curfew, Non Smoking, Taxes Included,]",Hostel Bell
1,8.5,[Le Beau Site is a welcoming alternative for t...,"[Free Breakfast , Free City Maps , Free WiFi ,...","[Wheelchair Friendly, Elevator, Adaptors, Cots...","[Internet Access, Luggage Storage, Direct Dial...","[Minibar, ]",[],"[Child Friendly, Pet Friendly, Taxes Not Inclu...",Beau Site


In [154]:
# helper functions that strip HTML tags, stray whitespace/newlines, and repeated commas from scraped text

def no_space_n(value):
    """Return `value` with newlines removed first, then double spaces removed."""
    without_newlines = value.replace('\n', '')
    # Runs after the newline pass, so spaces that become adjacent once a
    # newline disappears are removed as well.
    without_double_spaces = without_newlines.replace('  ', '')
    return without_double_spaces
def no_r(value):
    """Return `value` with every carriage return ('\\r') removed."""
    pieces = value.split('\r')
    return ''.join(pieces)
def no_n(value):
    """Return `value` with every newline ('\\n') removed."""
    pieces = value.split('\n')
    return ''.join(pieces)
def no_nc(value):
    """Return `value` with every '\\n,' pair removed, then every ',\\n' pair."""
    for pair in ('\n,', ',\n'):
        value = value.replace(pair, '')
    return value
def add_c(value):
    """Return `value` with each double space replaced by a comma."""
    chunks = value.split('  ')
    return ','.join(chunks)
def del_ec(value):
    """Return `value` with triple commas deleted, then double commas collapsed to one."""
    # Triple commas are dropped entirely (not collapsed) before the double pass.
    without_triples = value.replace(',,,', '')
    return without_triples.replace(',,', ',')
def del_ec2(value):
    """Return `value` after three comma-collapsing passes.

    The passes replace ',,' then ',,,' then ',,' again, each with a single
    comma, applied in that order.
    """
    collapsed = value
    for comma_run in (',,', ',,,', ',,'):
        collapsed = collapsed.replace(comma_run, ',')
    return collapsed

def soupit(value):
    """Return the text content of `value` with markup tags stripped.

    `value` is converted with `str()`, parsed as HTML, and all of its text
    nodes are concatenated without a separator.
    """
    # Name the parser explicitly: with no parser argument BeautifulSoup picks
    # whichever one is installed (results can vary between machines) and
    # emits a GuessedAtParserWarning. 'html.parser' ships with Python.
    soup = BeautifulSoup(str(value), 'html.parser')
    return ''.join(soup.find_all(text=True))

def no_uni(value):
    """Resolve backslash escapes in `value` and drop every non-ASCII character.

    Escape sequences written out literally in the input (for example the
    four characters ``\\xe9``) are decoded with the ``unicode_escape`` codec;
    the result is then encoded to ASCII with errors ignored, which discards
    anything outside ASCII.

    Accepts both byte strings and text. Calling ``.decode`` directly on the
    input only works for byte strings: it raises AttributeError for Python 3
    ``str`` and UnicodeEncodeError for Python 2 ``unicode`` containing
    non-ASCII characters.

    Returns a byte string for byte-string input (and always on Python 2,
    where ``str`` is ``bytes``); returns ``str`` for text input on Python 3.
    """
    if isinstance(value, bytes):
        decoded = value.decode('unicode_escape')
    else:
        # Turn text into bytes without losing anything: characters up to
        # U+00FF map to the single byte unicode_escape reads back as the same
        # character, and higher code points become \uXXXX escapes.
        decoded = value.encode('latin-1', 'backslashreplace').decode('unicode_escape')

    ascii_bytes = decoded.encode('ascii', 'ignore')
    if isinstance(value, bytes) or str is bytes:
        return ascii_bytes
    return ascii_bytes.decode('ascii')

# Cleaning Hostels_DF

In [156]:
# clean hostel df

# FREE and desc only get whitespace clean-up.
hostels_df['FREE'] = hostels_df['FREE'].map(no_space_n)
hostels_df['desc'] = hostels_df['desc'].map(no_space_n).map(no_r)

# Facility columns: every listed helper is mapped over the column, in order.
# SERVICES and FOOD_DRINK deliberately stop before no_uni.
# NOTE(review): the reason for skipping no_uni on those two is not recorded;
# confirm before adding it.
facility_steps = [
    ('GENERAL',       [soupit, add_c, del_ec,  no_nc, no_uni]),
    ('SERVICES',      [soupit, add_c, del_ec,  no_nc]),
    ('FOOD_DRINK',    [soupit, add_c, del_ec,  no_nc]),
    ('ENTERTAINMENT', [soupit, add_c, del_ec,  no_nc, no_uni]),
    ('POLICIES',      [soupit, add_c, del_ec2, no_nc, no_uni]),
]
for column, steps in facility_steps:
    cleaned = hostels_df[column]
    for step in steps:
        cleaned = cleaned.map(step)
    hostels_df[column] = cleaned

# split location (disabled)

# hostels_df['country'] = [a.split(",")[1] for a in hostels_df.location]
# hostels_df['city'] = [a.split(",")[0] for a in hostels_df.location]

# drop the leftover 'Unnamed: 0' column

del hostels_df['Unnamed: 0']

In [226]:
# look at cleaned hostels_df

hostels_df.head(n=2)

Unnamed: 0,rating,desc,FREE,GENERAL,SERVICES,FOOD_DRINK,ENTERTAINMENT,POLICIES,hostel
0,8.4,"[Please confirm your check in time with us., ,...","[Linen Included , Free City Maps , Towels Incl...","[Security Lockers, Breakfast Not Included, Hot...","[Internet Access, Luggage Storage, ]","[Tea & Coffee Making Facilities, ]",[],"[No Curfew, Non Smoking, Taxes Included,]",Hostel Bell
1,8.5,[Le Beau Site is a welcoming alternative for t...,"[Free Breakfast , Free City Maps , Free WiFi ,...","[Wheelchair Friendly, Elevator, Adaptors, Cots...","[Internet Access, Luggage Storage, Direct Dial...","[Minibar, ]",[],"[Child Friendly, Pet Friendly, Taxes Not Inclu...",Beau Site


In [223]:
# save cleaned hostels

# Destination of the cleaned hostels table (written as UTF-8 CSV, index included).
clean_hostels_path = '/Users/caitlinmowdy/Desktop/DSI-SF-2-caitlinmowdy/capstone-hostelworld/clean-data/clean_hostels1.csv'
hostels_df.to_csv(clean_hostels_path, encoding='utf8')

# Cleaning Users_DF

In [180]:
# clean about users

# Remove newlines and double spaces from every `about` entry.
about_clean = users_df['about'].map(no_space_n)
users_df['about'] = about_clean

In [182]:
# split about into 3 columns

# Split each entry once and reuse the pieces for all three columns.
about_parts = [entry.split(",") for entry in users_df['about']]

# The first piece is always taken as the nationality; group and age are only
# filled in when the entry has exactly three comma-separated pieces.
users_df['nationality'] = [parts[0] for parts in about_parts]
users_df['group'] = [parts[1] if len(parts) == 3 else np.nan for parts in about_parts]
users_df['age'] = [parts[2] if len(parts) == 3 else np.nan for parts in about_parts]

# del about and unnamed col

for obsolete in ('about', 'Unnamed: 0'):
    del users_df[obsolete]

In [187]:
# look at cleaned users_df

# Display the frame cleaned in the cells above. This notebook never defines
# `users_df1`, so referencing it raises NameError on a fresh kernel.
users_df.head(2)

Unnamed: 0,user_id,num_revs,nationality,group,age
0,2204119,1,Spain,,
1,3765998,1,Northern Ireland,,


In [224]:
# save cleaned users

# Destination of the cleaned users table (written as UTF-8 CSV, index included).
clean_users_path = '/Users/caitlinmowdy/Desktop/DSI-SF-2-caitlinmowdy/capstone-hostelworld/clean-data/clean_users1.csv'
users_df.to_csv(clean_users_path, encoding='utf8')

# Cleaning Revs_DF

In [217]:
# split location

# `location` looks like "Prague, Czech Republic": the first comma-separated
# piece is the city, the second the country. Strip surrounding whitespace so
# the country is not stored with the leading space that follows the comma.
# `.str[i]` yields NaN instead of raising when a location has no such piece.
location_parts = revs_df['location'].str.split(',')
revs_df['country'] = location_parts.str[1].str.strip()
revs_df['city'] = location_parts.str[0].str.strip()

# drop the leftover 'Unnamed: 0' column
del revs_df['Unnamed: 0']

In [218]:
# preview the cleaned revs_df (first two rows)

revs_df.head(n=2)

Unnamed: 0,text,user_id,score,hostel,location,link,country,city
0,"Overall the hostel was great and nice, if a li...",2384221,8.0,Hostel Bell,"Prague, Czech Republic",http://www.hostelworld.com/hosteldetails.php/H...,Czech Republic,Prague
1,"There is free internet computer, near the metr...",2378241,9.7,Beau Site,"Brussels, Belgium",http://www.hostelworld.com/hosteldetails.php/B...,Belgium,Brussels


In [225]:
# save cleaned revs

# Destination of the cleaned reviews table (written as UTF-8 CSV, index included).
clean_revs_path = '/Users/caitlinmowdy/Desktop/DSI-SF-2-caitlinmowdy/capstone-hostelworld/clean-data/clean_revs1.csv'
revs_df.to_csv(clean_revs_path, encoding='utf8')