## Basic setting up

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import sqlalchemy
import datetime

In [2]:
listings = pd.read_csv("barcelona_listings.csv")

In [3]:
reviews = pd.read_csv("barcelona_reviews.csv")

In [4]:
calendar = pd.read_csv("barcelona_calendar.csv")

In [5]:
engine = create_engine("sqlite:///project.db")

In [6]:
def null_count(x):
    return listings[x].isna().sum()# / len(listings[x])

def unique_count(x):
    return listings.nunique()# / listings.shape[0] * 100

#for i,c in enumerate(listings.columns):
 #   print(c, unique_count(c)[i])

## Import accommodates

In [7]:
df = pd.DataFrame(listings.accommodates.unique(), columns=["accommodates"])

In [9]:
df.to_sql("Accommodation", engine, if_exists="append", index=False, dtype={"accommodates" : sqlalchemy.INT})

In [None]:
engine.execute("select count(*) from Accommodation ").fetchall()

## Import amenities

In [7]:
amenities = pd.DataFrame(listings.amenities.unique(), columns=["amenities"])

In [8]:
temp = []
for index, row in amenities.iterrows():
    temp.append(row["amenities"].split(','))

In [9]:
flat_list = []
for sublist in temp:
    for item in sublist:
        flat_list.append(item)

In [10]:
def remove_chars(item):
    for c in item:
        if c == "{" or c == "}" or c == "'" or c == '"' or c == "[" or c == "]":
            item = item.replace(c, "")
        item = item.strip().lower()
    return item

In [21]:
new_list = []
for item in flat_list:
    new_list.append('"' + remove_chars(item) + '"')
    

In [12]:
new_list = list(set(new_list))

In [13]:
new_list.remove('""')

In [91]:
new_list

['"espresso machine"',
 '"handheld shower head"',
 '"carbon monoxide detector"',
 '"stair gates"',
 '"elevator"',
 '"pets live on this property"',
 '"baby monitor"',
 '"ski-in/ski-out"',
 '"disabled parking spot"',
 '"building staff"',
 '"coffee maker"',
 '"private pool"',
 '"smart lock"',
 '"gym"',
 '"day bed"',
 '"accessible-height bed"',
 '"warming drawer"',
 '"shared pool"',
 '"microwave"',
 '"cooking basics"',
 '"wheelchair accessible"',
 '"pack ’n play/travel crib"',
 '"electric profiling bed"',
 '"free street parking"',
 '"balcony"',
 '"pillow-top mattress"',
 '"bidet"',
 '"air conditioning"',
 '"hbo go"',
 '"baby bath"',
 '"washer"',
 '"private living room"',
 '"single level home"',
 '"beachfront"',
 '"outdoor parking"',
 '"netflix"',
 '"dishes and silverware"',
 '"bath towel"',
 '"cat(s)"',
 '"sun loungers"',
 '"table corner guards"',
 '"changing table"',
 '"bbq grill"',
 '"dishwasher"',
 '"luggage dropoff allowed"',
 '"memory foam mattress"',
 '"free parking on premises"',
 '

In [14]:
amenities = pd.DataFrame(new_list, columns=["amenities"])

In [93]:
amenities.to_sql("Amenities", engine, if_exists="append", index=False, dtype={"amenities" : sqlalchemy.CHAR(32)})

In [24]:
listings.id.count()

13095

In [29]:
len(listings["amenities"].unique() == listings.amenities.unique())

11837

## Import Bathrooms

In [61]:
bathrooms = pd.DataFrame(listings.bathrooms.unique(), columns=["bathrooms"])

In [62]:
bathrooms = bathrooms.dropna()

In [63]:
bathrooms.to_sql("Bathrooms", engine, if_exists="append", index=False, dtype={"bathrooms" : sqlalchemy.FLOAT})

## Import Description

In [10]:
description = pd.DataFrame({"description_id": np.arange(len(listings.description)), "summary": listings.summary, "space": listings.space, "description": listings.description, "neighborhood_overview": listings.neighborhood_overview,"notes":listings.notes,"transit":listings.transit,"access":listings.access,"picture_url":listings.picture_url,"square_feet": listings.square_feet, "id": listings.id})

In [11]:
description["space"].fillna("",inplace=True)
description["neighborhood_overview"].fillna("",inplace=True)
description["description"].fillna("",inplace=True)
description["summary"].fillna("",inplace=True)
description["notes"].fillna("",inplace=True)
description["transit"].fillna("",inplace=True)
description["access"].fillna("",inplace=True)
description["picture_url"].fillna("",inplace=True)
description["square_feet"].fillna(-1,inplace=True)


In [14]:
description.to_sql("Described_Description", engine, if_exists="append", index=False, dtype={"description_id": sqlalchemy.INT, "summary": sqlalchemy.VARCHAR(1024), "space": sqlalchemy.VARCHAR(1024), "description": sqlalchemy.VARCHAR(1024), "neighborhood_overview": sqlalchemy.VARCHAR(1024),"notes":sqlalchemy.VARCHAR(1024),"transit": sqlalchemy.VARCHAR(1024),"access": sqlalchemy.VARCHAR(1024),"picture_url":sqlalchemy.CHAR(128),"square_feet": sqlalchemy.FLOAT, "id": sqlalchemy.INT})

## Import bedding

In [19]:
bedding = pd.DataFrame({"beds": listings.beds, "bed_type": listings.bed_type})

In [21]:
bedding = bedding.fillna(0).drop_duplicates()


In [22]:
bedding.to_sql("Bedding", engine, if_exists="append", index=False, dtype={"beds" : sqlalchemy.FLOAT, "bed_type" : sqlalchemy.CHAR(32)})

## Import Bedrooms

In [4]:
bedrooms = pd.DataFrame({"bedrooms": listings.bedrooms})

In [10]:
bedrooms = bedrooms.dropna().drop_duplicates()

In [11]:
bedrooms

Unnamed: 0,bedrooms
0,2.0
1,3.0
2,1.0
12,4.0
77,0.0
97,5.0
261,16.0
308,6.0
1436,12.0
2312,8.0


In [12]:
bedrooms.astype(int)

Unnamed: 0,bedrooms
0,2
1,3
2,1
12,4
77,0
97,5
261,16
308,6
1436,12
2312,8


In [13]:
bedrooms.to_sql("Bedrooms", engine, if_exists="append", index=False, dtype={"bedrooms" : sqlalchemy.INT})

## Import Room

In [14]:
room = pd.DataFrame({"room_type": listings.room_type})

In [17]:
room = room.drop_duplicates()

In [18]:
room.to_sql("Room", engine, if_exists="append", index=False, dtype={"room_type" : sqlalchemy.CHAR(32)})

## Import Property

In [19]:
prop = pd.DataFrame({"property_type": listings.property_type})

In [23]:
prop = prop.drop_duplicates()

In [24]:
prop.to_sql("Property", engine, if_exists="append", index=False, dtype={"propery_type" : sqlalchemy.CHAR(32)})

## Import Reviewer

In [29]:
reviewer = pd.DataFrame({"reviewer_id": reviews.reviewer_id, "reviewer_name" : reviews.reviewer_name})

In [50]:
reviewer = reviewer.drop_duplicates().drop(174201)

In [51]:
reviewer.to_sql("Reviewer", engine, if_exists="append", index=False, dtype={"reviewer_id" : sqlalchemy.INT, "reviewer_name" : sqlalchemy.CHAR(32)})

In [49]:
reviewer[reviewer.duplicated(['reviewer_id'], keep=False)]

Unnamed: 0,reviewer_id,reviewer_name
174201,6148697,Casa Nuna
179938,6148697,Mi Casa Bali


## Import Reviewed TODO

In [94]:
reviewed = pd.DataFrame({"id": reviews.listing_id, "reviewer_id" : reviews.reviewer_id, "comments": reviews.comments, "review_date" : reviews.date})

In [95]:
reviewed

Unnamed: 0,id,reviewer_id,comments,review_date
0,18666,1559265,Very nice flat. We had easy transportation to ...,2015-10-10
1,18674,4841196,"Great location. Clean, spacious flat. Would re...",2013-05-27
2,18674,11600277,Mi mejor recomendación para este departamento....,2014-03-02
3,18674,35231385,"Big apartment, well equipped.\nVery good servi...",2015-08-04
4,18674,23223644,The Check in was fast and flexible. The price ...,2016-06-20
5,18674,4756672,Great location and enough space in the apartme...,2018-06-18
6,21605,57647372,Meritxell était très accueillante et nous a mi...,2016-05-08
7,21605,62165990,Con mi esposa nos hospedamos en casa de Michae...,2016-05-22
8,21605,67778905,Even though the hosts were not there at the mo...,2016-05-27
9,21605,64869468,La habitación es perfecta para una pareja y la...,2016-05-29


In [100]:
reviewed[reviewed.duplicated(['id', "review_date","reviewer_id"], keep=False)]

Unnamed: 0,id,reviewer_id,comments,review_date
134162,1996730,85117476,"El apartamiento es increible, muy espacioso y ...",2016-07-23
134163,1996730,85117476,La segunda vez que me quedo en su piso y como ...,2016-07-23
210431,3823973,24495890,The reservation was canceled 17 days before ar...,2015-03-08
210796,3823973,24495890,The reservation was canceled 24 days before ar...,2015-03-08
212882,3823973,24495890,The reservation was canceled 10 days before ar...,2015-03-08
263426,6449766,57829170,The host canceled this reservation 24 days bef...,2018-06-05
263427,6449766,57829170,The host canceled this reservation 33 days bef...,2018-06-05
378123,16347842,19641299,The host canceled this reservation 11 days bef...,2017-11-16
378124,16347842,19641299,The host canceled this reservation 7 days befo...,2017-11-16


#### We need to change the primary key to (id, reviewer_id, date) and decide what to do with the duplicates

## Import Pricing

In [118]:
pricing = pd.DataFrame({"price": listings.price, "weekly_price" : listings.weekly_price, "monthly_price": listings.monthly_price, "security_deposit" : listings.security_deposit, "cleaning_fee" : listings.cleaning_fee, "guests_included" : listings.guests_included, "extra_people" : listings.extra_people, "minimum_nights" : listings.minimum_nights, "maximum_nights" : listings.maximum_nights, "id" : listings.id})

In [119]:
for i in pricing.index:
    pricing.at[i, "price"] = float(str(pricing.at[i, "price"])[1:].replace(",", ""))
    if not pd.isnull(pricing.at[i, "weekly_price"]):
        pricing.at[i, "weekly_price"] = float(str(pricing.at[i, "weekly_price"])[1:].replace(",", ""))
    if not pd.isnull(pricing.at[i, "monthly_price"]):
        pricing.at[i, "monthly_price"] = float(str(pricing.at[i, "monthly_price"])[1:].replace(",", ""))
    if not pd.isnull(pricing.at[i, "security_deposit"]):
        pricing.at[i, "security_deposit"] = float(str(pricing.at[i, "security_deposit"])[1:].replace(",", ""))
    if not pd.isnull(pricing.at[i, "cleaning_fee"]):
        pricing.at[i, "cleaning_fee"] = float(str(pricing.at[i, "cleaning_fee"])[1:].replace(",", ""))
    if not pd.isnull(pricing.at[i, "extra_people"]):
        pricing.at[i, "extra_people"] = float(str(pricing.at[i, "extra_people"])[1:].replace(",", ""))

In [120]:
for i in pricing.index:
    if pd.isnull(pricing.at[i, "weekly_price"]):
        pricing.at[i, "weekly_price"] = pricing.at[i, "price"] * 7
    if pd.isnull(pricing.at[i, "monthly_price"]):
        pricing.at[i, "monthly_price"] = pricing.at[i, "price"] * 30
    if pd.isnull(pricing.at[i, "security_deposit"]):
        pricing.at[i, "security_deposit"] = 0
    if pd.isnull(pricing.at[i, "cleaning_fee"]):
        pricing.at[i, "cleaning_fee"] = 0

In [123]:
pricing.to_sql("Pricing", engine, if_exists="append", index=False, dtype={"price": sqlalchemy.FLOAT, "weekly_price" : sqlalchemy.FLOAT, "monthly_price": sqlalchemy.FLOAT, "security_deposit" : sqlalchemy.FLOAT, "cleaning_fee" : sqlalchemy.FLOAT, "guests_included" : sqlalchemy.INT, "extra_people" : sqlalchemy.FLOAT, "minimum_nights" : sqlalchemy.INT, "maximum_nights" : sqlalchemy.INT, "id" : sqlalchemy.INT})

## Import Verification

In [50]:
verification = pd.DataFrame(listings.host_verifications.unique(), columns=["host_verifications"])

In [51]:
temp = []
for index, row in verification.iterrows():
    temp.append(row["host_verifications"].split(','))

In [52]:
flat_list = []
for sublist in temp:
    for item in sublist:
        flat_list.append(item)

In [55]:
new_list = []
for item in flat_list:
    new_list.append('"' + remove_chars(item) + '"')

In [56]:
new_list = list(set(new_list))

In [57]:
new_list.remove('""')

In [59]:
verification = pd.DataFrame({"host_verifications" : new_list})

In [61]:
verification.to_sql("Verification", engine, if_exists="append", index=False, dtype={"host_verifications" : sqlalchemy.CHAR(32)})

## Import verified_by TODO

## Import Calendar

In [40]:
cal = pd.DataFrame({"date" : calendar.date.unique()})

In [41]:
for i in cal.index:
    cal.at[i, "date"] = datetime.datetime.strptime(str(cal.at[i, "date"]), "%Y-%m-%d").date()

In [42]:
cal.to_sql("Calendar", engine, if_exists="append", index=False, dtype={"date" : sqlalchemy.DATE})

## Import Available_at

In [43]:
avail = pd.DataFrame({"id" : calendar.listing_id, "date" : calendar.date, "price" : calendar.price, "available" : calendar.available})

In [48]:
for i in calendar.index:
    avail.at[i, "date"] = datetime.datetime.strptime(str(calendar.at[i, "date"]), "%Y-%m-%d").date()

In [44]:
for i in avail.index:
    if not pd.isna(avail.at[i, "price"]):
        avail.at[i, "price"] = float(str(avail.at[i, "price"]).replace(",","").replace("$", ""))

In [45]:
avail = avail.fillna(-1)

In [49]:
avail.to_sql("Available_at", engine, if_exists="append", index=False, dtype={"id" : sqlalchemy.INT, "date" : sqlalchemy.DATE, "price" : sqlalchemy.INT, "available" : sqlalchemy.CHAR(1)})

## Import Policy

In [28]:
reduced = pd.DataFrame({"cancellation_policy" : listings["cancellation_policy"]})

In [34]:
reduced = reduced.drop_duplicates()

In [35]:
reduced.to_sql("Policy", engine, if_exists="append", index=False, dtype={"cancellation_policy" : sqlalchemy.CHAR(32)})

## Import Score

In [70]:
score = pd.DataFrame({"review_scores_rating" : listings.review_scores_rating, "review_scores_accuracy" : listings.review_scores_accuracy, "review_scores_cleanliness" : listings.review_scores_cleanliness, "review_scores_checkin" : listings.review_scores_checkin, "review_scores_communication" : listings.review_scores_communication, "review_scores_location" : listings.review_scores_location, "review_scores_value" : listings.review_scores_value, "id" : listings.id})

In [71]:
score = score.fillna(-1)

In [72]:
score.to_sql("Score", engine, if_exists="append", index=False, dtype={"review_scores_rating" : sqlalchemy.INT, "review_scores_accuracy" : sqlalchemy.INT, "review_scores_cleanliness" : sqlalchemy.INT, "review_scores_checkin" : sqlalchemy.INT, "review_scores_communication" : sqlalchemy.INT, "review_scores_location" : sqlalchemy.INT, "review_scores_value" : sqlalchemy.INT, "id" : sqlalchemy.INT})

## Import Host

In [79]:
host = pd.DataFrame({"host_id" : listings.host_id, "host_since" : listings.host_since, "host_about" : listings.host_about, "host_response_time" : listings.host_response_time, "host_response_rate" : listings.host_response_rate, "host_neighbourhood" : listings.host_neighbourhood, "host_url" : listings.host_url, "host_name" : listings.host_name, "host_thumbnail_url" : listings.host_thumbnail_url, "host_picture_url" : listings.host_picture_url})

In [80]:
host = host.fillna("Unknown")

In [86]:
for i in host.index:
    host.at[i, "host_since"] = datetime.datetime.strptime(str(host.at[i, "host_since"]), "%Y-%m-%d").date()

In [90]:
host = host.drop_duplicates()

In [91]:
host.to_sql("Host", engine, if_exists="append", index=False, dtype={"host_id" : sqlalchemy.CHAR(32), "host_since" : sqlalchemy.DATE, "host_about" : sqlalchemy.VARCHAR(1024), "host_response_time" : sqlalchemy.CHAR(32), "host_response_rate" : sqlalchemy.CHAR(32), "host_neighbourhood" : sqlalchemy.CHAR(32), "host_url" : sqlalchemy.CHAR(32), "host_name" : sqlalchemy.CHAR(32), "host_thumbnail_url" : sqlalchemy.CHAR(32), "host_picture_url" : sqlalchemy.CHAR(32)})

##### We need to add host_neighbourhood, city and country, infer it from the general dataset

## Import Provide

In [61]:
provide = listings[["id", "amenities"]]

In [62]:
temp2 = []
for index, row in provide.iterrows():
    temp2.append(row["amenities"].split(','))

In [63]:
removed_chars_list = []
for l in temp2:
    tmp = []
    for a in l:
        tmp.append(remove_chars(a))
    removed_chars_list.append(tmp)

In [64]:
ids = []
amens = []
for i, elem in enumerate(provide["id"]):
    for j in np.arange(len(removed_chars_list[i])):
        ids.append(elem)
        amens.append(removed_chars_list[i][j])
        


In [67]:
provides = pd.DataFrame({"id" : ids, "amenities" : amens})

In [71]:
provides = provides.drop_duplicates()

In [72]:
provides.to_sql("Provides", engine, if_exists="append", index=False, dtype={"id" : sqlalchemy.INT, "amenities" : sqlalchemy.CHAR(32)})

## Country

In [138]:
country = ["Spain", "Germany"]
c_code = ["ES", "DE"]

In [139]:
da_country = pd.DataFrame({"country" : country, "country_code" : c_code})

In [142]:
da_country

Unnamed: 0,country,country_code
0,Spain,ES
1,Germany,DE


In [143]:
da_country.to_sql("Country", engine, if_exists="append", index=False, dtype={"country_code" : sqlalchemy.CHAR(2), "country" : sqlalchemy.CHAR(32)})

## City

In [133]:
city= ["Barcelona","Madrid","Berlin"]

In [134]:
country_code = ["ES", "ES", "DE"]

In [135]:
da_city = pd.DataFrame({"city" : city, "country_code" : country_code})

In [137]:
da_city.to_sql("City", engine, if_exists="append", index=False, dtype={"city" : sqlalchemy.CHAR(32), "country_code" : sqlalchemy.CHAR(2)})

### Neighbourhood

In [98]:
nei = listings[["neighbourhood", "country_code", "city"]]

In [109]:
nei.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [105]:
for i in nei.index:
    nei.at[i, "city"] = "Barcelona"

In [130]:
nei.to_sql("Neighbourhood", engine, if_exists="append", index=False, dtype={"Neighbourhood" : sqlalchemy.CHAR(32), "country_code" : sqlalchemy.CHAR(2), "city" : sqlalchemy.CHAR(32)})

## Import Listing

In [144]:
da_listing = listings[["id", "listing_url", "name", "accommodates", "cancellation_policy", "host_id", "host_name", "neighbourhood", "city", "country_code", "latitude", "longitude", "property_type", "room_type", "bathrooms", "bedrooms", "beds", "bed_type", "interaction", "house_rules", "is_business_travel_ready", "require_guest_profile_picture", "require_guest_phone_verification"]]

In [147]:
da_listing.isna().sum()

id                                  0
listing_url                         0
name                                0
accommodates                        0
cancellation_policy                 0
host_id                             0
host_name                           0
neighbourhood                       0
city                                0
country_code                        0
latitude                            0
longitude                           0
property_type                       0
room_type                           0
bathrooms                           0
bedrooms                            0
beds                                0
bed_type                            0
interaction                         0
house_rules                         0
is_business_travel_ready            0
require_guest_profile_picture       0
require_guest_phone_verification    0
dtype: int64

In [146]:
da_listing.name = da_listing.name.fillna("")
da_listing.interaction = da_listing.interaction.fillna("")
da_listing.house_rules = da_listing.house_rules.fillna("")

da_listing.city = da_listing.city.fillna("Barcelona")
da_listing.bathrooms = da_listing.bathrooms.fillna(0)
da_listing.bedrooms = da_listing.bedrooms.fillna(0)
da_listing.beds = da_listing.beds.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [150]:
da_listing.to_sql("Listing", engine, if_exists="append", index=False, dtype={"id" : sqlalchemy.INT, "listing_url" : sqlalchemy.CHAR(32), "name" : sqlalchemy.CHAR(32), "accommodates" : sqlalchemy.CHAR(32), "cancellation_policy" : sqlalchemy.CHAR(32), "host_id" : sqlalchemy.INT,  "host_name" : sqlalchemy.CHAR(32), "neighbourhood": sqlalchemy.CHAR(32), "city" : sqlalchemy.CHAR(32), "country_code" : sqlalchemy.CHAR(2), "latitude" : sqlalchemy.FLOAT, "longitude" : sqlalchemy.FLOAT, "property_type": sqlalchemy.CHAR(32), "room_type" : sqlalchemy.CHAR(32), "bathrooms" : sqlalchemy.FLOAT, "bedrooms" : sqlalchemy.INT, "beds" : sqlalchemy.INT, "bed_type" : sqlalchemy.CHAR(32), "interaction" : sqlalchemy.VARCHAR(1024), "house_rules" : sqlalchemy.VARCHAR(1024), "is_business_travel_ready" : sqlalchemy.CHAR(1), "require_guest_profile_picture" : sqlalchemy.CHAR(1), "require_guest_phone_verification" : sqlalchemy.CHAR(1)})