# Exploratory Data Analysis

## Import raw data and have a look

In [2]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

# Install a blanket "ignore" filter so warnings do not clutter the cell outputs.
warnings.filterwarnings("ignore")

# Read the raw CSV; the path is relative to the notebook's working directory.
raw_csv_path = "../data/raw/hotel_vienna_restricted.csv"
raw_df = pd.read_csv(raw_csv_path)

# Preview the first ten rows.
raw_df.head(n=10)

Unnamed: 0,country,city_actual,rating_count,center1distance,center1label,center2distance,center2label,neighbourhood,price,price_night,...,offer_cat,year,month,weekend,holiday,nnights,acc_type,ratings,distance,distance_alter
0,Austria,Vienna,189.0,1.7 miles,City centre,3.8 miles,Donauturm,17. Hernals,81,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,3.9,1.7,3.8
1,Austria,Vienna,53.0,1.4 miles,City centre,2.5 miles,Donauturm,Alsergrund,85,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,3.7,1.4,2.5
2,Austria,Vienna,55.0,1.7 miles,City centre,2.5 miles,Donauturm,Alsergrund,83,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,4.0,1.7,2.5
3,Austria,Vienna,33.0,1.2 miles,City centre,2.8 miles,Donauturm,Alsergrund,82,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,3.9,1.2,2.8
4,Austria,Vienna,57.0,0.9 miles,City centre,2.4 miles,Donauturm,Alsergrund,103,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,3.9,0.9,2.4
5,Austria,Vienna,161.0,1.0 mile,City centre,2.7 miles,Donauturm,Alsergrund,150,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,4.6,1.0,2.7
6,Austria,Vienna,203.0,1.1 miles,City centre,2.7 miles,Donauturm,Alsergrund,60,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,4.1,1.1,2.7
7,Austria,Vienna,251.0,0.8 miles,City centre,2.9 miles,Donauturm,Alsergrund,128,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,4.0,0.8,2.9
8,Austria,Vienna,18.0,1.7 miles,City centre,2.6 miles,Donauturm,Alsergrund,85,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,4.1,1.7,2.6
9,Austria,Vienna,20.0,0.9 miles,City centre,2.8 miles,Donauturm,Alsergrund,77,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,3.7,0.9,2.8


In [3]:
# Report the dimensions of the loaded table as (rows, columns).
raw_df.shape

(217, 29)

In [4]:
# Summarise every column: name, non-null count and dtype.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             217 non-null    object 
 1   city_actual         217 non-null    object 
 2   rating_count        217 non-null    float64
 3   center1distance     217 non-null    object 
 4   center1label        217 non-null    object 
 5   center2distance     217 non-null    object 
 6   center2label        217 non-null    object 
 7   neighbourhood       217 non-null    object 
 8   price               217 non-null    int64  
 9   price_night         217 non-null    object 
 10  city                217 non-null    object 
 11  stars               217 non-null    float64
 12  ratingta            214 non-null    float64
 13  ratingta_count      214 non-null    float64
 14  accommodationtype   217 non-null    object 
 15  guestreviewsrating  217 non-null    object 
 16  scarce_r

## Inspect the rows where `ratingta` is missing

In [5]:
# Build a boolean mask of rows whose `ratingta` is missing, then show only those rows.
ratingta_missing = raw_df["ratingta"].isna()
raw_df[ratingta_missing]

Unnamed: 0,country,city_actual,rating_count,center1distance,center1label,center2distance,center2label,neighbourhood,price,price_night,...,offer_cat,year,month,weekend,holiday,nnights,acc_type,ratings,distance,distance_alter
136,Austria,Vienna,6.0,0.8 miles,City centre,3.6 miles,Donauturm,Neubau,113,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,3.7,0.8,3.6
167,Austria,Vienna,18.0,2.6 miles,City centre,5.4 miles,Donauturm,Schonbrunn,83,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,4.5,2.6,5.4
204,Austria,Vienna,15.0,0.9 miles,City centre,3.6 miles,Donauturm,Wieden,126,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,4.5,0.9,3.6


## Drop the rows with a missing `ratingta` value

In [6]:
# Keep only the rows that have a `ratingta` value; the original index labels
# are retained, so dropped rows leave gaps in the index.
ratingta_present = raw_df["ratingta"].notna()
df = raw_df.loc[ratingta_present]
df

Unnamed: 0,country,city_actual,rating_count,center1distance,center1label,center2distance,center2label,neighbourhood,price,price_night,...,offer_cat,year,month,weekend,holiday,nnights,acc_type,ratings,distance,distance_alter
0,Austria,Vienna,189.0,1.7 miles,City centre,3.8 miles,Donauturm,17. Hernals,81,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,3.9,1.7,3.8
1,Austria,Vienna,53.0,1.4 miles,City centre,2.5 miles,Donauturm,Alsergrund,85,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,3.7,1.4,2.5
2,Austria,Vienna,55.0,1.7 miles,City centre,2.5 miles,Donauturm,Alsergrund,83,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,4.0,1.7,2.5
3,Austria,Vienna,33.0,1.2 miles,City centre,2.8 miles,Donauturm,Alsergrund,82,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,3.9,1.2,2.8
4,Austria,Vienna,57.0,0.9 miles,City centre,2.4 miles,Donauturm,Alsergrund,103,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,3.9,0.9,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,Austria,Vienna,77.0,1.2 miles,City centre,3.7 miles,Donauturm,Wieden,100,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,4.0,1.2,3.7
213,Austria,Vienna,572.0,1.5 miles,City centre,3.9 miles,Donauturm,Wieden,95,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,4.1,1.5,3.9
214,Austria,Vienna,53.0,1.5 miles,City centre,4.0 miles,Donauturm,Wieden,73,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,3.4,1.5,4.0
215,Austria,Vienna,112.0,1.0 mile,City centre,3.7 miles,Donauturm,Wieden,100,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,4.4,1.0,3.7


## Get rid of the hotels that are not in Vienna

In [8]:
# Keep only the rows whose `city_actual` is exactly "Vienna".
in_vienna = df["city_actual"].eq("Vienna")
df = df[in_vienna]
df

Unnamed: 0,country,city_actual,rating_count,center1distance,center1label,center2distance,center2label,neighbourhood,price,price_night,...,offer_cat,year,month,weekend,holiday,nnights,acc_type,ratings,distance,distance_alter
0,Austria,Vienna,189.0,1.7 miles,City centre,3.8 miles,Donauturm,17. Hernals,81,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,3.9,1.7,3.8
1,Austria,Vienna,53.0,1.4 miles,City centre,2.5 miles,Donauturm,Alsergrund,85,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,3.7,1.4,2.5
2,Austria,Vienna,55.0,1.7 miles,City centre,2.5 miles,Donauturm,Alsergrund,83,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,4.0,1.7,2.5
3,Austria,Vienna,33.0,1.2 miles,City centre,2.8 miles,Donauturm,Alsergrund,82,price for 1 night,...,15-50% offer,2017,11,0,0,1,Hotel,3.9,1.2,2.8
4,Austria,Vienna,57.0,0.9 miles,City centre,2.4 miles,Donauturm,Alsergrund,103,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,3.9,0.9,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,Austria,Vienna,77.0,1.2 miles,City centre,3.7 miles,Donauturm,Wieden,100,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,4.0,1.2,3.7
213,Austria,Vienna,572.0,1.5 miles,City centre,3.9 miles,Donauturm,Wieden,95,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,4.1,1.5,3.9
214,Austria,Vienna,53.0,1.5 miles,City centre,4.0 miles,Donauturm,Wieden,73,price for 1 night,...,1-15% offer,2017,11,0,0,1,Hotel,3.4,1.5,4.0
215,Austria,Vienna,112.0,1.0 mile,City centre,3.7 miles,Donauturm,Wieden,100,price for 1 night,...,0% no offer,2017,11,0,0,1,Hotel,4.4,1.0,3.7


## Write out the munged data 

In [9]:
# Write the cleaned table to disk. `to_csv` does not create missing parent
# directories, so make sure the target folder exists first; otherwise the
# cell fails when `../data/processed/` is absent.
munged_csv_path = Path("../data/processed/hotel_vienna_munged.csv")
munged_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row labels out of the written file.
df.to_csv(munged_csv_path, index=False)