# Introduction

Set Up

In [11]:
# Standard imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from collections import Counter
from scipy import stats
from scipy.stats import norm

In [12]:
# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None
# Notebook-wide default figure size (width, height) for matplotlib
plt.rcParams.update({'figure.figsize': (8.0, 6.0)})

In [13]:
# Load the raw review CSV; original_df stays as the untouched source frame
raw_csv_path = '../data/review_analysis/my_data.csv'
original_df = pd.read_csv(raw_csv_path)

In [14]:
# Preview the first five rows to get a feel for the columns and values
original_df.head(n=5)

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


In [15]:
# Check the dimensions of the data as (rows, columns)
original_df.shape

(35912, 19)

In [16]:
# Review each column's dtype and non-null count to spot missing values
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35912 entries, 0 to 35911
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   address               35912 non-null  object 
 1   categories            35912 non-null  object 
 2   city                  35912 non-null  object 
 3   country               35912 non-null  object 
 4   latitude              35826 non-null  float64
 5   longitude             35826 non-null  float64
 6   name                  35912 non-null  object 
 7   postalCode            35857 non-null  object 
 8   province              35912 non-null  object 
 9   reviews.date          35653 non-null  object 
 10  reviews.dateAdded     35912 non-null  object 
 11  reviews.doRecommend   0 non-null      float64
 12  reviews.id            0 non-null      float64
 13  reviews.rating        35050 non-null  float64
 14  reviews.text          35888 non-null  object 
 15  reviews.title      

In [17]:
# Count the distinct values in each column (all-null columns report 0)
original_df.nunique()

address                   999
categories                396
city                      761
country                     1
latitude                  982
longitude                 983
name                      879
postalCode                911
province                  287
reviews.date             3009
reviews.dateAdded        1029
reviews.doRecommend         0
reviews.id                  0
reviews.rating             43
reviews.text            34396
reviews.title           21960
reviews.userCity         2897
reviews.username        15492
reviews.userProvince      648
dtype: int64

# Action Plan
1. Review Columns
    - Look into date column
    - Look into categorical columns
    - Look into numeric columns
    - Drop redundant columns
2. Identify any null values 
    - Decide to delete/impute null values
3. Check for errors in data

## Variable Tables - Initial Impressions

**Identified Variables to Keep**
| **Variables to Keep** | **Nulls** | **Non Nulls** | **Null Count** | **Data Type** |                   **Comment**                  |
|-----------------------|:---------:|:-------------:|:--------------:|---------------|:----------------------------------------------:|
| address               |     N     |     35912     |        0       |     object    |                                                |
| categories            |     N     |     35912     |        0       |     object    |                                                |
| city                  |     N     |     35912     |        0       |     object    |                                                |
| name                  |     N     |     35912     |        0       |     object    |                                                |
| reviews.dateAdded     |     N     |     35912     |        0       |     object    |                                                |
| reviews.text          |     Y     |     35888     |       24       |     object    | Has nulls, but can drop them. Unable to impute | 

<br>

**Identified Variables to Drop**
| **Variables to Drop** | **Nulls** | **Non Nulls** | **Null Count** | **Data Type** |                       **Comment**                      |
|-----------------------|:---------:|:-------------:|:--------------:|:-------------:|:------------------------------------------------------:|
| latitude              |     Y     |     35826     |       86       |    float64    | Have address and city for location                     |
| longitude             |     Y     |     35826     |       86       |    float64    | Have address and city for location                     |
| postalCode            |     Y     |     35857     |       55       |     object    | Have address and city for location                     |
| reviews.date          |     Y     |     35653     |       259      |     object    | have reviews.dateAdded                                 |
| reviews.doRecommend   |     Y     |       0       |      35912     |    float64    | No values                                              |
| reviews.id            |     Y     |       0       |      35912     |    float64    | No values                                              |
| reviews.userCity      |     Y     |     16263     |      19649     |     object    | Too many nulls, may not be able to impute              |
| reviews.username      |     Y     |     35869     |       43       |     object    | Unable to impute                                       |
| reviews.userProvince  |     Y     |     17518     |      18394     |     object    | Similar to user city, too many nulls, unable to impute |

<br>

**Undecided Variables**
| **Unsure Variables** | **Nulls** | **Non Nulls** | **Null Count** | **Data Type** |                                **Comment**                                |
|----------------------|:---------:|:-------------:|:--------------:|:-------------:|:-------------------------------------------------------------------------:|
| country              |     N     |     35912     |        0       |     object    | If I bring in another dataset from another country, this may be helpful   |
| province             |     N     |     35912     |        0       |     object    | Unsure if this is necessary if we have city                               |
| reviews.rating       |     Y     |     35050     |       862      |    float64    | Unable to impute rating; is the column worth keeping if it means dropping 862 rows? |
| reviews.title        |     Y     |     34286     |      1626      |     object    | Unsure if this is necessary                                               |

## Initial Look at Values

**Remove Redundant columns**
1. Drop redundant/unnecessary columns (9 columns)<br>
<br> 

**Decide on Unsure Variables**

- `country` - If keeping, no change needed
- `province` - Most likely drop
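- `reviews.rating` - If keeping, convert to ordinal, assuming ratings only go from 1-5 (verify first: `nunique()` reports 43 distinct rating values, so some are fractional or outside that range)
- `reviews.title` - Maybe merge this into the `reviews.text` so I don't need to impute?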


**When columns have been decided on and nulls have been addressed**
1. Identify `reviews.text` null values. Delete Rows? How do you delete a row?
2. Check for errors in the columns

In [18]:
# Build the working frame from original_df, leaving the original untouched.
# The two columns removed here contain no values at all.
empty_columns = ['reviews.doRecommend', 'reviews.id']
hotel_review_df = original_df.drop(columns=empty_columns).copy()

In [20]:
# Confirm reviews.doRecommend and reviews.id are gone (17 columns should remain)
hotel_review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35912 entries, 0 to 35911
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   address               35912 non-null  object 
 1   categories            35912 non-null  object 
 2   city                  35912 non-null  object 
 3   country               35912 non-null  object 
 4   latitude              35826 non-null  float64
 5   longitude             35826 non-null  float64
 6   name                  35912 non-null  object 
 7   postalCode            35857 non-null  object 
 8   province              35912 non-null  object 
 9   reviews.date          35653 non-null  object 
 10  reviews.dateAdded     35912 non-null  object 
 11  reviews.rating        35050 non-null  float64
 12  reviews.text          35888 non-null  object 
 13  reviews.title         34286 non-null  object 
 14  reviews.userCity      16263 non-null  object 
 15  reviews.username   

# Review Columns

***
## Looking into the Date Columns
- `reviews.dateAdded`= date the review was added to the database
- `reviews.date`= date the review was posted online <--this is the more important value

In [33]:
# Show reviews.dateAdded next to reviews.date for a side-by-side comparison
hotel_review_df.loc[:, ['reviews.dateAdded', 'reviews.date']]

Unnamed: 0,reviews.dateAdded,reviews.date
0,2016-10-24,2013-09-22T00:00:00Z
1,2016-10-24,2015-04-03T00:00:00Z
2,2016-10-24,2014-05-13T00:00:00Z
3,2016-10-24,2013-10-27T00:00:00Z
4,2016-10-24,2015-03-05T00:00:00Z
...,...,...
35907,2016-06-23,2012-03-20T00:00:00Z
35908,2016-06-23,2012-03-12T00:00:00Z
35909,2016-06-23,2015-11-03T00:00:00Z
35910,2015-12-01,


In [24]:
# Summarise reviews.dateAdded: count, number of unique values and the most frequent value
hotel_review_df['reviews.dateAdded'].describe()

count                    35912
unique                    1029
top       2017-04-20T01:34:00Z
freq                      1185
Name: reviews.dateAdded, dtype: object

In [34]:
# Parse both review timestamp columns into real datetime64 values.
#
# The raw strings look like '2017-04-20T01:34:00Z'. Only the calendar day is
# needed, so each timestamp is parsed as UTC, the timezone is stripped and the
# time-of-day is floored to midnight. Unlike `.dt.date` (which produces an
# object column of Python `date` objects), this keeps the datetime64 dtype, so
# the `.dt` accessor, sorting and date arithmetic keep working, while the
# column still displays as a plain date. Missing review dates become NaT.
# The cell is safe to re-run on already-converted columns.
for date_col in ['reviews.dateAdded', 'reviews.date']:
    hotel_review_df[date_col] = (
        pd.to_datetime(hotel_review_df[date_col], utc=True)
        .dt.tz_localize(None)
        .dt.normalize()
    )



In [35]:
# Check that both date columns now show the date only
hotel_review_df.head()

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22,2016-10-24,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03,2016-10-24,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13,2016-10-24,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27,2016-10-24,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05,2016-10-24,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


In [None]:
# The last five rows show the two date columns disagree (reviews.date can also be missing)
date_columns = ['reviews.dateAdded', 'reviews.date']
hotel_review_df[date_columns].tail(n=5)

Unnamed: 0,reviews.dateAdded,reviews.date
35907,2016-06-23,2012-03-20
35908,2016-06-23,2012-03-12
35909,2016-06-23,2015-11-03
35910,2015-12-01,NaT
35911,2015-12-01,NaT


After

***
## Looking at the Categoric Columns


***
## Looking at the Numeric Columns