In [7]:
import os
import random

import numpy as np
import pandas as pd
import pyarrow

# Raise pandas' display limits so frames are rendered with up to 300 columns
# and 500 rows before being truncated.
for option_name, option_value in (
    ('display.max_columns', 300),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, option_value)

In [8]:
# Locations of the two raw CSV exports this notebook works from.
bookings_2_0_file = './Datasets/all_bookings2.0.csv'
ratings_2_0_file = './Datasets/all_ratings2.0.csv'

# Read each export into its own DataFrame (bookings first, then ratings),
# using pandas' default CSV parsing.
bookings_2, ratings_2 = (
    pd.read_csv(csv_path) for csv_path in (bookings_2_0_file, ratings_2_0_file)
)

In [3]:
# Merge by booking_id
#
# Some bookings do not belong to the client that provides us with ratings, so
# from here on we work only with the bookings that may have an associated
# rating.

# Keep only bookings whose ID starts with 'T'. na=False makes a missing
# booking_id count as "does not match", so the mask is strictly boolean and
# can always be used for row selection (a NaN in the mask would raise).
has_t_prefix = bookings_2['booking_id'].str.startswith('T', na=False)
bookings_2 = bookings_2[has_t_prefix]

# Remove bookings that have custom IDs, which are longer than expected.
# The check drops every booking whose ID is exactly 10 characters long.
# NOTE(review): assumes 10 is the only length custom IDs take -- confirm with
# bookings_2['booking_id'].str.len().unique().
has_custom_id = bookings_2['booking_id'].str.len() == 10
bookings_2 = bookings_2[~has_custom_id]

# Drop bookings that have no supplier, unless their status is 'cancelado'
# (those are kept even without a supplier).
# 2145 rows matched this condition in the original run.
lacks_supplier = bookings_2['supplier'].isna() & (bookings_2['status'] != 'cancelado')
bookings_2 = bookings_2[~lacks_supplier]

# Find and remove duplicates. In the original run only 13 booking IDs were
# repeated (64136 rows before de-duplication, 64123 after).
rows_per_booking = bookings_2.groupby('booking_id').size()
duplicated_ids = rows_per_booking[rows_per_booking >= 2].index.tolist()  # kept for inspection

# For a repeated ID, only its last row (in current row order) survives.
bookings_2 = bookings_2.drop_duplicates(subset='booking_id', keep='last')




In [4]:
# Add a 'T' to the booking_id in ratings_2 so it can be matched against the
# 'T'-prefixed booking IDs kept in bookings_2.
#
# Only IDs that do not already start with 'T' are prefixed. This makes the
# cell safe to re-run: a second execution leaves the column unchanged instead
# of turning 'T123' into 'TT123' and silently emptying the later merge.
# NOTE(review): assumes raw rating IDs never legitimately begin with 'T' -- confirm.
rating_ids = ratings_2['booking_id'].astype(str)
needs_prefix = ~rating_ids.str.startswith('T', na=False)
ratings_2['booking_id'] = rating_ids.mask(needs_prefix, 'T' + rating_ids)

In [5]:
# Merge booking info with rating info.
# 'rb' is short for "rated bookings", for ease of typing.
# The inner join keeps only booking_ids present in both frames; columns that
# exist in both get the '_booking' / '_rating' suffixes. Only non-default
# merge arguments are passed.
rb = bookings_2.merge(
    ratings_2,
    how='inner',
    on='booking_id',
    suffixes=('_booking', '_rating'),
)
print(ratings_2.shape)  # 20421
print(rb.shape)  # 20123

# Conclusion
# There are about 298 ratings whose booking we either don't have on our
# records or probably filtered out in the previous steps because it did not
# have a supplier assigned.
# We reviewed a few cases at random and they are system errors such as:
# - the booking email never arrived, so the booking was never created in our
#   system; the passenger could not know about this and, because the booking
#   did not arrive on time, he/she posted an angry review.
# It is OK that they stay out of the analysis.

# Ratings whose booking_id did not survive the merge; the shape is the cell's
# displayed result.
unmatched_ratings = ratings_2[~ratings_2['booking_id'].isin(rb['booking_id'])]
unmatched_ratings.shape




(20421, 11)
(20123, 32)


(298, 11)

In [6]:
# After all the changes, save the rated bookings to a CSV file.
outname = 'rated_bookings.csv'
outdir = './Datasets'

# Create the output directory if needed. exist_ok=True makes this a no-op when
# it already exists, replacing the separate exists-check + mkdir pair.
os.makedirs(outdir, exist_ok=True)
fullname = os.path.join(outdir, outname)

# The DataFrame index is written as well (pandas' default), so the file's
# first column is the unnamed row index.
rb.to_csv(fullname, header=True)
print("Saved!")

Saved!
