In [1]:
import boto3
import configparser


In [30]:
# Load AWS credentials and the target region from the local dwh.cfg file.
config = configparser.ConfigParser()
# Context manager closes the file handle as soon as parsing is done
# (a bare open() would leave it open for the life of the kernel).
with open('dwh.cfg') as cfg_file:
    config.read_file(cfg_file)

# All values live in the [default] section of dwh.cfg.
KEY = config.get('default', 'aws_access_key_id')
SECRET = config.get('default', 'aws_secret_access_key')
ACCESSTOKEN = config.get('default', 'aws_session_token')
REGION = config.get('default', 'aws_region')

In [31]:
# Low-level S3 client bound to REGION, authenticated with the key, secret
# and session token read from dwh.cfg.
s3 = boto3.client(
    service_name='s3',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    aws_session_token=ACCESSTOKEN,
    region_name=REGION,
)

In [33]:
# Create the project bucket in the region the client is configured for.
# S3 treats us-east-1 as the implicit default location: it must not be sent
# as a LocationConstraint, while any other region has to be sent explicitly
# or the request is rejected.
create_bucket_kwargs = {'Bucket': 'hotelreviewslocalplaces'}
if REGION != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': REGION
    }
bucket = s3.create_bucket(**create_bucket_kwargs)

In [29]:
# Upload a file through the S3 *resource* API (as opposed to the client API).
# `Object` only exists on a resource, and `s3` is a client, so build a
# dedicated resource here rather than depending on an earlier kernel state
# in which `s3` happened to be a resource.
s3_resource = boto3.resource('s3',
                             region_name=REGION,
                             aws_access_key_id=KEY,
                             aws_secret_access_key=SECRET,
                             aws_session_token=ACCESSTOKEN)

# Named `s3_object` so the built-in `object` is not shadowed.
s3_object = s3_resource.Object('hotelreviewslocalplaces', 'country_indicators.csv')

# Context manager closes the local file once the upload has completed.
with open('Data/Cleaned/country_indicators.csv', 'rb') as csv_file:
    put_response = s3_object.put(
        Body=csv_file,
        Metadata={'datasource1': 'undp', 'datasource2': 'un data',
                  'datasource3': 'Freedom House', 'datasource4': 'Our World In Data'})

# Last expression of the cell: display the service response.
put_response

{'ResponseMetadata': {'RequestId': 'DF9A2AEE5F133A96',
  'HostId': 'OFJMpdT2x5+AXvc7vYcxcLOQ0lgiPab5S8xgXEIOa154WB8tqvXLkIqWWriUk6pv9ykypdEmbX0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'OFJMpdT2x5+AXvc7vYcxcLOQ0lgiPab5S8xgXEIOa154WB8tqvXLkIqWWriUk6pv9ykypdEmbX0=',
   'x-amz-request-id': 'DF9A2AEE5F133A96',
   'date': 'Tue, 16 Jun 2020 23:20:45 GMT',
   'etag': '"01ab936dc5d3de281e92ec6fb0022d9f"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"01ab936dc5d3de281e92ec6fb0022d9f"'}

In [37]:
# Upload the cleaned country indicators CSV with the S3 client API,
# tagging the object with the data sources it was compiled from.
# Context manager closes the local file once the upload has completed.
with open('Data/Cleaned/country_indicators.csv', 'rb') as csv_file:
    response = s3.put_object(Body=csv_file,
                             Bucket='hotelreviewslocalplaces',
                             Key='country_indicators.csv',
                             Metadata={'datasource1': 'undp', 'datasource2': 'un data',
                                       'datasource3': 'Freedom House', 'datasource4': 'Our World In Data'})
# Last expression of the cell: display the service response.
response

{'ResponseMetadata': {'RequestId': '0898D1C2B9BA6E8F',
  'HostId': 'IRvYvK6+zrfL0mUCIlYt2AOHjPtTNWTeahkbti10hFVp8HtOuri9+GQoIK3AWmRbtPhbV/sfA2E=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'IRvYvK6+zrfL0mUCIlYt2AOHjPtTNWTeahkbti10hFVp8HtOuri9+GQoIK3AWmRbtPhbV/sfA2E=',
   'x-amz-request-id': '0898D1C2B9BA6E8F',
   'date': 'Wed, 17 Jun 2020 00:02:35 GMT',
   'etag': '"01ab936dc5d3de281e92ec6fb0022d9f"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"01ab936dc5d3de281e92ec6fb0022d9f"'}

In [1]:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, IntegerType, FloatType, StructType, DoubleType, DecimalType

In [2]:
# Get or create the Spark session, requesting the hadoop-aws 2.7.0 package
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [27]:
# Load the cleaned airports dataset from Parquet.
# Earlier CSV-based variant, kept for reference:
#df_airports = spark.read.csv('Data/Cleaned/airports.csv', header=True)
df_airports = spark.read.parquet('Data/Cleaned/airports.parquet')

In [25]:
# Row count of the airports DataFrame, as a quick check on the load.
df_airports.count()

5908

In [28]:
# Preview the first 10 airport rows, converted to pandas for display.
# NOTE(review): the saved output shows mis-decoded text (e.g. 'MÃ¼nchen'),
# which suggests an encoding problem upstream of this file — verify.
df_airports.limit(10).toPandas()

Unnamed: 0,AirportID,Type,AirportName,Municipality,ISOCountry,Continent,ISORegion,Latitude,Longitude,ElevationInFeet,GPSCode,IATACode,LocalCode,Country
0,DE-0002,small_airport,Flugplatz Torgau-Beilrode,Torgau,DE,EU,DE-SN,51.571167,13.052167,269.0,EDOG,,,Germany
1,DE-0004,small_airport,Gransee Airport,Gransee,DE,EU,DE-BR,53.006699,13.205,164.0,,,,Germany
2,DE-0005,small_airport,Warngau Airfield,MÃ¼nchen,DE,EU,DE-BY,47.825011,11.703733,2380.0,,,,Germany
3,DE-0006,small_airport,Locktow Airport,Locktow,DE,EU,DE-BR,52.11639,12.709444,213.0,,,,Germany
4,DE-0029,small_airport,Berlinchen Airfield,Berlinchen,DE,EU,DE-BR,53.225178,12.565527,,,,,Germany
5,DE-0038,small_airport,Utscheid Segelflugplatz,Bitburg,DE,EU,DE-RP,49.997778,6.342778,1390.0,,,,Germany
6,DE-0039,small_airport,Aichach Airport,Aichach,DE,EU,DE-BY,48.471798,11.134349,1445.0,,,,Germany
7,DE-0040,small_airport,Zierenberg Airfield,Zierenberg,DE,EU,DE-HE,51.367,9.336,1450.0,,,,Germany
8,DE-0041,small_airport,Zellhausen Gliderport,Zellhausen,DE,EU,DE-HE,50.018845,8.984376,370.0,,,,Germany
9,DE-0042,small_airport,WÃ¼lzburg Airport,,DE,EU,DE-BY,49.025851,11.018746,2027.0,,,,Germany


In [29]:
# Load the cleaned hotels dataset from Parquet.
df_hotels = spark.read.parquet('Data/Cleaned/hotels.parquet')

In [30]:
# Row count of the hotels DataFrame, as a quick check on the load.
df_hotels.count()

1493

In [31]:
# Preview the first 5 hotel rows, converted to pandas for display.
df_hotels.limit(5).toPandas()

Unnamed: 0,HotelID,HotelName,HotelAddress,Phone,Price,OriginalHotelName,GoogleAddress,Latitude,Longitude,gPlusPlaceId,NearestAirportID,MondayHours,TuesdayHours,WednesdayHours,ThursdayHours,FridayHours,SaturdayHours,SundayHours,Country
0,144,Relais Du Louvre,19 Rue Des Pretres Saint Germain L Auxerrois 1...,01 42 60 34 22,,Relais Du Louvre,"[3 Rue du Louvre, 75001 Paris, France]",48.8592264,2.340996,113034259603416119057,LFPB,8:00 am--9:00 pm,8:00 am--9:00 pm,8:00 am--9:00 pm,8:00 am--9:00 pm,8:00 am--9:00 pm,8:00 am--9:00 pm,8:00 am--9:00 pm,France
1,153,Four Seasons Hotel George V Paris,31 Avenue George V 8th arr 75008 Paris France,,,Four Seasons Hotel George V Paris,,48.8687512,2.3001692,,LFPV,,,,,,,,France
2,420,BEST WESTERN Premier Trocadéro La Tour,5 bis rue Massenet 16th arr 75016 Paris France,01 45 24 43 03,,Best Western Premier Trocadero La Tour,"[5 Rue Massenet, 75016 Paris, France]",48.8583127,2.2795075,107754700607079935569,LFPV,,,,,,,,France
3,1013,Novotel Paris Centre Gare Montparnasse,17 Rue Du Cotentin 15th arr 75015 Paris France,,,Novotel Paris Centre Gare Montparnasse,,48.838484,2.3150904,,LFPV,,,,,,,,France
4,499,Novotel Barcelona City,Avenida Diagonal 201 Sant Mart 08018 Barcelona...,933 26 24 99,,Novotel Barcelona City,"[Avinguda Diagonal, 199, 08018 Barcelona, Spain]",41.4038774,2.1911894,115909827232414886024,LELL,1:00--4:00 pm,1:00--4:00 pm,1:00--4:00 pm,1:00--4:00 pm,1:00--4:00 pm,1:00--4:00 pm,1:00--4:00 pm,Spain


In [4]:
# Load the cleaned hotel reviews from the JSON Lines file.
df_hotel_reviews = spark.read.json('Data/Cleaned/reviews.jsonl')

In [5]:
# Row count of the reviews DataFrame, as a quick check on the load.
df_hotel_reviews.count()

515349

In [7]:
# Preview the first 5 review rows, converted to pandas for display.
df_hotel_reviews.limit(5).toPandas()

Unnamed: 0,AverageScore,HotelID,NegativeReview,NumRatings,NumReviews,PositiveReview,ReviewDate,ReviewNegativeWordsCount,ReviewPositiveWordsCount,ReviewerNationality,ReviewerScore,Tags,TotalNumReviewsByReviewer
0,7.7,298,Rooms are nice but for elderly a bit difficult...,194,1403,Location was good and staff were ok It is cute...,2017-07-31,42,21,Australia,7.1,"[' Leisure trip ', ' Family with young childre...",9
1,7.7,298,My room was dirty and I was afraid to walk bar...,194,1403,Great location in nice surroundings the bar an...,2017-07-31,210,26,United Kingdom,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",1
2,7.7,298,No Negative,194,1403,Great restaurant bar very green area you dine ...,2017-07-31,0,18,United Kingdom,9.6,"[' Leisure trip ', ' Group ', ' Duplex Double ...",1
3,7.7,298,Beds sucked Air conditioner too loud for use,194,1403,Nice property and building,2017-07-31,10,6,Canada,7.5,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",1
4,7.7,1199,for the time we were there everything was good,1058,4380,Rooms lovely and tastefully decorated staff ve...,2017-07-31,10,16,United Kingdom,10.0,"[' Business trip ', ' Couple ', ' Deluxe Doubl...",9
