In [145]:
import numpy as np
from pyspark.sql import SparkSession

In [7]:
spark = (SparkSession
            .builder
            .appName("Python Spark create RDD example")
            .config("spark.some.config.option", "some-value")
            .getOrCreate())

# 1. Concat Data

In [39]:
# Buat menjadi fungsi
def load_data(filename):
    """return spark dataframe"""
    df = spark.read.csv(filenames[1],
                        sep = ';',
                        header=True, 
                        inferSchema=True)

    return df

def data_shape(data):
    """return the number of rows & cols"""
    n_rows = data.count()
    n_cols = len(data.columns)

    return n_rows, n_cols

def read_data(filenames):
    """return the concated data"""
    # Read the first data
    df = load_data(filenames[0])
    print('Data shape:', data_shape(df))

    # concat the rest data
    for filename in filenames[1:]:
        # read
        df_i = load_data(filename)
        print('Data shape:', data_shape(df_i))

        # then join
        df = df.union(df_i)

    # Ekstrak hasil
    print('Successfully read')
    print('Data shape:', data_shape(df))

    return df


In [40]:
filenames = [
    'dataset/branch_B.csv',
    'dataset/branch_C.csv',
    'dataset/branch_D.csv'
]

final_df = read_data(filenames)

Data shape: (328, 17)
Data shape: (328, 17)
Data shape: (328, 17)
Successfully read
Data shape: (984, 17)


In [41]:
final_df.show(10)

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+---------+-------------------+-------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity| Tax 5%|   Total|     Date|               Time|Payment|  cogs|gross margin percentage|gross income|Rating|
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+---------+-------------------+-------+------+-----------------------+------------+------+
|226-31-3081|     C|Naypyitaw|       Normal|Female|Electronic access...|     15.28|       5|   3.82|   80.22| 3/8/2019|2023-10-08 10:29:00|   Cash|  76.4|            4.761904762|        3.82|   9.6|
|699-14-3026|     C|Naypyitaw|       Normal|  Male|Electronic access...|     85.39|       7|29.8865|627.6165|3/25/2019|2023-10-08 18:30:00|Ewallet|597.73|            4.761904762|     29.8865|   4.1|
|315-

# 2. Get the Unwatched Movie

In [44]:
ratings = spark.read.csv('dataset/ratings.csv',
                         header=True,
                         inferSchema=True)

print('Data shape:', data_shape(ratings))
ratings.show(5)

Data shape: (100836, 4)
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [45]:
movies = spark.read.csv('dataset/movies.csv',
                        header=True,
                        inferSchema=True)

print('Data shape:', data_shape(movies))
movies.show(5)

Data shape: (9742, 3)
+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [84]:
def get_unwatched_movie(userId):
    """return unwatched movie id dataframe"""
    # filter the movie watched by userId
    ratings_user = ratings[ratings['userId']==userId]

    # Then, extract the movieId watched by that user
    watched_id = (ratings_user
                     .select('movieId')
                     .distinct()
                     .toPandas()['movieId']
                     .tolist())

    # Finally, select from movie the unwatched movie
    unwatched_movie = (movies
                          .filter(~movies['movieId'].isin(watched_id))
                          .select(['movieId', 'title', 'genres']))
    
    print('Data shape:', data_shape(unwatched_movie))
    return unwatched_movie
    

In [87]:
unwatched_movie = get_unwatched_movie(userId = 313)
unwatched_movie.show(5)

Data shape: (9402, 3)
+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



# 3. Get The House Recommendation

In [88]:
houses = spark.read.csv('dataset/travelio_dki_jakarta.csv',
                        header = True,
                        inferSchema = True)

print('Data shape:', data_shape(houses))
houses.show(5)

Data shape: (3725, 16)
+--------------------+-------------+--------------------+-------------+-----------+-------+------------------+------------------+--------+---------+-----+--------------+--------+------+------------+------------------------+
|            ads_name|property_type|      apartment_name|         area|     region|   city|          latitude|         longitude|bedrooms|bathrooms| size|  is_furnished|capacity|rating|yearly_price|property_management_type|
+--------------------+-------------+--------------------+-------------+-----------+-------+------------------+------------------+--------+---------+-----+--------------+--------+------+------------+------------------------+
|Cozy 1BR at Green...|    apartment|Apartemen Green C...|       Glodok|DKI Jakarta|Jakarta|-6.147707145070255|106.81552469730377|       1|        1| 41.0|Full Furnished|       2|  NULL|    40717500|                     TPM|
|Gading Nias Resid...|    apartment|Apartemen Gading ...|Kelapa Gading|DKI Jakart

In [306]:
import math
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf, lit

def get_filtered_data(filter_id, filter_value, data):
    """return filtered data"""
    # Get the condition
    if filter_value is None:
        filter_cond = None
    else:
        if filter_id in ['property_type', 'is_furnished']:
            filter_cond = data[filter_id] == filter_value
        elif filter_id in ['size', 'capacity']:
            filter_cond = data[filter_id] >= filter_value
        else:
            filter_cond = data[filter_id] <= filter_value

    # Filter the data
    if filter_cond is None:
        return data
    else:
        return data.filter(filter_cond)

def get_preferenced_data(user_config, data):
    """return the preferenced data"""
    # Get the preferences
    preferences = user_config['preferences']

    # loop for all filter condition
    for filter_id, filter_value in preferences.items():
        print('Filtering on:', filter_id)

        # Get the filtered data
        data = get_filtered_data(filter_id, filter_value, data)
        print('Done! Data shape:', data_shape(data))
        print('')

    # Return data
    print('Final Data Shape:', data_shape(data))
    return data

def calculate_haversine_distance(lat1, lon1, lat2, lon2):
    """return the haversine distance"""
    # First, convert the coordinate to radian
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    
    # Apply the function
    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1
    haver_formula = math.sin(delta_lat/2.0)**2 \
                    + math.cos(lat1) * math.cos(lat2) * math.sin(delta_lon/2.0)**2
    dist = 2 * 6356.752 * math.asin(math.sqrt(haver_formula))
    return round(dist, 4)
    
distance_cols = udf(calculate_haversine_distance, DoubleType())

def get_user_recommendation(n, user_config):
    """get user recommendation"""
    # Pertama, baca dulu dataframenya
    houses = spark.read.csv('dataset/travelio_dki_jakarta.csv',
                            header = True,
                            inferSchema = True)

    # Cari preferenced houses
    preferenced_houses = get_preferenced_data(user_config, houses)

    # Buat data distance
    print('')
    print('Sort the data based on user distance')
    preferenced_houses = (preferenced_houses
                            .withColumn('lat_user', lit(user_config['location']['latitude']))
                            .withColumn('lon_user', lit(user_config['location']['longitude'])))

    preferenced_houses = (preferenced_houses
                            .withColumn('distance', distance_cols(preferenced_houses['latitude'],
                                                                  preferenced_houses['longitude'],
                                                                  preferenced_houses['lat_user'],
                                                                  preferenced_houses['lon_user'])))

    # Cari rekomendasi
    final_houses = (preferenced_houses
                        .sort(['distance'], ascending=[True])
                        .drop(*['lat_user', 'lon_user'])
                        .limit(n))

    print('Done! Final data shape:', data_shape(final_houses))
    return final_houses
    

In [308]:
# Define the user data
user_config = {
    'preferences': {
        'property_type': None,
        'size': 45.0,
        'capacity': 4,
        'is_furnished': None,
        'yearly_price': 25000000
    },
    'location': {
        # Dekat Monumen Nasional (Monas)
        'latitude': -6.1792,
        'longitude': 106.8265
    }
}

final_houses = get_user_recommendation(n=10, user_config=user_config)
final_houses.show()

Filtering on: property_type
Done! Data shape: (3725, 16)

Filtering on: size
Done! Data shape: (1232, 16)

Filtering on: capacity
Done! Data shape: (809, 16)

Filtering on: is_furnished
Done! Data shape: (809, 16)

Filtering on: yearly_price
Done! Data shape: (10, 16)

Final Data Shape: (10, 16)

Sort the data based on user distance
Done! Final data shape: (10, 17)
+--------------------+-------------+--------------------+-------------------+-----------+-------+-------------------+------------------+--------+---------+-----+--------------+--------+------+------------+------------------------+--------+
|            ads_name|property_type|      apartment_name|               area|     region|   city|           latitude|         longitude|bedrooms|bathrooms| size|  is_furnished|capacity|rating|yearly_price|property_management_type|distance|
+--------------------+-------------+--------------------+-------------------+-----------+-------+-------------------+------------------+--------+-------

In [309]:
# Define the user data
user_config = {
    'preferences': {
        'property_type': None,
        'size': 60.0,
        'capacity': 4,
        'is_furnished': None,
        'yearly_price': 25000000
    },
    'location': {
        # Dekat Kota Tua Jakarta
        'latitude': -6.1378,
        'longitude': 106.8144
    }
}

final_houses = get_user_recommendation(n=10, user_config=user_config)
final_houses.show()

Filtering on: property_type
Done! Data shape: (3725, 16)

Filtering on: size
Done! Data shape: (869, 16)

Filtering on: capacity
Done! Data shape: (664, 16)

Filtering on: is_furnished
Done! Data shape: (664, 16)

Filtering on: yearly_price
Done! Data shape: (6, 16)

Final Data Shape: (6, 16)

Sort the data based on user distance
Done! Final data shape: (6, 17)
+--------------------+-------------+--------------+-------------------+-----------+-------+------------------+------------------+--------+---------+-----+--------------+--------+------+------------+------------------------+--------+
|            ads_name|property_type|apartment_name|               area|     region|   city|          latitude|         longitude|bedrooms|bathrooms| size|  is_furnished|capacity|rating|yearly_price|property_management_type|distance|
+--------------------+-------------+--------------+-------------------+-----------+-------+------------------+------------------+--------+---------+-----+--------------+-

# 4. Export the Promising State

In [310]:
sales = spark.read.csv('dataset/Amazon Sale Report.csv',
                       header = True,
                       inferSchema = True)

print('Data shape:', data_shape(sales))
sales.show()

Data shape: (128975, 24)
+-----+-------------------+--------+--------------------+----------+--------------+------------------+--------+-------------------+-------------+----+----------+--------------+---+--------+------+-----------+--------------+----------------+------------+--------------------+-----+------------+-----------+
|index|           Order ID|    Date|              Status|Fulfilment|Sales Channel |ship-service-level|   Style|                SKU|     Category|Size|      ASIN|Courier Status|Qty|currency|Amount|  ship-city|    ship-state|ship-postal-code|ship-country|       promotion-ids|  B2B|fulfilled-by|Unnamed: 22|
+-----+-------------------+--------+--------------------+----------+--------------+------------------+--------+-------------------+-------------+----+----------+--------------+---+--------+------+-----------+--------------+----------------+------------+--------------------+-----+------------+-----------+
|    0|405-8078784-5731545|04-30-22|           Cancelled|

In [316]:
(sales.limit(10).write
    .option('header', 'true')
    .csv('/dataset_final'))