# Library Setup

In [None]:
!pip install duckdb --upgrade



In [None]:
!pip install polars --upgrade

Collecting polars
  Downloading polars-1.11.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading polars-1.11.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.0/34.0 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 1.7.1
    Uninstalling polars-1.7.1:
      Successfully uninstalled polars-1.7.1
Successfully installed polars-1.11.0


In [None]:
import duckdb
import polars as pl
from IPython.display import HTML

In [None]:
from google.colab import drive

# Mount Google Drive at the path the CSV locations below are rooted in
# (force_remount=True is passed on every run).
drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive/


# Table Preparation

In [None]:
# Open a DuckDB connection (no database path is passed)
conn = duckdb.connect()

# Temporary staging table for the motorcycle sales CSV.
# price / qty / total are declared DOUBLE instead of FLOAT: DuckDB's FLOAT is a
# 4-byte single-precision type, while the monetary values in this data reach
# eight digits, so double precision keeps them (and any sums) exact.
# qty stays a floating type because the CSV's qty formatting is not visible here.
create_motorcycle_table_query = """
CREATE TEMPORARY TABLE motorcycle (
    sale_id INTEGER,
    purchase_date DATE,
    motorcycle_name VARCHAR,
    motorcycle_group VARCHAR,
    dealer_origin VARCHAR,
    price DOUBLE,
    qty DOUBLE,
    total DOUBLE
);
"""

# Create the table; the trailing semicolon keeps the returned connection
# object from being echoed as the cell's output
conn.execute(create_motorcycle_table_query);

<duckdb.duckdb.DuckDBPyConnection at 0x787989e3c1f0>

In [None]:
# Location of the source CSV on the mounted Drive
motorcycle_csv_file_path = "/content/drive/MyDrive/Nawatech Technical Test/motorcycle.csv"

# COPY statement that bulk-loads the CSV into the 'motorcycle' table
load_motorcycle_table_query = "COPY motorcycle FROM '{}';".format(motorcycle_csv_file_path)

# Run the load
conn.execute(load_motorcycle_table_query)

<duckdb.duckdb.DuckDBPyConnection at 0x787989e3c1f0>

In [None]:
# Preview the first five rows to confirm the load put data in the table
preview_relation = conn.sql("SELECT * FROM motorcycle LIMIT 5;")
preview_relation.show()

┌─────────┬───────────────┬─────────────────┬──────────────────┬───────────────┬────────────┬───────┬────────────┐
│ sale_id │ purchase_date │ motorcycle_name │ motorcycle_group │ dealer_origin │   price    │  qty  │   total    │
│  int32  │     date      │     varchar     │     varchar      │    varchar    │   float    │ float │   float    │
├─────────┼───────────────┼─────────────────┼──────────────────┼───────────────┼────────────┼───────┼────────────┤
│       1 │ 2023-12-25    │ riorio          │ matic            │ bogor         │ 20000000.0 │   3.0 │ 20000000.0 │
│       3 │ 2023-12-20    │ riorio          │ matic            │ bogor         │ 20000000.0 │   2.0 │ 40000000.0 │
│       4 │ 2023-11-15    │ bitbit          │ matic            │ denpasar      │ 17000000.0 │   4.0 │ 68000000.0 │
│       5 │ 2023-12-11    │ bitbit          │ matic            │ bogor         │ 17000000.0 │   4.0 │ 34000000.0 │
│       7 │ 2023-11-07    │ vovo            │ cub              │ denpasar      │

# Data Inspection and Transformation

**Basic Data Inspection: Summary Statistics**

In [None]:
# Pull the whole 'motorcycle' table out of DuckDB as a Polars DataFrame
motorcycle_relation = conn.sql("SELECT * FROM motorcycle;")
motorcycle_df = motorcycle_relation.pl()

In [None]:
# Summary statistics for every column (string and numeric alike)
summary = motorcycle_df.describe()

# A bare last expression lets the notebook render the frame's rich HTML repr
# itself, instead of calling the _repr_html_ hook by hand; this also matches
# how the other cells in this notebook display frames.
summary

statistic,sale_id,purchase_date,motorcycle_name,motorcycle_group,dealer_origin,price,qty,total
str,f64,str,str,str,str,f64,f64,f64
"""count""",2677.0,"""2677""","""2677""","""2677""","""2677""",2677.0,2677.0,2677.0
"""null_count""",0.0,"""0""","""0""","""0""","""0""",0.0,0.0,0.0
"""mean""",2478.749346,"""2023-11-30 02:51:03.429000""",,,,21398954.0,5.955547,56579008.0
"""std""",1451.349969,,,,,6229538.0,2.967544,21998560.0
"""min""",1.0,"""2023-11-01""","""arpus""","""cub""","""bandung""",15000000.0,1.0,15000000.0
"""25%""",1220.0,"""2023-11-15""",,,,17000000.0,3.0,40000000.0
"""50%""",2448.0,"""2023-11-29""",,,,20000000.0,6.0,60000000.0
"""75%""",3728.0,"""2023-12-15""",,,,20000000.0,9.0,75000000.0
"""max""",4998.0,"""2023-12-30""","""vovo""","""sport""","""palembang""",34000000.0,11.0,99000000.0


**Check for Missing Values**

In [None]:
# Number of null values in each column
null_counts = motorcycle_df.null_count()

# A bare last expression lets the notebook render the frame's rich HTML repr
# itself, instead of calling the _repr_html_ hook by hand; this also matches
# how the other cells in this notebook display frames.
null_counts

sale_id,purchase_date,motorcycle_name,motorcycle_group,dealer_origin,price,qty,total
u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0


**Check the "purchase_date" timeframe (Nov - Dec 2023)**

In [None]:
# Flag each row by whether purchase_date lies between 1 Nov 2023 and 31 Dec 2023
window_start = pl.date(2023, 11, 1)
window_end = pl.date(2023, 12, 31)
in_window = pl.col("purchase_date").is_between(window_start, window_end)
temp_df = motorcycle_df.with_columns(in_window.alias("is_purchase_date_correct"))

temp_df

sale_id,purchase_date,motorcycle_name,motorcycle_group,dealer_origin,price,qty,total,is_purchase_date_correct
i32,date,str,str,str,f32,f32,f32,bool
1,2023-12-25,"""riorio""","""matic""","""bogor""",2e7,3.0,2e7,true
3,2023-12-20,"""riorio""","""matic""","""bogor""",2e7,2.0,4e7,true
4,2023-11-15,"""bitbit""","""matic""","""denpasar""",1.7e7,4.0,6.8e7,true
5,2023-12-11,"""bitbit""","""matic""","""bogor""",1.7e7,4.0,3.4e7,true
7,2023-11-07,"""vovo""","""cub""","""denpasar""",1.5e7,10.0,1.5e7,true
…,…,…,…,…,…,…,…,…
4991,2023-12-07,"""riorio""","""matic""","""bogor""",2e7,5.0,4e7,true
4992,2023-12-03,"""bitbit""","""matic""","""denpasar""",1.7e7,3.0,6.8e7,true
4993,2023-11-16,"""riorio""","""matic""","""bogor""",2e7,7.0,6e7,true
4994,2023-11-08,"""bitbit""","""matic""","""palembang""",1.7e7,1.0,8.5e7,true


In [None]:
# Keep only the rows flagged as outside the expected window, then count them
outside_window = temp_df.filter(~pl.col("is_purchase_date_correct"))
out_of_timeframe_count = outside_window.height

print(f"The row count for out of timeframe column: {out_of_timeframe_count}")

The row count for out of timeframe column: 0


**Check the calculation of the "total" column**

In [None]:
# Recompute the expected total (price * qty), then flag the rows whose recorded
# total is exactly equal to it. The flag must be added in a second step because
# it reads the 'correct_total' column created by the first.
expected_total = (pl.col("price") * pl.col("qty")).alias("correct_total")
matches_expected = (pl.col("total") == pl.col("correct_total")).alias("is_total_correct")
temp_df = temp_df.with_columns(expected_total).with_columns(matches_expected)

# Show the recorded and recomputed totals side by side
temp_df.select("sale_id", "total", "correct_total", "is_total_correct")

sale_id,total,correct_total,is_total_correct
i32,f32,f32,bool
1,2e7,6e7,false
3,4e7,4e7,true
4,6.8e7,6.8e7,true
5,3.4e7,6.8e7,false
7,1.5e7,1.5e8,false
…,…,…,…
4991,4e7,1e8,false
4992,6.8e7,5.1e7,false
4993,6e7,1.4e8,false
4994,8.5e7,1.7e7,false


In [None]:
# Count the rows whose recorded total disagrees with price * qty
mismatched_rows = temp_df.filter(~pl.col("is_total_correct"))
incorrect_count = mismatched_rows.height

print(f"The row count of incorrect calculation column: {incorrect_count}")

The row count of incorrect calculation column: 2443


**Check other column values**

In [None]:
# Distinct motorcycle names present in the data
distinct_names = pl.col("motorcycle_name").unique()
unique_motorcycles = temp_df.select(distinct_names)

unique_motorcycles

motorcycle_name
str
"""vovo"""
"""arpus"""
"""pax"""
"""bitbit"""
"""riorio"""
"""deva"""
"""cb-150r"""


In [None]:
# Distinct motorcycle groups present in the data
distinct_groups = pl.col("motorcycle_group").unique()
unique_motorcycle_group = temp_df.select(distinct_groups)

unique_motorcycle_group

motorcycle_group
str
"""cub"""
"""sport"""
"""matic"""


In [None]:
# Distinct dealer origins present in the data
distinct_origins = pl.col("dealer_origin").unique()
unique_dealer_origin = temp_df.select(distinct_origins)

unique_dealer_origin

dealer_origin
str
"""denpasar"""
"""jakarta"""
"""bogor"""
"""palembang"""
"""bandung"""
"""medan"""


**Data Transformation**

In [None]:
# Rows flagged as miscalculated get price * qty; every other row keeps its recorded total
needs_fix = ~pl.col("is_total_correct")
recomputed_total = pl.col("price") * pl.col("qty")
motorcycle_df = temp_df.with_columns(
    pl.when(needs_fix)
    .then(recomputed_total)
    .otherwise(pl.col("total"))
    .alias("correct_total")
)

# Remove the helper flags and the original 'total', then promote 'correct_total' to 'total'
columns_to_drop = ["is_purchase_date_correct", "total", "is_total_correct"]
motorcycle_df = motorcycle_df.drop(columns_to_drop).rename({"correct_total": "total"})

In [None]:
# Display the transformed frame to check the corrected 'total' column
motorcycle_df

sale_id,purchase_date,motorcycle_name,motorcycle_group,dealer_origin,price,qty,total
i32,date,str,str,str,f32,f32,f32
1,2023-12-25,"""riorio""","""matic""","""bogor""",2e7,3.0,6e7
3,2023-12-20,"""riorio""","""matic""","""bogor""",2e7,2.0,4e7
4,2023-11-15,"""bitbit""","""matic""","""denpasar""",1.7e7,4.0,6.8e7
5,2023-12-11,"""bitbit""","""matic""","""bogor""",1.7e7,4.0,6.8e7
7,2023-11-07,"""vovo""","""cub""","""denpasar""",1.5e7,10.0,1.5e8
…,…,…,…,…,…,…,…
4991,2023-12-07,"""riorio""","""matic""","""bogor""",2e7,5.0,1e8
4992,2023-12-03,"""bitbit""","""matic""","""denpasar""",1.7e7,3.0,5.1e7
4993,2023-11-16,"""riorio""","""matic""","""bogor""",2e7,7.0,1.4e8
4994,2023-11-08,"""bitbit""","""matic""","""palembang""",1.7e7,1.0,1.7e7


**Save the transformed data**

In [None]:
# Single source of truth for the output location, so the file that is written
# and the confirmation message cannot drift apart
transformed_csv_file_path = '/content/drive/MyDrive/Nawatech Technical Test/transformed_motorcycle.csv'

# Write the transformed frame to CSV and report where it went
motorcycle_df.write_csv(transformed_csv_file_path)
print(f"Data has been saved to '{transformed_csv_file_path}'")

Data has been saved to '/content/drive/MyDrive/Nawatech Technical Test/transformed_motorcycle.csv'
