# Part I. ETL Pipeline for Pre-Processing the Files

In [6]:
import pandas as pd
from src.utils.cassandra.generic_commands import CassandraCommands
import re
import os
import glob
import numpy as np
import json
import csv
from pathlib import Path

#### Creating list of filepaths to process original event csv data files

In [7]:
# Root folder that holds the raw event CSV files (relative to the working directory)
event_data_file_path = Path('data/event_data')

# Collect every CSV file below the root folder.
# The list is created up front and extended per directory, so it exists even when the
# folder is missing or empty, and files from every walked directory are kept instead of
# only those of the last directory visited.
file_path_list = []
for root, dirs, files in os.walk(event_data_file_path):
    # Prune hidden folders (e.g. .ipynb_checkpoints) in place so os.walk does not descend
    # into them; sorting keeps the traversal order deterministic across runs.
    dirs[:] = sorted(d for d in dirs if not d.startswith('.'))
    # Use the file names reported by os.walk (never directories) and keep CSV files only
    file_path_list.extend(
        os.path.join(root, name)
        for name in sorted(files)
        if name.lower().endswith('.csv')
    )

#### Processing the files to create the data file csv that will be used for Apache Cassandra tables

In [35]:
# Positions of the columns kept from each raw event row, in output order.
# They line up one-to-one with the header names in OUTPUT_HEADER below.
KEPT_COLUMN_INDEXES = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)
OUTPUT_HEADER = ['artist','firstName','gender','itemInSession','lastName','length', 'level','location','sessionId','song','userId']

# Rows gathered from every source file (header rows excluded)
full_data_rows_list = []

# For every filepath in the file path list
for source_path in file_path_list:

    # Read the csv file
    with open(source_path, 'r', encoding = 'utf8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising StopIteration
        next(csvreader, None)

        # Extract every remaining row and append to the list
        full_data_rows_list.extend(csvreader)

# Creating one csv file called event_data_new that will be used to insert data into the Apache Cassandra tables
csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

with open('data/event_data_new.csv', 'w', encoding = 'utf8', newline='') as f:
    writer = csv.writer(f, dialect='myDialect')
    writer.writerow(OUTPUT_HEADER)
    for row in full_data_rows_list:
        # csv.reader yields [] for blank lines; skip those as well as rows whose first
        # column (written out as 'artist') is empty
        if not row or row[0] == '':
            continue
        writer.writerow([row[index] for index in KEPT_COLUMN_INDEXES])

In [36]:
# Count the physical lines of the generated csv file (the header line is included)
with open('data/event_data_new.csv', 'r', encoding = 'utf8') as event_file:
    line_count = 0
    for _ in event_file:
        line_count += 1
    print(line_count)

6821


# Part II. Complete the Apache Cassandra coding portion of your project. 

Columns:
- `artist`: Name of artist
- `firstName`: First name of user
- `gender`: Gender of user
- `itemInSession`: Item number in session
- `lastName`: Last name of user
- `length`: Duration of the song
- `level`: Paid or free song
- `location`: Location of the user
- `sessionId`: ID of the session
- `song`: Title of the song
- `userId`: ID of the user

The image below is a screenshot of what the denormalized data should appear like in the <font color=red>**data/event_data_new.csv**</font> after the code above is run:<br>

<img src="../../../images/cassandra_project.jpg">

## Begin writing your Apache Cassandra code in the cells below

#### Creating a Cluster

In [10]:
# Create cassandra class
# CassandraCommands is the project helper imported from src.utils.cassandra.generic_commands;
# every later cell talks to the database through this single `cassandra` object.
cassandra = CassandraCommands()

# Connect to cluster
# 127.0.0.1 is the loopback address, i.e. a Cassandra node running on this machine
cassandra.connect('127.0.0.1')

CassandraCommands class initiated
Connection setup with cassandra at 127.0.0.1


#### Create Keyspace

In [11]:
# Create the keyspace that holds the tables of this notebook.
# NOTE(review): 'engeneering' looks like a misspelling of 'engineering'; the same literal is
# used when selecting the keyspace, so both places must change together if it is renamed.
cassandra.create_keyspace('data_engeneering_nano_degree')

Keyspace named data_engeneering_nano_degree created


#### Set Keyspace

In [12]:
# Select the keyspace so the following table statements run inside it.
# The name must match the keyspace created in the previous step exactly.
cassandra.connect_keyspace('data_engeneering_nano_degree') 

Connected to keyspace: data_engeneering_nano_degree


#### Understand datasets unique rows

In [37]:
# Load the consolidated event file produced in Part I into a dataframe
all_event_data_path = Path('data/event_data_new.csv')
all_event_data_df = pd.read_csv(all_event_data_path)

# Report missing values per column and the overall number of rows
null_counts = all_event_data_df.isnull().sum()
total_rows = len(all_event_data_df)
print(f"Checking NULL values: \n\n{null_counts}\n")
print(f"Total number of rows: {total_rows}")

# If the number of (sessionId, itemInSession) groups equals the row count,
# that pair identifies a unique row
session_item_groups = all_event_data_df.groupby(['sessionId', 'itemInSession'])
unique_row_count = len(session_item_groups)
print(f"Total number of unique rows based on sessionId and itemInSessions: {unique_row_count}")

Checking NULL values: 

artist           0
firstName        0
gender           0
itemInSession    0
lastName         0
length           0
level            0
location         0
sessionId        0
song             0
userId           0
dtype: int64

Total number of rows: 6820
Total number of unique rows based on sessionId and itemInSessions: 6820


#### Create tables

#### 1. Music by session

This table provides the artist, song title and song length from the music app history and can be filtered by a specific session ID and item in session. Each row is uniquely identified by session ID and item in session. `sessionId` and `itemInSession` together form the composite partition key; no clustering columns are used.

The query we are modelling for is:

`SELECT artist, song, length FROM music_by_session WHERE sessionId = 388 and itemInSession = 4`

In [None]:
# Create table music_by_session
# The double parentheses in PRIMARY KEY make (sessionId, itemInSession) a composite
# partition key, matching the WHERE clause of the query this table is modelled for.
# NOTE(review): `length` is declared as CQL float (32-bit); the validation output shows
# 244.08770751953125 for a song length, which suggests precision is lost on insert.
# Consider `double` if exact values matter.
schema_music_by_session = '(sessionId int, itemInSession int, artist varchar, song varchar, length float, PRIMARY KEY ((sessionId, itemInSession)))'
cassandra.create_table(table_name='music_by_session', schema=schema_music_by_session)

In [90]:
# Insert data into music_by_session
for i, row in all_event_data_df.iterrows():
        cassandra.insert_rows(table_name='music_by_session', 
                              columns='(sessionId, itemInSession, artist, song, length)',
                              rows=[
                                   (row['sessionId'], row['itemInSession'], row['artist'], row['song'], row['length'])
                              ]
        )

In [41]:
# Validate data: sessionId 388 has no row with itemInSession = 4 in this dataset (the
# first query prints an empty list), so the same query is repeated with itemInSession = 1
validation_queries = (
    'SELECT artist, song, length FROM music_by_session WHERE sessionId = 388 AND itemInSession = 4',
    'SELECT artist, song, length FROM music_by_session WHERE sessionId = 388 AND itemInSession = 1',
)
for validation_query in validation_queries:
    print(cassandra.custom_query(query=validation_query))

[]
[Row(artist='Pulp', song='Babies', length=244.08770751953125)]


#### 2. Music by user and session

This table provides the artist, song title, user first name and user last name from the music app history and can be filtered by a specific user ID and session ID. Each row is uniquely identified by session ID and item in session. The primary key consists of `userId` and `sessionId` (composite partition key) together with `itemInSession` (clustering column), so the songs of a session are returned sorted by `itemInSession`.

The query we are modelling for is:

`SELECT artist, song, firstName, lastName FROM music_by_user_session WHERE userId = 10 and sessionId = 182`


In [24]:
# Create table music_by_user_session
# PRIMARY KEY ((userId, sessionId), itemInSession): userId and sessionId form the composite
# partition key used in the WHERE clause of the modelled query; itemInSession is the
# clustering column that makes each row within a partition unique.
schema_music_by_user_session = '(userId int, sessionId int, itemInSession int, artist varchar, song varchar, firstName varchar, lastName varchar, PRIMARY KEY ((userId, sessionId), itemInSession))'
cassandra.create_table(table_name='music_by_user_session', schema=schema_music_by_user_session )

Table named music_app_history_2 created


In [28]:
# Insert data to music_by_user_session
for i, row in all_event_data_df.iterrows():
        cassandra.insert_rows(table_name='music_by_user_session', 
                              columns='(userId, sessionId, itemInSession, artist, song, firstName, lastName)',
                              rows=[
                                   (row['userId'], row['sessionId'], row['itemInSession'], row['artist'], row['song'], row['firstName'], row['lastName'])
                              ]
        )

In [29]:
# Validate data
cassandra.custom_query(query='SELECT artist, song, firstName, lastName FROM music_by_user_session WHERE userId = 10 AND sessionId = 182')

[Row(artist='Down To The Bone', song="Keep On Keepin' On", firstname='Sylvie', lastname='Cruz'),
 Row(artist='Three Drives', song='Greece 2000', firstname='Sylvie', lastname='Cruz'),
 Row(artist='Sebastien Tellier', song='Kilometer', firstname='Sylvie', lastname='Cruz'),
 Row(artist='Lonnie Gordon', song='Catch You Baby (Steve Pitron & Max Sanna Radio Edit)', firstname='Sylvie', lastname='Cruz')]

#### 3. Music by song

This table provides the user first name and user last name from the music app history and can be filtered by a specific song title. Each row is uniquely identified by session ID and item in session. The primary key consists of `song` (partition key) together with `sessionId` and `itemInSession` (clustering columns).

The query we are modelling for is:

`SELECT firstName, lastName FROM music_by_song WHERE song = 'All Hands Against His Own'`


In [None]:
# Create table music_by_song
schema_music_by_song = '(song varchar, sessionId int, itemInSession int, firstName varchar, lastName varchar, PRIMARY KEY (song, sessionId, itemInSession))'

cassandra.create_table(table_name='music_by_song', schema=schema_music_by_song)

In [None]:
# Insert data in music_by_song
for i, row in all_event_data_df.iterrows():
        cassandra.insert_rows(table_name='music_by_song', 
                              columns='(song, sessionId, itemInSession, firstName, lastName)',
                              rows=[
                                   (row['song'], row['sessionId'], row['itemInSession'], row['firstName'], row['lastName'])
                              ]
        )

In [34]:
# Validate data
cassandra.custom_query(query="SELECT firstName, lastName FROM music_by_song WHERE song = 'All Hands Against His Own'")

[Row(firstname='Sara', lastname='Johnson'),
 Row(firstname='Jacqueline', lastname='Lynch'),
 Row(firstname='Tegan', lastname='Levine')]

#### Drop the tables before closing out the sessions

In [4]:
# Drop the three tables created in this notebook, in the order they were created
for table_to_drop in ('music_by_session', 'music_by_user_session', 'music_by_song'):
    cassandra.drop_table(table_name=table_to_drop)

#### Close the session and cluster connection

In [None]:
# Close the connection opened by cassandra.connect(); run this last, after the tables are dropped
cassandra.close_connection()