In [None]:
import os
import glob
import csv
from cassandra.cluster import Cluster

In [None]:
# Raw event logs are expected in ./event_data, relative to the notebook's working
# directory. os.path.join keeps the separator correct on every platform.
events_file_path = os.path.join(os.getcwd(), 'event_data')

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# row order of the generated CSV reproducible from run to run.
events_files = sorted(glob.glob(os.path.join(events_file_path, '*')))

In [None]:
def read_events_files(files):
    """Read every CSV file in ``files`` and return all data rows as one list.

    The first line of each file is treated as a header and dropped.

    Args:
        files: iterable of paths to UTF-8 encoded CSV files.

    Returns:
        A list of rows (each a list of strings), in the order the files were
        given and, within a file, in file order.
    """
    all_rows = []

    for file in files:
        with open(file, "r", encoding="utf8", newline="") as csvfile:
            csv_reader = csv.reader(csvfile)
            # The None default matters: an empty file has no header line, and a
            # bare next() would abort the whole load with StopIteration.
            next(csv_reader, None)
            all_rows.extend(csv_reader)

    return all_rows


def events_preprocessing(all_rows, output_path="event_datafile_new.csv"):
    """Write a reduced CSV that keeps only the columns used by the tables below.

    Rows whose first field (the artist) is empty are dropped, and so are rows
    that are completely empty, so the result should be smaller than the raw data.

    Args:
        all_rows: iterable of raw event rows (sequences of strings). Every
            non-empty row must have at least 17 fields, because index 16 is read.
        output_path: destination file; it is overwritten if it already exists.

    Raises:
        IndexError: if a non-empty row has fewer than 17 fields.
    """
    column_names = [
        "artist",
        "firstName",
        "gender",
        "itemInSession",
        "lastName",
        "length",
        "level",
        "location",
        "sessionId",
        "song",
        "userId",
    ]
    # Positions in a raw row of the values written under column_names (same order).
    source_indexes = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)

    # Registered under the same name on every call; re-registering simply replaces it.
    csv.register_dialect(
        "myDialect", quoting=csv.QUOTE_ALL, skipinitialspace=True)

    with open(output_path, "w", encoding="utf8", newline="") as f:
        writer = csv.writer(f, dialect="myDialect")
        writer.writerow(column_names)
        for row in all_rows:
            # csv.reader yields [] for a blank line; test for that first so
            # row[0] cannot raise IndexError.
            if not row or row[0] == "":
                continue
            writer.writerow([row[index] for index in source_indexes])


# Load every raw event row, then write the reduced CSV that the insert cells read.
all_rows = read_events_files(files=events_files)
events_preprocessing(all_rows=all_rows)

In [None]:
def create_keyspace():
    """Connect to Cassandra on 127.0.0.1 and select the ``sparkify`` keyspace.

    The keyspace is created first if it does not exist. A failure in either
    step is printed rather than raised, so a session is returned regardless.

    Returns:
        The session obtained from ``Cluster.connect()``.
    """
    keyspace_cql = """
        CREATE KEYSPACE IF NOT EXISTS sparkify 
        WITH REPLICATION = 
        { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }
        """

    session = Cluster(['127.0.0.1']).connect()

    # Both steps are attempted independently; an error in the first does not
    # prevent the second from being tried.
    steps = (
        (session.execute, keyspace_cql),
        (session.set_keyspace, 'sparkify'),
    )
    for step, argument in steps:
        try:
            step(argument)
        except Exception as error:
            print(error)

    return session


# Module-level session used by every cell below; create_table() reads it as a global.
session = create_keyspace()

In [None]:
def create_table(table_name: str, query: str):
    """Create ``table_name`` if it does not already exist.

    Args:
        table_name: name of the table to create.
        query: the text that follows the table name in the CREATE statement,
            i.e. the parenthesised column list plus any table options.

    Uses the module-level ``session``. An error raised while executing the
    statement is printed, not propagated.
    """
    statement = "CREATE TABLE IF NOT EXISTS {} {};".format(table_name, query)
    try:
        session.execute(statement)
    except Exception as error:
        print(error)


# For each table column name (lower-cased): the position of its value in a row of
# the preprocessed events CSV, and the converter that turns the CSV text into the
# type used for that column. The positions follow the CSV header order:
# artist, firstName, gender, itemInSession, lastName, length, level, location,
# sessionId, song, userId.
_EVENT_CSV_FIELDS = {
    "artist": (0, str),
    "artist_name": (0, str),
    "firstname": (1, str),
    "gender": (2, str),
    "iteminsession": (3, int),
    "lastname": (4, str),
    "length": (5, float),
    "song_length": (5, float),
    "level": (6, str),
    "location": (7, str),
    "sessionid": (8, int),
    "song": (9, str),
    "userid": (10, int),
}


def insert_file_into_table(file_path, session, table_name, query):
    """Insert every data row of the preprocessed events CSV into ``table_name``.

    Args:
        file_path: path to the preprocessed events CSV (header on the first line).
        session: object with an ``execute(statement, parameters)`` method.
        table_name: target table.
        query: comma-separated list of the table columns to fill, e.g.
            ``"sessionId, itemInSession, artist_name, song, song_length"``.
            Names are matched case-insensitively against the known CSV fields.

    Raises:
        ValueError: if ``query`` names a column with no known CSV field.
    """
    columns = [name.strip() for name in query.split(",")]
    unknown = [name for name in columns if name.lower() not in _EVENT_CSV_FIELDS]
    if unknown:
        raise ValueError(
            "no CSV field known for column(s): {}".format(", ".join(unknown)))
    fields = [_EVENT_CSV_FIELDS[name.lower()] for name in columns]

    # Build the statement once. Rebuilding it from `query` inside the loop nested
    # the previous statement into the column list from the second row onwards.
    # One placeholder per requested column, so any column list stays consistent.
    placeholders = ", ".join(["%s"] * len(columns))
    insert_statement = "INSERT INTO {}({}) VALUES ({})".format(
        table_name, query, placeholders)

    with open(file_path, encoding="utf8", newline="") as f:
        csvreader = csv.reader(f)
        next(csvreader, None)  # skip the header; tolerate an empty file
        for row in csvreader:
            if not row:
                # blank line in the CSV
                continue
            values = tuple(convert(row[index]) for index, convert in fields)
            session.execute(insert_statement, values)

### Create sessions table and insert data 

In [None]:
# sessions: looked up by (sessionId, itemInSession). sessionId is the partition
# key and itemInSession the clustering column.
table_name = "sessions"
file_path = "event_datafile_new.csv"
insert_query = "sessionId, itemInSession, artist_name, song, song_length"
create_query = (
    "(sessionId int, itemInSession int, artist_name text, song text, song_length float, "
    "PRIMARY KEY (sessionId, itemInSession))"
)

create_table(table_name, create_query)
insert_file_into_table(file_path, session, table_name, insert_query)

In [None]:
# test insertion into sessions
sessions_test_query = "select artist_name, song, song_length from sessions WHERE sessionId = 338 and itemInSession = 4"

try:
    rows = session.execute(sessions_test_query)
except Exception as e:
    print(e)
else:
    # Print only when the query succeeded. Otherwise `rows` would be undefined
    # (NameError) or still hold the result of an earlier cell.
    for row in rows:
        print(row)

### Create song_playlist table and insert data 

In [None]:

# song_playlist: partitioned by (userid, sessionid); rows inside a partition are
# clustered by iteminsession in descending order.
song_playlist_table = "song_playlist"
song_playlist_insert_query = "userid, sessionid, iteminsession, firstname, lastname,  artist_name, song"
song_playlist_create_query = (
    "(userid int, sessionid int, iteminsession int, firstname text, lastname text,  artist_name text, song text,"
    "PRIMARY KEY((userid, sessionid), iteminsession)) WITH CLUSTERING ORDER BY (iteminsession DESC)"
)

create_table(song_playlist_table, song_playlist_create_query)
insert_file_into_table(
    file_path,
    session,
    song_playlist_table,
    song_playlist_insert_query,
)

In [None]:
# test insertion into song_playlist
# The table created above is named song_playlist (not song_playlist_session).
song_playlist_test_query = "select artist_name, song, firstname,lastname, iteminsession from song_playlist where userid=10 and sessionid=182 "

try:
    rows = session.execute(song_playlist_test_query)
except Exception as e:
    print(e)
else:
    # Print only when the query succeeded, so a failure cannot silently show the
    # rows left over from an earlier cell.
    for row in rows:
        print(row)

### Create users_playlist table and insert data 

In [None]:
# users_playlist: partitioned by song; rows inside a partition are clustered by
# userid in descending order.
users_playlist_table = "users_playlist"
users_playlist_insert_query = "song, userid, firstname, lastname"
users_playlist_create_query = (
    "(song text,  userid int, firstname text, lastname text,  "
    "PRIMARY KEY (song, userid)) WITH CLUSTERING ORDER BY (userid DESC)"
)

create_table(users_playlist_table, users_playlist_create_query)
insert_file_into_table(
    file_path,
    session,
    users_playlist_table,
    users_playlist_insert_query,
)

In [None]:
# test insertion into users_playlist
users_playlist_test_query = "select userid, firstname, lastname from users_playlist where song='All Hands Against His Own'"

try:
    # Run this cell's own query (previously song_playlist_test_query was executed here).
    rows = session.execute(users_playlist_test_query)
except Exception as e:
    print(e)
else:
    # Print only when the query succeeded, so a failure cannot silently show the
    # rows left over from an earlier cell.
    for row in rows:
        print(row)

### Drop Tables

In [None]:
def drop_table(table_name, session):
    """Drop ``table_name`` using ``session``.

    An error raised while executing the statement (for example because the
    table does not exist) is printed, not propagated. Returns None.
    """
    statement = "drop table {}".format(table_name)
    try:
        session.execute(statement)
    except Exception as error:
        print(error)


# Remove every table this notebook created.
tables_to_drop = [
    "sessions",
    "song_playlist",
    "users_playlist",
]
for table in tables_to_drop:
    drop_table(table_name=table, session=session)

In [None]:
# Close the session, then the Cluster behind it. create_keyspace() keeps the
# Cluster object local, so session.cluster is the only handle left for releasing
# its connections.
session.shutdown()
session.cluster.shutdown()