# Demo 4: Apache Cassandra

### Use the Python driver for Apache Cassandra (the `cassandra` package, installed as `cassandra-driver`) to run the Apache Cassandra queries. This library should be preinstalled, but if it is not, you can install it locally by running this command in a notebook cell: `! pip install cassandra-driver`
More documentation can be found here:  https://datastax.github.io/python-driver/

### Import Apache Cassandra python package

In [3]:
! pip install cassandra-driver



In [4]:
import cassandra

### Create a connection to the database
1. Connect to the local instance of Apache Cassandra *['127.0.0.1']*.
2. Once we get back the cluster object, we need to connect and that will create our session that we will use to execute queries.<BR><BR>
    
*Note 1:* This block of code will be standard in all notebooks

In [1]:
from cassandra.cluster import Cluster
# Connect to the Cassandra node on localhost and open the session object
# that the following cells use to execute statements.
try:
    cluster = Cluster(contact_points=['127.0.0.1'], port=9042)
    session = cluster.connect()
except Exception as err:
    # The error is only reported, not re-raised; `cluster`/`session` may be
    # left undefined when the connection attempt fails.
    print(err)
 

In [2]:
# List the keyspaces that currently exist.
try:
    rows = session.execute("""DESCRIBE keyspaces""")
except Exception as e:
    print(e)
else:
    # Iterate only when the query succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(keyspace_name='demokeyspace', type='keyspace', name='demokeyspace')
Row(keyspace_name='system', type='keyspace', name='system')
Row(keyspace_name='system_auth', type='keyspace', name='system_auth')
Row(keyspace_name='system_distributed', type='keyspace', name='system_distributed')
Row(keyspace_name='system_schema', type='keyspace', name='system_schema')
Row(keyspace_name='system_traces', type='keyspace', name='system_traces')
Row(keyspace_name='system_views', type='keyspace', name='system_views')
Row(keyspace_name='system_virtual_schema', type='keyspace', name='system_virtual_schema')
Row(keyspace_name='university', type='keyspace', name='university')


In [7]:
# Read the keyspace definitions (including replication settings) from the
# system_schema.keyspaces table.
try:
    rows = session.execute("""SELECT * FROM system_schema.keyspaces""")
except Exception as e:
    print(e)
else:
    # Iterate only when the query succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(keyspace_name='system_auth', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '1')]))
Row(keyspace_name='system_schema', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.LocalStrategy')]))
Row(keyspace_name='system_distributed', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')]))
Row(keyspace_name='system', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.LocalStrategy')]))
Row(keyspace_name='system_traces', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '2')]))


### Create a keyspace to work in
*Note:* We will ignore the Replication Strategy and factor information right now as those concepts will be covered in detail as we go along. Remember, this will be the strategy and replication factor on a one node local instance. 

In [3]:
# Create the demo keyspace (IF NOT EXISTS leaves an existing keyspace as is),
# then list all keyspaces with their replication settings.
try:
    session.execute("""
    CREATE KEYSPACE IF NOT EXISTS demokeyspace 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }""")
    # Alternative replication maps that could be used instead:
    #  { 'class' : 'NetworkTopologyStrategy', 'datacenter1' : 3 }
    #  { 'class' : 'NetworkTopologyStrategy', 'DCSydney' : 3, 'DCSingapore' : 1, 'DCUSA' : 1 }

    rows = session.execute("""SELECT * FROM system_schema.keyspaces""")
except Exception as e:
    print(e)
else:
    # Iterate only when both statements succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(keyspace_name='system_auth', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '1')]))
Row(keyspace_name='demokeyspace', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '2')]))
Row(keyspace_name='system_schema', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.LocalStrategy')]))
Row(keyspace_name='system_distributed', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')]))
Row(keyspace_name='system', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.LocalStrategy')]))
Row(keyspace_name='system_traces', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('re

In [4]:
# Change the replication factor of demokeyspace to 2, then list all keyspaces
# so the new setting can be inspected.
try:
    session.execute("""ALTER KEYSPACE demokeyspace 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 2 }""")
    rows = session.execute("""SELECT * FROM system_schema.keyspaces""")
except Exception as e:
    print(e)
else:
    # Iterate only when both statements succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(keyspace_name='system_auth', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '1')]))
Row(keyspace_name='demokeyspace', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '2')]))
Row(keyspace_name='system_schema', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.LocalStrategy')]))
Row(keyspace_name='system_distributed', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')]))
Row(keyspace_name='system', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.LocalStrategy')]))
Row(keyspace_name='system_traces', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('re

### Connect to our Keyspace.<br>
*Compare this to how a new session in PostgreSQL is created.*

In [5]:
# Point the session at demokeyspace for the statements in the following cells.
try:
    session.set_keyspace('demokeyspace')
except Exception as err:
    # Report the problem without stopping the notebook.
    print(err)

### Begin with creating a Music Library of albums. Each album has a lot of information we could add to the music library table. We will  start with album name, artist name, year. 

### But ...Stop

### We are working with Apache Cassandra, a NoSQL database. We can't model our data and create our table without more information.

### Think about what queries you will be performing on this data.

#### We want to be able to get every album that was released in a particular year. 
`select * from music_library WHERE YEAR=1970`

*To do that:* <ol><li> We need to be able to do a WHERE on YEAR. <li>YEAR will become my partition key,<li>artist name will be my clustering column to make each Primary Key unique. <li>**Remember there are no duplicates in Apache Cassandra.**</ol>

**Table Name:** music_library<br>
**column 1:** Album Name<br>
**column 2:** Artist Name<br>
**column 3:** Year <br>
PRIMARY KEY(year, artist name)


### Now to translate this information into a Create Table Statement. 
More information on Data Types can be found here: https://datastax.github.io/python-driver/<br>

In [6]:
# Table definition: `year` is the partition key, `artist_name` the clustering column.
query = """CREATE TABLE IF NOT EXISTS music_library (year int, artist_name text, album_name text, 
PRIMARY KEY (year,artist_name));"""

# Alternative definition (kept for reference): composite partition key
# (year, artist_name) with album_name as a descending clustering column.
# query = """CREATE TABLE IF NOT EXISTS music_library (year int, artist_name text, album_name text, 
# PRIMARY KEY ((year,artist_name), album_name)) WITH clustering ORDER BY(album_name DESC);"""

try:
    # Drop any existing table first so the cell always starts from an empty table.
    session.execute("drop table if exists music_library")
    session.execute(query)
except Exception as err:
    print(err)

### Insert a record

In [7]:
# INSERT statement with %s placeholders; the values are passed separately to
# session.execute rather than being formatted into the string here.
query = (
    "INSERT INTO music_library (year, artist_name, album_name)"
    " VALUES (%s, %s, %s)"
)

try:
    session.execute(query, (1970, "The Beatles", "Let it Be"))
except Exception as err:
    print(err)

### Validate your data was inserted into the table.

In [8]:
# Read back every row of music_library to confirm the insert.
query = 'SELECT * FROM music_library'
try:
    rows = session.execute(query)
except Exception as e:
    print(e)
else:
    # Iterate only when the query succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row.year, row.album_name, row.artist_name)

1970 Let it Be The Beatles


#### Altering a  table and adding or dropping a new column

In [9]:
# Add an integer `sales` column, then show the table contents.
query = "ALTER TABLE music_library ADD  sales int "
try:
    session.execute(query)
    rows = session.execute('SELECT * FROM music_library')
except Exception as e:
    print(e)
else:
    # Iterate only when both statements succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(year=1970, artist_name='The Beatles', album_name='Let it Be', sales=None)


#### Adding Multiple Records at a time 

In [10]:
from cassandra.query import BatchStatement

# Each record is [year, artist_name, album_name, sales]. The last two records
# share the primary key (2002, 'G'), so only one row for that key shows up in
# the SELECT below.
data = [[2001,'A','B',1000],
        [2002,'C','D',5000],
        [2001,'E','H',2000],
        [2002,'G','F',1000],
        [2002,'G','F',5000]] 

prepared = session.prepare("INSERT INTO music_library (year, artist_name, album_name, sales) VALUES (?,?, ?, ?)")
try:
    batch = BatchStatement()
    # Bind every record to the prepared INSERT and queue it in the batch.
    for record in data:
        batch.add(prepared, tuple(record))

    session.execute(batch)
    rows = session.execute('SELECT * FROM music_library')
except Exception as e:
    print(e)
else:
    # Iterate only when the batch and the SELECT succeeded; otherwise `rows`
    # would be undefined (fresh kernel) or left over from a previous cell.
    for row in rows:
        print(row)

Row(year=2001, artist_name='A', album_name='B', sales=1000)
Row(year=2001, artist_name='E', album_name='H', sales=2000)
Row(year=2002, artist_name='C', album_name='D', sales=5000)
Row(year=2002, artist_name='G', album_name='F', sales=5000)
Row(year=1970, artist_name='The Beatles', album_name='Let it Be', sales=None)


### Data Filtering (Using Partitioning Columns)

In [17]:
# Keep exactly ONE of the assignments below active. If all of them are
# commented out, the cell silently re-runs whatever `query` an earlier cell
# left behind. The active query filters on the clustering column without the
# partition key, which the server rejects unless ALLOW FILTERING is added.
# query = "select * from music_library WHERE year=1970"
query = "select * from music_library WHERE artist_name='A'"
# query = "select * from music_library WHERE artist_name='C' ALLOW FILTERING"
# query = "select * from music_library WHERE album_name='B'"
# query = "select * from music_library WHERE YEAR=1970 and artist_name='The Beatles'"
# query = "select * from music_library WHERE YEAR=2002 and artist_name='C' and album_name = 'D'"
try:
    rows = session.execute(query)
except Exception as e:
    print(e)
else:
    # Iterate only when the query succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row.year, row.album_name, row.artist_name)

Error from server: code=2200 [Invalid query] message="Cannot execute this query as it might involve data filtering and thus may have unpredictable performance. If you want to execute this query despite the performance unpredictability, use ALLOW FILTERING"


### Creating an Index

In [18]:
# Create a secondary index named `indexname` on the album_name column.
try:
    session.execute('CREATE INDEX indexname ON music_library  (album_name);')
except Exception as err:
    # Report the problem without stopping the notebook.
    print(err)

### Order By

In [28]:
# Keep exactly ONE of the assignments below active. If all of them are
# commented out, the cell silently re-runs whatever `query` an earlier cell
# left behind. The active query restricts the partition key and orders by the
# clustering column.
#query = 'SELECT * FROM music_library ORDER BY YEAR'
#query = 'SELECT * FROM music_library WHERE YEAR IN (2001,2002)'
#query = 'SELECT * FROM music_library WHERE YEAR IN (2001,2002) ORDER BY YEAR DESC'
#query = 'SELECT * FROM music_library WHERE YEAR IN (2001,2002) ORDER BY ARTIST_NAME DESC'
#query = 'SELECT * FROM music_library ORDER BY ALBUM_NAME'
#query = 'SELECT * FROM music_library WHERE YEAR = 2001 ORDER BY ALBUM_NAME'
query = 'SELECT * FROM music_library WHERE YEAR = 2001 ORDER BY ARTIST_NAME'

try:
    rows = session.execute(query)
except Exception as e:
    print(e)
else:
    # Iterate only when the query succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row.year, row.album_name, row.artist_name)

2001 B A
2001 H E


### Group By

In [31]:
# Keep exactly ONE of the assignments below active; the other variants are
# kept for experimenting with different GROUP BY columns.
#query = 'SELECT * FROM music_library Group By Year'
#query = 'SELECT * FROM music_library Group By artist_name'
query = 'SELECT * FROM music_library Group By year, artist_name'
# query = 'SELECT * FROM music_library Group By album_name'

try:
    rows = session.execute(query)
except Exception as e:
    print(e)
else:
    # Iterate only when the query succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row.year, row.album_name, row.artist_name)

2001 B A
2001 H E
2002 D C
2002 F G
1970 Let it Be The Beatles


### Aggregations 

In [44]:
# Keep exactly ONE of the assignments below active. If all of them are
# commented out, the cell silently re-runs whatever `query` an earlier cell
# left behind (the GROUP BY query, on a top-to-bottom run).
# query = 'SELECT count(*) FROM music_library'
# query = 'SELECT max(year) FROM music_library'
# query = 'SELECT min(year) FROM music_library'
# query = 'SELECT sum(sales) FROM music_library'
# query = 'SELECT avg(sales) FROM music_library'
# query = 'SELECT avg(sales) FROM music_library where year = 2001'
query = 'SELECT avg(sales) FROM music_library where year in (2001,2002)'


try:
    rows = session.execute(query)
except Exception as e:
    print(e)
else:
    # Iterate only when the query succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(system_avg_sales=3250)


### Altering Records 

In [49]:
# The active UPDATE names both primary-key columns (year and artist_name);
# the commented variant restricts only `year` and is kept for comparison.
# query = 'UPDATE music_library SET sales=10000 WHERE year = 1970'
query = "UPDATE music_library SET sales=10000 WHERE year = 1970 and artist_name = 'The Beatles'"

try:
    session.execute(query)
    rows = session.execute('SELECT * FROM music_library')
except Exception as e:
    print(e)
else:
    # Iterate only when both statements succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(year=2001, artist_name='A', album_name='B', sales=1000)
Row(year=2001, artist_name='E', album_name='H', sales=2000)
Row(year=2002, artist_name='C', album_name='D', sales=5000)
Row(year=2002, artist_name='G', album_name='F', sales=5000)
Row(year=1970, artist_name='The Beatles', album_name='Let it Be', sales=10000)


### Drop the table and keyspace to avoid duplicates and clean up. 

In [50]:
# Remove the demo table as part of the clean-up.
query = "drop table music_library"
try:
    session.execute(query)
except Exception as err:
    # Report the problem without stopping the notebook.
    print(err)
    

In [51]:
# Drop the demo keyspace, then list the remaining keyspaces to confirm it is gone.
try:
    session.execute("""DROP KEYSPACE demokeyspace""")
    rows = session.execute("""DESCRIBE keyspaces""")
except Exception as e:
    print(e)
else:
    # Iterate only when both statements succeeded; otherwise `rows` would be
    # undefined (fresh kernel) or left over from a previously run cell.
    for row in rows:
        print(row)

Row(keyspace_name='system', type='keyspace', name='system')
Row(keyspace_name='system_auth', type='keyspace', name='system_auth')
Row(keyspace_name='system_distributed', type='keyspace', name='system_distributed')
Row(keyspace_name='system_schema', type='keyspace', name='system_schema')
Row(keyspace_name='system_traces', type='keyspace', name='system_traces')
Row(keyspace_name='system_views', type='keyspace', name='system_views')
Row(keyspace_name='system_virtual_schema', type='keyspace', name='system_virtual_schema')


### Close the session and cluster connection

In [53]:
# Release the connection resources: close the session first, then the
# cluster object it was created from.
session.shutdown()
cluster.shutdown()