In [1]:
import psycopg2

In [2]:
# Open a session on the local sparkifydb database and create the cursor that
# every query cell below uses.
# NOTE(review): the DSN embeds a user/password pair; acceptable only for a
# local sandbox database -- do not copy this pattern for real credentials.
conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")
# This notebook only reads data. With autocommit on, each SELECT runs as its
# own statement, so the session does not sit in an open transaction (which can
# block other sessions that need to drop or alter these tables), and a single
# failed query does not leave the connection in an aborted-transaction state
# that breaks every later cell.
conn.autocommit = True
cur = conn.cursor()

_Validations on the raw data to understand what was loaded into the tables. Run this notebook after etl.py has been executed._

### Describing Songs Data set

In [5]:
# Total rows in `songs` next to the number of distinct titles; equal values
# mean no title appears more than once.
song_totals_sql = 'SELECT COUNT(1), COUNT(DISTINCT title) FROM songs'
cur.execute(song_totals_sql)
output = cur.fetchone()
print(output)

(71, 71)


In [7]:
# Smallest, largest and mean value of the `duration` column in `songs`.
duration_stats_sql = 'SELECT MIN(duration), MAX(duration), AVG(duration) FROM songs'
cur.execute(duration_stats_sql)
output = cur.fetchone()
print(output)

(Decimal('29.54404'), Decimal('599.24853'), Decimal('239.7296760563380282'))


##### There are a total of 71 songs, with an average playing time of about 4 minutes. The shortest song is about half a minute long and the longest is about 10 minutes

In [9]:
# How many different artist ids the rows in `songs` refer to.
artist_count_sql = 'SELECT COUNT(DISTINCT artist_id) FROM songs'
cur.execute(artist_count_sql)
output = cur.fetchone()
print(output)

(69,)


##### Since each song is created by one and only one artist, the 71 songs in the data were almost all created by different artists (69 distinct artists)

In [11]:
# First: how many songs carry year = 0.
# NOTE(review): 0 looks like the placeholder for a missing year -- confirm
# against the source data.
zero_year_sql = 'SELECT COUNT(1) FROM songs WHERE year = 0'
cur.execute(zero_year_sql)
output = cur.fetchone()
print(output)

# Second: per-year song counts for the remaining rows, oldest year first.
songs_per_year_sql = 'SELECT year, COUNT(1) FROM songs WHERE year != 0 GROUP BY year ORDER BY year'
cur.execute(songs_per_year_sql)
output = cur.fetchall()
print(output)

(43,)
[(1961, 1), (1964, 1), (1969, 1), (1972, 1), (1982, 1), (1984, 1), (1985, 1), (1986, 1), (1987, 1), (1992, 1), (1993, 1), (1994, 2), (1997, 2), (1999, 1), (2000, 2), (2003, 2), (2004, 4), (2005, 2), (2007, 1), (2008, 1)]


##### The data quality in the `year` column is quite bad. About 60% of songs (43 of 71) have no valid year (it is stored as 0). The remaining 28 songs, which do have a valid year, range from the early 1960s to the late 2000s

In [28]:
# Number of distinct non-empty locations recorded for artists.
location_count_sql = "SELECT COUNT(DISTINCT location) FROM artists WHERE location != ''"
cur.execute(location_count_sql)
output = cur.fetchone()
print(output)

# The single most frequent non-empty location and how many artists share it.
top_location_sql = "SELECT location, COUNT(1) FROM artists WHERE location != '' GROUP BY location ORDER BY 2 desc LIMIT 1"
cur.execute(top_location_sql)
output = cur.fetchone()
print(output)

(40,)
('London, England', 2)


##### Of the artists that have valid location data, nearly all are from different places around the world: there are 40 distinct locations, and even the most common one (London, England) has only 2 artists. Nothing interesting here

### Describing Log data

In [29]:
# Size of the play log: total rows plus distinct song titles, artist names
# and user ids.
log_totals_sql = "SELECT COUNT(1), COUNT(DISTINCT song), COUNT(DISTINCT artist), COUNT(DISTINCT userId) FROM songplays_log"
cur.execute(log_totals_sql)
output = cur.fetchone()
print(output)

(6820, 5189, 3148, 96)


#### The log data contains ~6800 plays by 96 users of ~5200 different song titles from ~3150 distinct artists. Note that the dimension tables `songs` and `artists` contain only a small sample of the overall data

In [30]:
# Song title with the highest number of rows in the play log, with its count.
top_title_sql = "SELECT song, COUNT(1) FROM songplays_log GROUP BY song ORDER BY 2 desc LIMIT 1;"
cur.execute(top_title_sql)
output = cur.fetchone()
print(output)

("You're The One", 37)


In [38]:
# Most-played track, returned as (song, artist).
# Plays are grouped by the (song, artist) pair rather than by title alone:
# ranking on the title only would add together plays of different artists'
# songs that happen to share a title, and picking "the" artist for that title
# afterwards would return an arbitrary one if several exist.
cur.execute("""
    SELECT song, artist
    FROM songplays_log
    GROUP BY song, artist
    ORDER BY COUNT(1) DESC
    LIMIT 1
""")
output = cur.fetchone()
print(output)

("You're The One", 'Dwight Yoakam')


#### Dwight Yoakam's You're The One is the most played song

In [41]:
# Per gender: number of distinct users who played something, and number of plays.
# ORDER BY makes the row order deterministic; GROUP BY alone gives no ordering
# guarantee, so the printed list could otherwise change between runs.
cur.execute("""
    SELECT u.gender, COUNT(DISTINCT sp.user_id), COUNT(1)
    FROM songplays sp JOIN users u ON sp.user_id = u.user_id
    GROUP BY u.gender
    ORDER BY u.gender
""")
output = cur.fetchall()
print(output)

[('F', 55, 4887), ('M', 41, 1933)]


#### Female users make up 57% of the user base (55 of 96) and account for a whopping 72% of plays

In [42]:
# Per subscription level (taken from the `users` table): number of distinct
# users who played something, and number of plays.
# ORDER BY makes the row order deterministic; GROUP BY alone gives no ordering
# guarantee, so the printed list could otherwise change between runs.
cur.execute("""
    SELECT u.level, COUNT(DISTINCT sp.user_id), COUNT(1)
    FROM songplays sp JOIN users u ON sp.user_id = u.user_id
    GROUP BY u.level
    ORDER BY u.level
""")
output = cur.fetchall()
print(output)

[('free', 75, 1385), ('paid', 21, 5435)]


#### Free users make up 78% of the user base (75 of 96) but account for only 20% of the plays. Paid users are far more engaged, which could indicate healthy content

In [45]:
# Per weekday value (from the `time` table, matched on start_time): number of
# distinct active users and number of plays.
# ORDER BY keeps the days in numeric order on every run; GROUP BY alone gives
# no ordering guarantee.
cur.execute("""
    SELECT t.weekday, COUNT(DISTINCT sp.user_id), COUNT(1)
    FROM songplays sp JOIN time t ON sp.start_time = t.start_time
    GROUP BY t.weekday
    ORDER BY t.weekday
""")
output = cur.fetchall()
print(output)

[(0, 59, 1014), (1, 57, 1071), (2, 60, 1364), (3, 56, 1052), (4, 63, 1295), (5, 45, 628), (6, 39, 396)]


#### Our users are more engaged during the weekdays than the weekend