In [1]:
import os
import sys
import pandas as pd
import csv
import json
import sqlite3
import logging

## Parameters you can change

# Abs path to settings file
root = "C:/DS/Github/MusicRecommendation"  # BA, Windows

# Select the dataset to view
datasetToUse ="inputfile" # inputfile , inputfile_sml , or inputfile_verysml

# File that receives log output (rows that fail to insert are reported here).
# Raw string: the backslashes are Windows path separators, not escape sequences.
logFilePath = r'C:\DS\Github\example.log'


## Finish setting up
os.chdir(root)

## Add the prelim module: make the project's 1_codebase folder importable
fPath = root + "/1_codebase"
if fPath not in sys.path:
    sys.path.append(fPath)
# NOTE(review): the star import supplies loadSettings (and possibly other names
# used elsewhere in the notebook); prefer explicit imports once the full list is known.
from codebase import *

# Resolve the dataset and database locations relative to the project root.
settingsDict = loadSettings()
inputFilePath = root + settingsDict[datasetToUse]
dbPath = root + settingsDict['dbPath']

logging.basicConfig(filename=logFilePath, level=logging.DEBUG)

def dropCreateTable(dbCursor,tblName, tblSelect):
    """Rebuild a table from a SELECT statement.

    Drops ``tblName`` if it already exists, then recreates it with
    ``CREATE TABLE <tblName> AS <tblSelect>``.

    Parameters
    ----------
    dbCursor : sqlite3.Cursor
        Cursor on which both statements are executed.
    tblName : str
        Name of the table to (re)create. It is interpolated directly into the
        SQL text (identifiers cannot be bound as parameters), so it must come
        from trusted code, never from user input.
    tblSelect : str
        Complete SELECT statement that defines the table's columns and rows.
    """
    # Use the cursor that was passed in rather than relying on a notebook-level
    # global that happens to share a name with the caller's variable.
    dbCursor.execute('DROP TABLE IF EXISTS {}'.format(tblName))
    dbCursor.execute("CREATE TABLE {}".format(tblName) + " AS " + tblSelect)

<a id='Load Data'></a>
<h3 style="background-color:#616161;color:white">Load Data Into SQLite3</h3>

In [2]:
# Load the tab-separated input file into tblInputData and (re)create an empty tblMain.
colHeadings=['UserID','PlayedTimestamp','ArtistID','ArtistName','TrackID','TrackName']
dataFormat={'UserID': str, 'PlayedTimestamp': str, 'ArtistID' :str, 'ArtistName': str, 'TrackID': str, 'TrackName': str}
newFields = 'FirstPlayed, MinsSinceFirstPlay integer, MinsSincePrevPlay integer, historyID integer'
tblMain_SQL = 'CREATE TABLE tblMain (UserID integer, PlayedTimestamp text, ArtistID text, ArtistName text, TrackID text, TrackName text, ' + newFields +')'

parse_dates = ['PlayedTimestamp']

db = sqlite3.connect(dbPath)
try:
    # Load data from CSV. The file has no header row; malformed lines are skipped
    # and quote characters are treated as ordinary text (QUOTE_NONE).
    # NOTE: pandas >= 1.3 spells error_bad_lines=False as on_bad_lines='skip', and
    # pandas 2.0 removed error_bad_lines; switch the keyword when upgrading pandas.
    inpData = pd.read_csv(inputFilePath, sep='\t', error_bad_lines= False,quoting=csv.QUOTE_NONE, header=None,names=colHeadings, dtype=dataFormat,parse_dates=parse_dates)

    # Replace any previous copy of the raw table. The former flavor='sqlite'
    # argument is omitted: it was the default for sqlite3 connections and newer
    # pandas releases no longer accept it. The remaining arguments that were
    # passed explicitly as None are the documented defaults.
    inpData.to_sql('tblInputData', db, if_exists='replace', index=True)

    # Create tblMain (empty; it is populated in a later cell)
    c=db.cursor()
    sqlStr= 'DROP TABLE IF EXISTS tblMain'
    c.execute(sqlStr)
    c.execute(tblMain_SQL)
finally:
    # Always release the database file, even if loading fails part-way.
    db.close()

<a id='Preprocessing'></a>
<h3 style="background-color:#616161;color:white">Preprocessing</h3>

<h4 style="background-color:#616161;color:white">Create aggregated tables</h4>

In [3]:
# Rebuild the two aggregate tables derived from tblInputData.
db = sqlite3.connect(dbPath)
try:
    c = db.cursor()

    # Per user and calendar date: number of plays and number of distinct tracks.
    # The integer user id is taken from the last five characters of the raw userID.
    dropCreateTable(c,'tblAgg_UserDailyPlays', 'SELECT Cast(substr(userID,-5) as integer) as userID, date(PlayedTimestamp),count(*) as NumOfPlays, count(Distinct trackID) as NumOfTracks from tblInputData group by userID, date(PlayedTimestamp) ORDER BY NumOfPlays')

    # Per user: the earliest PlayedTimestamp, exposed as FirstPlay.
    dropCreateTable(c,'tblAgg_User', 'SELECT Cast(substr(userID,-5) as integer) as userID, min(PlayedTimestamp) as FirstPlay from tblInputData Group by Cast(substr(userID,-5) as integer)')
finally:
    # Close the connection even when one of the statements fails, so a failed
    # run does not leave the database file held open by the kernel.
    db.close()

<h4 style="background-color:#616161;color:white">Load data into tblMain</h4>

In [4]:
# Populate tblMain: one row per play, ordered per user by time, with
#   historyID         - 1-based sequence number of the play within the user
#   MinsSincePrevPlay - minutes since the same user's previous play
#                       (for a user's first play: its MinsSinceFirstPlay value)
sqlStr ="SELECT CAST(substr(tblInputData.userID,-5) as integer) as UserID, (strftime('%s',tblInputData.PlayedTimestamp) - strftime('%s',tblAgg_User.FirstPlay))/60 as MinsSinceFirstPlay, tblAgg_User.FirstPlay, PlayedTimestamp, ArtistID, ArtistName, TrackID, TrackName from tblInputData Inner join tblAgg_User ON Cast(substr(tblInputData.userID,-5) as integer) = tblAgg_User.userID order by UserID,MinsSinceFirstPlay"

# Values are bound through placeholders so that quotes, backslashes and other
# special characters in artist/track names are stored exactly as read.
insertStr = ("Insert into tblMain (userID, MinsSinceFirstPlay, FirstPlayed, PlayedTimeStamp, "
             "ArtistID, ArtistName, TrackID, TrackName,historyID, MinsSincePrevPlay) "
             "Values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

db = sqlite3.connect(dbPath)
try:
    c = db.cursor()   # reads the ordered source rows
    d = db.cursor()   # writes into tblMain

    rowCount = 1
    userID = None     # None cannot collide with a real user id (0 could)
    prevMins = None   # MinsSinceFirstPlay of the current user's previous play

    for row in c.execute(sqlStr):
        if row[0] != userID:
            # Start of a new user
            userID = int(row[0])
            rowCount = 1
            prevMins = None

        mins = row[1]
        if mins is None:
            # Timestamp could not be converted by strftime; no gap can be computed.
            MinsSincePrevPlay = None
        elif prevMins is None:
            MinsSincePrevPlay = mins
        else:
            MinsSincePrevPlay = mins - prevMins
        if mins is not None:
            prevMins = mins

        values = list(row) + [rowCount, MinsSincePrevPlay]
        # Missing values are written as the text 'Null', as this cell has always done.
        # NOTE(review): downstream queries may compare against 'Null'; confirm
        # before switching to real SQL NULLs.
        params = tuple('Null' if v is None else v for v in values)

        try:
            d.execute(insertStr, params)
            rowCount += 1
        except sqlite3.Error as exc:
            # Best effort: report the row that could not be stored and carry on.
            logging.warning(str(row))
            logging.warning("%s -- %s", insertStr, exc)

    db.commit()
finally:
    # Uncommitted work is rolled back if the loop failed; either way the
    # database file is released.
    db.close()

In [5]:
# Safety net: close the connection bound to `db`, e.g. after a cell above was
# interrupted or failed before reaching its own close call.
db.close()