<h1 align="center" style="background-color:#616161;color:white">Scikit-Learn models</h1>

Contains:
- Baseline model
- Linear SVM
- RBF SVM
- Logistic Regression

<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Input Parameters</div>

In [3]:
# Root path
#root = "C:/DS/Github/MusicRecommendation"  # BA, Windows
root = "/home/badrul/git/EventPrediction" # BA, Linux

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Common Libraries</div>

In [4]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging
from random import *

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
import random
import importlib
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(filename='RNN.log',level=logging.DEBUG)

#-------------- Custom Libs -----------------#
os.chdir(root)

# Import the codebase module
fPath = root + "/1_Codemodule"
if fPath not in sys.path: sys.path.append(fPath)

# Custom Libs
import coreCode as cc
import lastfmCode as fm
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.metrics import recall_score

print ('Ok')

Ok


<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Page Specific Libraries</div>

In [3]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.python.framework import ops
ops.reset_default_graph()
from sklearn import metrics
from sklearn import preprocessing
print ('Ok')

Ok


<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Load settings</div>

In [4]:
settingsDict =  cc.loadSettings()
dbPath = root + settingsDict['mainDbPath_sml']
fmSimilarDbPath = root + settingsDict['fmSimilarDbPath']
fmTagsDbPath = root + settingsDict['fmTagsDbPath']
trackMetaDbPath = root + settingsDict['trackmetadata']
periodGranularity = int(settingsDict['periodGranularity'])
print ('Ok')

Ok


<h3 style="background-color:#616161;color:white">1. Define Models</h3>

In [1]:
def getData(fieldList,tblName):
    con = sqlite3.connect(dbPath)
    c = con.cursor()
    # Get list of UserIDs
    _df = pd.read_sql_query("Select {} from {}".format(fieldList,tblName),con)
    _x = _df.drop(['t'], 1).values
    _y = _df['t'].values.astype(int)
    con.close()
    return _x, _y

def getSample(_x,_y, _sampleSize):
    idx = np.random.choice(np.arange(len(_x)), _sampleSize, replace=False)
    _xSample = x[idx]
    _ySample = y[idx]
    return _xSample,_ySample

In [35]:
def Baseline(x,y):

    print('\n--------------------------------------------------------')
    print('3. Baseline')
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))
    prec= np.zeros(5)
    rec = np.zeros(5)
    
    i=0
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        
        pred = xTest[:,0]
        prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary') 
        
        #print (metrics.classification_report(yTest,pred))
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return

In [36]:
def RBFKernel(x,y):

    print('\n--------------------------------------------------------')
    print('3. SVM- RBF Kernel')
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = svm.SVC(kernel='rbf', C=1, random_state=0)
    # Increase weight where t-1 is 0 and t is 1
    sampleWeights =  1+(y[:] == 1) * (x[:,1] ==0)
    scoring = ['precision_macro', 'recall_macro']
    scores = cross_validate(clf, x, y, scoring=scoring,
    cv=5, return_train_score=False, n_jobs=-1,fit_params={'sample_weight': sampleWeights})
    #cv=5, return_train_score=False, n_jobs=-1)
    

    pMn=round(scores['test_precision_macro'].mean(),3)
    pSd=round(scores['test_precision_macro'].std(),3)
    rMn=round(scores['test_recall_macro'].mean(),3)
    rSd=round(scores['test_recall_macro'].std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    #predicted = cross_val_predict(clf, x, y, cv=5, n_jobs=-1)
    #print(metrics.roc_auc_score(y,predicted))  
    #print(metrics.classification_report(y,predicted))  # Need to feed it yTest not yTest_OneHot here
    return scores

In [5]:
def LinearKernel(x,y,weighted):

    print('\n--------------------------------------------------------')
    print('3. Linear Kernel (weighted = {})'.format(weighted))
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = svm.SVC(kernel='linear', C=1, random_state=0)
    # Increase weight where t-1 is 0 and t is 1
    
    prec= np.zeros(5)
    rec = np.zeros(5)
    
    i=0
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        
        clf.fit(xTrain,yTrain)
        pred = clf.predict(xTest)
        
        if weighted:
            sampleWeights =  1+(yTest[:] == 1) * (xTest[:,1] ==0)
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary',sample_weight=sampleWeights) 
        else:
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary') 
        
        #print (metrics.classification_report(yTest,pred))
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    coeffs = np.reshape(np.round(clf.coef_,5),(-1,1))
    coeffs=np.concatenate((np.reshape(fieldList.split(',')[1:],(-1,1)),coeffs),axis=1)
    print(pd.DataFrame(coeffs,columns=['Field','Coeff']))
    
    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return

In [44]:
def RBFKernel(x,y,weighted):

    print('\n--------------------------------------------------------')
    print('4. RBF Kernel (weighted = {})'.format(weighted))
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = svm.SVC(kernel='rbf', C=1, random_state=0)
    # Increase weight where t-1 is 0 and t is 1
    
    prec= np.zeros(5)
    rec = np.zeros(5)
    
    i=0
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        
        clf.fit(xTrain,yTrain)
        pred = clf.predict(xTest)
        
        if weighted:
            sampleWeights =  1+(yTest[:] == 1) * (xTest[:,1] ==0)
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary',sample_weight=sampleWeights) 
        else:
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary') 
        
        #print (metrics.classification_report(yTest,pred))
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))
    
    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return

In [39]:
def LogisticModel(x,y,weighted):

    print('\n--------------------------------------------------------')
    print('4. Logistic Model (weighted = {})'.format(weighted))
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = LogisticRegression(C=1,class_weight ='balanced')
    # Increase weight where t-1 is 0 and t is 1
    sampleWeights =  1+(y[:] == 1) * (x[:,1] ==0)
    k=5
    prec= np.zeros(k)
    rec = np.zeros(k)
    i=0
    
    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        clf.fit(xTrain,yTrain)
        pred = clf.predict(xTest)
        
        if weighted:
            sampleWeights =  1+(yTest[:] == 1) * (xTest[:,1] ==0)
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary',sample_weight=sampleWeights)
            #print (metrics.classification_report(yTest,pred))
        else:
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary')
            #print (metrics.classification_report(yTest,pred))
        
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    coeffs = np.reshape(np.round(clf.coef_,5),(-1,1))
    coeffs=np.concatenate((np.reshape(fieldList.split(',')[1:],(-1,1)),coeffs),axis=1)
    print(pd.DataFrame(coeffs,columns=['Field','Coeff']))
    
    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return

In [46]:
sampleSize = 100000
fieldList="t, t1,t2, t3,t4,t5 t23_5hrs,t24hrs,t24_5hrs, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat"
#fieldList="t,t1,1"
x,y = getData(fieldList,'tblTimeSeriesData')
x,y = getSample(x,y,sampleSize)

# Run modesl -- uncomment as needed

#Baseline(x,y)

#LinearKernel(x,y,False)
#LinearKernel(x,y,True)

LogisticModel2(x,y,False)
LogisticModel(x,y,True)

#RBFKernel2(x,y,False)
#RBFKernel(x,y,True)



--------------------------------------------------------
3. Linear Kernel (weighted = True)
--------------------------------------------------------
Start time 2017-08-29 12:24:34.910695
Av. precision 0.0 +/- 0.0, Av. recall 0.0+/0.0,
          Field   Coeff
0            t1     0.0
1            t2     0.0
2            t3     0.0
3            t4     0.0
4   t5 t23_5hrs  -4e-05
5        t24hrs   8e-05
6      t24_5hrs  -6e-05
7    HrsFrom5pm  -1e-05
8         isSun  -1e-05
9         isMon   5e-05
10        isTue  -2e-05
11        isWed   1e-05
12        isThu   2e-05
13        isFri  -7e-05
14        isSat   3e-05
Time elpased (hh:mm:ss.ms) 0:00:31.068724

--------------------------------------------------------
4. Logistic Model (weighted = True)
--------------------------------------------------------
Start time 2017-08-29 12:25:05.979874
Av. precision 0.069 +/- 0.008, Av. recall 0.512+/0.029,
          Field     Coeff
0            t1       0.0
1            t2       0.0
2            t3