# "Hallelujah Effect" Analysis

This notebook models the "Hallelujah Effect" in terms of all basic features available in the dataset for those subjects that listened to the song and had an EDA quality >80%.

In [140]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np
import shutil
import pandas as pd

In [7]:
# Set bucket, project, and region
BUCKET = 'eim-muse'
PROJECT = 'eim-muse'
REGION = 'us-central1'

In [8]:
import os
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

In [9]:
%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

Updated property [core/project].
Updated property [compute/region].


## Retrieve and Subset Datasource

Get data from BigQuery but defer filtering, etc. to Beam. Data in BigQuery has been pre-processed with Dataprep.

In [10]:
import google.datalab.bigquery as bq
def create_query(phase, EVERY_N):
  """
  phase: 1=train 2=valid
  """
  base_query = """
SELECT *
FROM
  `eim-muse.hallelujah_effect.full_hallelujah_trials_cleaned`
  """

  if EVERY_N == None:
    if phase < 2:
      # Training
      query = "{0} WHERE MOD(FARM_FINGERPRINT(id), 10) < 7".format(base_query)
    else:
      # Validation
      query = "{0} WHERE MOD(FARM_FINGERPRINT(id), 10) >= 7".format(base_query)
  else:
      query = "{0} WHERE MOD(FARM_FINGERPRINT(id), {1}) = {2}".format(base_query, EVERY_N, phase)
    
  return query

query = create_query(1, None)

In [11]:
df_valid = bq.Query(query).execute().result().to_dataframe()
df_valid.head()
df_valid.describe()

Unnamed: 0,age,concentration,musical_expertise,artistic,fault,imagination,lazy,nervous,outgoing,reserved,...,music_pref_none,music_pref_hiphop,music_pref_dance,music_pref_world,music_pref_rock,music_pref_pop,music_pref_classical,music_pref_jazz,music_pref_folk,music_pref_traditional_irish
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,...,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,24.726073,3.9935,2.547224,2.344099,3.175612,3.852643,3.690403,3.629726,3.213016,3.145503,...,0.006601,0.138614,0.188119,0.132013,0.432343,0.673267,0.306931,0.171617,0.089109,0.059406
std,13.931034,0.795258,1.009324,0.988361,0.891464,0.82162,0.905895,0.875898,0.961299,0.891699,...,0.08111,0.346115,0.391454,0.339065,0.496221,0.469794,0.461983,0.377671,0.285372,0.236774
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16.0,3.991266,2.0,2.0,3.0,3.824561,3.659389,3.596491,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,21.0,3.991266,2.52988,2.353712,3.144737,3.824561,3.659389,3.596491,3.22807,3.117904,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,31.0,4.0,3.0,2.353712,4.0,4.0,4.0,4.0,4.0,4.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
max,121.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
train_query = create_query(1, None)
train_n = len(list(bq.Query(train_query).execute().result()))

eval_query = create_query(2, None)
eval_n = len(list(bq.Query(eval_query).execute().result()))

os.environ['TRAIN_N'] = str(train_n)
os.environ['EVAL_N'] = str(eval_n)

print('{} training examples / {} evaluation examples'.format(train_n, eval_n))

303 training examples / 61 evaluation examples


In [13]:
df_valid.columns

Index([u'id', u'age', u'concentration', u'hearing_impairments',
       u'musical_expertise', u'nationality', u'artistic', u'fault',
       u'imagination', u'lazy', u'nervous', u'outgoing', u'reserved',
       u'stress', u'thorough', u'trusting', u'activity', u'engagement',
       u'familiarity', u'like_dislike', u'positivity', u'tension', u'sex',
       u'hallelujah_reaction', u'location', u'language', u'music_pref_none',
       u'music_pref_hiphop', u'music_pref_dance', u'music_pref_world',
       u'music_pref_rock', u'music_pref_pop', u'music_pref_classical',
       u'music_pref_jazz', u'music_pref_folk',
       u'music_pref_traditional_irish'],
      dtype='object')

In [14]:
df_train = bq.Query(train_query).execute().result().to_dataframe()
df_train.head()
df_train.describe()

Unnamed: 0,age,concentration,musical_expertise,artistic,fault,imagination,lazy,nervous,outgoing,reserved,...,music_pref_none,music_pref_hiphop,music_pref_dance,music_pref_world,music_pref_rock,music_pref_pop,music_pref_classical,music_pref_jazz,music_pref_folk,music_pref_traditional_irish
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,...,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,24.726073,3.9935,2.547224,2.344099,3.175612,3.852643,3.690403,3.629726,3.213016,3.145503,...,0.006601,0.138614,0.188119,0.132013,0.432343,0.673267,0.306931,0.171617,0.089109,0.059406
std,13.931034,0.795258,1.009324,0.988361,0.891464,0.82162,0.905895,0.875898,0.961299,0.891699,...,0.08111,0.346115,0.391454,0.339065,0.496221,0.469794,0.461983,0.377671,0.285372,0.236774
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16.0,3.991266,2.0,2.0,3.0,3.824561,3.659389,3.596491,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,21.0,3.991266,2.52988,2.353712,3.144737,3.824561,3.659389,3.596491,3.22807,3.117904,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,31.0,4.0,3.0,2.353712,4.0,4.0,4.0,4.0,4.0,4.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
max,121.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
df_eval = bq.Query(eval_query).execute().result().to_dataframe()
df_eval.head()
df_eval.describe()

Unnamed: 0,age,concentration,musical_expertise,artistic,fault,imagination,lazy,nervous,outgoing,reserved,...,music_pref_none,music_pref_hiphop,music_pref_dance,music_pref_world,music_pref_rock,music_pref_pop,music_pref_classical,music_pref_jazz,music_pref_folk,music_pref_traditional_irish
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,24.016393,3.98017,2.443733,2.40146,2.991372,3.685073,3.505333,3.431406,3.302847,2.980815,...,0.016393,0.114754,0.213115,0.147541,0.409836,0.704918,0.327869,0.213115,0.04918,0.065574
std,11.491289,0.805997,0.985933,0.828365,0.861184,0.910025,0.947435,0.848395,0.881794,0.878552,...,0.128037,0.32137,0.412907,0.357588,0.495885,0.459865,0.473333,0.412907,0.218039,0.24959
min,5.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,3.991266,2.0,2.0,3.0,3.824561,3.659389,3.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,21.0,3.991266,2.52988,2.353712,3.144737,3.824561,3.659389,3.596491,3.22807,3.117904,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,32.0,4.0,3.0,3.0,3.144737,4.0,4.0,4.0,4.0,3.117904,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
max,56.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [82]:
def preprocess_df(df, scaling_dict=None):

  updated_df = df.drop(columns=['id', 'hallelujah_reaction'])

  updated_df = pd.concat(
    [
      updated_df.drop(columns=['location', 'nationality', 'sex', 'language']), 
      pd.get_dummies(updated_df['location'], prefix='location'),
      pd.get_dummies(updated_df['nationality'], prefix='nationality'),
      pd.get_dummies(updated_df['sex'], prefix='sex'),
      pd.get_dummies(updated_df['language'], prefix='language')
    ], 
    axis=1
  )

  updated_df = updated_df.apply(pd.to_numeric)

  if scaling_dict is None:
    scaling_dict = {}
    for column in updated_df.columns:
      scaling_dict[column] = {
        'max': updated_df[column].max(),
        'min': updated_df[column].min()
      }
  
  for column in updated_df.columns:
    if column not in scaling_dict.keys():
      updated_df = updated_df.drop(columns=[column])
    else:
      updated_df[column] = np.subtract(updated_df[column], scaling_dict[column]['min'], dtype=np.float32)
      updated_df[column] = updated_df[column] / np.subtract(scaling_dict[column]['max'], scaling_dict[column]['min'], dtype=np.float32)

  return updated_df, scaling_dict

In [99]:
df_train_X, df_train_scaling_dict = preprocess_df(df_train)
df_train_y = df_train['hallelujah_reaction']
df_eval_X, _ = preprocess_df(df_eval, scaling_dict=df_train_scaling_dict)
df_eval_y = df_eval['hallelujah_reaction']

# Remove columns from training data that do not exist in eval data
all_columns = list(set(list(df_train_X.columns) + list(df_eval_X.columns)))
good_columns = [column for column in all_columns if column in df_train_X.columns and column in df_eval_X.columns]
df_train_X = df_train_X[good_columns]
df_eval_X = df_eval_X[good_columns]

In [137]:
clf = SVC(C=10000., probability=True, class_weight='balanced', verbose=True)
clf.fit(df_train_X, df_train_y)

[LibSVM]

SVC(C=10000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [138]:
eval_true = df_eval_y
eval_pred = clf.predict(df_eval_X)

f1_score(eval_true, eval_pred)

0.41025641025641024

In [144]:
tn, fp, fn, tp = confusion_matrix(eval_true, eval_pred).ravel()
print("True negatives: {}".format(tn))
print("True positives: {}".format(tp))
print("False negatives: {}".format(fn))
print("False positives: {}".format(fp))

True negatives: 30
True positives: 8
False negatives: 12
False positives: 11
