In [1]:
import pandas as pd
import numpy as np
import random
import json

# Make the moods JSON arrays survive CSV parsing: the double quotes inside each
# array are swapped for a '?' placeholder, so "["Happy", "Sad"]" becomes
# "[?Happy?,?Sad?]". The placeholder is turned back into '"' after loading.
# NOTE: this rewrites song_data.csv in place.
with open('song_data.csv', 'r+') as f:
    text = f.read()
    text = text.replace('"["', '"[?').replace('", "', '?,?').replace('"]"', '?]"')
    f.seek(0)
    f.write(text)
    # The '", "' -> '?,?' replacement shortens the text, so cut off the stale
    # bytes that would otherwise remain at the end of the file.
    f.truncate()

# Importing data
df = pd.read_csv('song_data.csv', index_col=0)

# Drop the uuid column and any gyro/accel columns. They are selected first and
# dropped in one call so the frame is not mutated while its columns are looped over.
unused_cols = [col for col in df.columns if col == 'uuid' or 'gyro' in col or 'accel' in col]
df = df.drop(columns=unused_cols)

# Every remaining column except id/isSkipped is parsed from JSON text.
for col in df.columns:
    if col in ['id', 'isSkipped']:
        continue
    if col == 'moods':
        # restore the double quotes that were swapped for '?' in the CSV file
        df[col] = df[col].apply(lambda x: x.replace('?', '"'))
    df[col] = df[col].apply(json.loads)

# Initialise a random activity per row. A dedicated, seeded generator is used so
# that Restart & Run All assigns the same labels on every run.
activity_rng = random.Random(42)
df['activity'] = [activity_rng.choice(['Running', 'Walking', 'Working']) for _ in range(len(df))]
df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"[139.64, 138.36, 139.64, 140.28]","[30.50567626953125, 30.50567626953125, 30.5056...","[71.3134765625, 71.3134765625, 71.3134765625]","[Depressive, Atmospheric]",0,Working
2,"[123.24, 123.24, 139.32, 228.64]","[31.69403076171875, 31.69403076171875, 31.6940...","[67.05322265625, 67.05322265625, 67.0532226562...",[Depressive],0,Walking
3,"[256.08, 307.84000000000003, 315.2, 301.36, 30...","[32.21771240234375, 32.21771240234375, 32.2177...","[65.850830078125, 65.850830078125, 65.85083007...","[Passionate, Depressive]",0,Walking
4,"[127.08, 126.76, 125.48, 124.52]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...",[Elegant],1,Running
5,"[145.76, 144.48, 146.4, 144.8]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...","[Passionate, Depressive]",1,Walking


In [2]:
# Filtering defective data
#
# A temp reading of -40 and a humidity reading above 99.99 are treated as
# defective. Defective readings are removed from each row's list; a row that
# has no valid temp readings or no valid humidity readings left is dropped.
#
# The cleaned lists are built as new Series instead of being assigned to the
# `row` objects yielded by DataFrame.iterrows(): those are copies, so writing
# to them never reaches the frame.
TEMP_DEFECT = -40
HUMIDITY_LIMIT = 99.99

clean_temp = df['temp'].apply(lambda vals: [k for k in vals if k != TEMP_DEFECT])
# `not k > limit` keeps a reading of exactly 99.99, matching the ">99.99" defect rule
clean_humidity = df['humidity'].apply(lambda vals: [k for k in vals if not k > HUMIDITY_LIMIT])

# Defective rows: nothing valid remains in temp, or nothing valid remains in humidity.
is_defective = (clean_temp.apply(len) == 0) | (clean_humidity.apply(len) == 0)
defective_ids = df.index[is_defective.values].tolist()

# df itself is left untouched; filtered_df carries the cleaned lists.
filtered_df = df.assign(temp=clean_temp, humidity=clean_humidity).loc[~is_defective].copy()
print('Defective row ids are: ',defective_ids)

Defective row ids are:  [21, 22, 33, 214, 236, 238, 245, 246, 247, 248, 249, 250, 251, 252]


In [3]:
# Obtain mean optical, temp and humidity values

# Columns that are not reduced to a mean; every other column of df is.
non_sensor_cols = ['moods', 'isSkipped', 'activity']
sensor_cols = [name for name in df.columns if name not in non_sensor_cols]

for sensor_col in sensor_cols:
    # replace each cell with the np.mean of its contents
    filtered_df[sensor_col] = filtered_df[sensor_col].apply(np.mean)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0,Working
2,153.61,31.694031,67.053223,[Depressive],0,Walking
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0,Walking
4,125.96,32.429199,64.672852,[Elegant],1,Running
5,145.36,32.429199,64.672852,"[Passionate, Depressive]",1,Walking


In [4]:
# One-hot encoding for moods

# Sorted array of every distinct mood label found in the 'moods' lists.
moods = np.unique([label for labels in filtered_df['moods'] for label in labels])
for mood in moods:
    # Exact membership test on each row's list. Matching the stringified list
    # with str.contains would treat the label as a regex and also match labels
    # that merely contain this one as a substring.
    filtered_df[mood] = filtered_df['moods'].apply(lambda labels, mood=mood: mood in labels)
filtered_df = filtered_df.drop(columns='moods')

filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,isSkipped,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,139.48,30.505676,71.313477,0,Working,False,False,True,False,True,False,False,False
2,153.61,31.694031,67.053223,0,Walking,False,False,False,False,True,False,False,False
3,297.792,32.217712,65.85083,0,Walking,False,False,False,False,True,False,True,False
4,125.96,32.429199,64.672852,1,Running,False,False,False,False,False,True,False,False
5,145.36,32.429199,64.672852,1,Walking,False,False,False,False,True,False,True,False


In [5]:
# Invert mood boolean values based on "isSkipped"

skipped = filtered_df['isSkipped']
for mood in moods:
    # |mood - isSkipped|: with isSkipped in {0, 1} the flag is kept for 0 and flipped for 1
    filtered_df[mood] = (filtered_df[mood] - skipped).abs()
filtered_df.drop(columns='isSkipped', inplace=True)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,139.48,30.505676,71.313477,Working,0,0,1,0,1,0,0,0
2,153.61,31.694031,67.053223,Walking,0,0,0,0,1,0,0,0
3,297.792,32.217712,65.85083,Walking,0,0,0,0,1,0,1,0
4,125.96,32.429199,64.672852,Running,1,1,1,1,1,0,1,1
5,145.36,32.429199,64.672852,Walking,1,1,1,1,0,1,0,1


In [7]:
!pip install pycaret

Collecting pycaret

  ERROR: Command errored out with exit status 1:
   command: 'C:\Python310\python.exe' 'C:\Users\rage8\AppData\Local\Temp\pip-standalone-pip-1owsyhid\__env_pip__.zip\pip' install --ignore-installed --no-user --prefix 'C:\Users\rage8\AppData\Local\Temp\pip-build-env-_19y0qe9\overlay' --no-warn-script-location --no-binary :none: --only-binary :none: -i https://pypi.org/simple -- wheel setuptools 'Cython>=0.29.18' 'numpy==1.14.5; python_version=='"'"'3.6'"'"' and platform_system!='"'"'AIX'"'"'' 'numpy==1.14.5; python_version=='"'"'3.7'"'"' and platform_system!='"'"'AIX'"'"'' 'numpy==1.17.3; python_version>='"'"'3.8'"'"' and platform_system!='"'"'AIX'"'"'' 'numpy==1.16.0; python_version=='"'"'3.6'"'"' and platform_system=='"'"'AIX'"'"'' 'numpy==1.16.0; python_version=='"'"'3.7'"'"' and platform_system=='"'"'AIX'"'"'' 'numpy==1.17.3; python_version>='"'"'3.8'"'"' and platform_system=='"'"'AIX'"'"'' 'pybind11>=2.4.3'
       cwd: None
  Complete output (2475 lines):
  Ignoring numpy: markers 


  Using cached pycaret-2.3.4-py3-none-any.whl (266 kB)
Collecting seaborn
  Using cached seaborn-0.11.2-py3-none-any.whl (292 kB)
Collecting cufflinks>=0.17.0
  Using cached cufflinks-0.17.3-py3-none-any.whl
Collecting numpy==1.19.5
  Using cached numpy-1.19.5.zip (7.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing wheel metadata (pyproject.toml): started
  Preparing wheel metadata (pyproject.toml): finished with status 'done'
Collecting imbalanced-learn==0.7.0
  Using cached imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting numba<0.54
  Using cached numba-0.51.2.tar.gz (2.1 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyLDAvis
  Using cached 


    Preparing metadata (setup.py): started
    Preparing metadata (setup.py): finished with status 'done'
  Collecting pybind11>=2.4.3
    Using cached pybind11-2.8.0-py2.py3-none-any.whl (207 kB)
  Building wheels for collected packages: numpy
    Building wheel for numpy (setup.py): started
    Building wheel for numpy (setup.py): finished with status 'error'
    ERROR: Command errored out with exit status 1:
     command: 'C:\Python310\python.exe' -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\rage8\\AppData\\Local\\Temp\\pip-install-_5v1lcc6\\numpy_38b0c38a9f3f4eee969ff2aa37fabada\\setup.py'"'"'; __file__='"'"'C:\\Users\\rage8\\AppData\\Local\\Temp\\pip-install-_5v1lcc6\\numpy_38b0c38a9f3f4eee969ff2aa37fabada\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code

    _configtest.obj : error LNK2001: unresolved external symbol hypotl
    _configtest.obj : error LNK2001: unresolved external symbol fmodl
    _configtest.obj : error LNK2001: unresolved external symbol tanl
    _configtest.obj : error LNK2001: unresolved external symbol fabsl
    _configtest.obj : error LNK2001: unresolved external symbol cosl
    _configtest.obj : error LNK2001: unresolved external symbol atan2l
    _configtest.exe : fatal error LNK1120: 23 unresolved externals
    failure.
    removing: _configtest.c _configtest.obj _configtest.obj.d
    C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Inumpy\core\src\common -Inumpy\core\src -Inumpy\core -Inumpy\core\src\npymath -Inumpy\core\src\multiarray -Inumpy\core\src\umath -Inumpy\core\src\npysort -IC:\Python310\include -IC:\Python310\Include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30

    failure.
    removing: _configtest.c _configtest.obj _configtest.obj.d
    C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Inumpy\core\src\common -Inumpy\core\src -Inumpy\core -Inumpy\core\src\npymath -Inumpy\core\src\multiarray -Inumpy\core\src\umath -Inumpy\core\src\npysort -IC:\Python310\include -IC:\Python310\Include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\ATLMFC\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\include -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\shared -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\winrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\cppwinrt /Tc_configtest.c /Fo_configtes

    blis_info:
    No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
    customize MSVCCompiler
      libraries blis not found in ['C:\\Python310\\lib', 'C:\\', 'C:\\Python310\\libs']
      NOT AVAILABLE
  
    openblas_info:
    No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
    customize MSVCCompiler
    No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
    customize MSVCCompiler
      libraries openblas not found in ['C:\\Python310\\lib', 'C:\\', 'C:\\Python310\\libs']
    get_default_fcompiler: matching types: '['gnu', 'intelv', 'absoft', 'compaqv', 'intelev', 'gnu95', 'g95', 'intelvem', 'intelem', 'flang']'
    customize GnuFCompiler
    Could not locate executable g77
    Could not locate executable f77
    customize IntelVisualFCompiler
    Could not locate executable ifort
    Could not locate executable ifl
    customize AbsoftFCompiler
    Could not locat

    customize AbsoftFCompiler
    Could not locate executable f90
    customize CompaqVisualFCompiler
    Could not locate executable DF
    customize IntelItaniumVisualFCompiler
    Could not locate executable efl
    customize Gnu95FCompiler
    Could not locate executable gfortran
    Could not locate executable f95
    customize G95FCompiler
    Could not locate executable g95
    customize IntelEM64VisualFCompiler
    customize IntelEM64TFCompiler
    Could not locate executable efort
    Could not locate executable efc
    customize PGroupFlangCompiler
    Could not locate executable flang
    don't know how to compile Fortran code on platform 'nt'
      NOT AVAILABLE
  
    atlas_3_10_blas_threads_info:
    Setting PTATLAS=ATLAS
    No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
    customize MSVCCompiler
      libraries tatlas not found in ['C:\\Python310\\lib', 'C:\\', 'C:\\Python310\\libs']
      NOT AVAILABLE
  
    atlas_3_10_blas_

      Could not locate executable DF
      customize IntelItaniumVisualFCompiler
      Could not locate executable efl
      customize Gnu95FCompiler
      Could not locate executable gfortran
      Could not locate executable f95
      customize G95FCompiler
      Could not locate executable g95
      customize IntelEM64VisualFCompiler
      customize IntelEM64TFCompiler
      Could not locate executable efort
      Could not locate executable efc
      customize PGroupFlangCompiler
      Could not locate executable flang
      don't know how to compile Fortran code on platform 'nt'
        NOT AVAILABLE
  
      atlas_3_10_blas_threads_info:
      Setting PTATLAS=ATLAS
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries tatlas not found in ['C:\\Python310\\lib', 'C:\\', 'C:\\Python310\\libs']
        NOT AVAILABLE
  
      atlas_3_10_blas_info:
      No module named 'numpy.distutils._msvccompil

    copying numpy\lib\mixins.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\nanfunctions.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\npyio.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\polynomial.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\recfunctions.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\scimath.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\setup.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\shape_base.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\stride_tricks.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\twodim_base.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\type_check.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\ufunclike.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\user_array.py -> build\lib.win-amd64-3.10\numpy\lib
    copying numpy\lib\utils.py -> build\lib.win-

    C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\bin\HostX86\x64\link.exe /nologo /INCREMENTAL:NO /LTCG /MANIFEST:EMBED,ID=1 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\ATLMFC\lib\x64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\lib\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\ucrt\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\um\x64 _configtest.obj /OUT:_configtest.exeIOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

