# Data Wrangle

#### DO NOT RUN IT WITHOUT GOING THROUGH THE FULL FILE

### Libraries used in the test

In [13]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [14]:
import os
import glob
import json

import boto3
import configparser
import matplotlib.pyplot as plt
import pandas as pd
from time import time
import numpy as np
import html
import re 

In [15]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
KEY=config.get('AWS','key')
SECRET= config.get('AWS','secret')

DWH_DB= config.get("CLUSTER","DWH_DB")
DWH_DB_USER= config.get("CLUSTER","DWH_DB_USER")
DWH_DB_PASSWORD= config.get("CLUSTER","DWH_DB_PASSWORD")
DWH_PORT = config.get("CLUSTER","DWH_PORT")

## Gather

In [16]:
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                     )

sampleDbBucket =  s3.Bucket("udacity-dend")

In [17]:
#Functions
def download_files_from_s3(prefix, limit):
    """
    Summary line. 
    Download files from AWS_S3
  
    Parameters: 
    arg1 (prefix of files to be downloaded)
    arg2 (number of files to be downloaded)
  
    Returns: 
    None
    """    
    i = 0
    files = []    
    tenPercentOfTotal = round(limit/10)
    
    for obj in sampleDbBucket.objects.filter(Prefix=prefix):
        if(len(files) % tenPercentOfTotal == 0 and i>1):
            print('Downloaded {}/{}'.format(len(files), limit) )
            
        if i>limit:
            break
        if obj.key.rsplit('/', 1)[1] != '':        
            folder = obj.key.rsplit('/', 1)[0]
            file= obj.key.rsplit('/', 1)[1]
            files.append(obj.key)
            #print(obj.key)
            os.makedirs(folder, exist_ok=True)  # succeeds even if directory exists.
            sampleDbBucket.download_file(obj.key, obj.key)        
        i+=1
        
def get_all_files(folder):
    """
    Summary line. 
    Scans folder and prepares files list
  
    Parameters: 
    arg1 (folder path)
  
    Returns: 
    Array of filepath
    """     
    # 1. checking your current working directory
    print('Current Working Directory : ',os.getcwd())

    # Get your current folder and subfolder event data
    filepath = os.getcwd() + folder
    print('Scanning Directory : ',filepath)

    # 2. Create a for loop to create a list of files and collect each filepath
    #    join the file path and roots with the subdirectories using glob
    #    get all files matching extension from directory
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))

    # 3. get total number of files found
    num_files = len(all_files)
    print('{} files found'.format(num_files))
    #print(all_files)
    return all_files
        
        
def merge_song_files(folder, mergedfile):        
    """
    Summary line. 
    Merges all song data into one file. Each song file contains only one JSON object.
  
    Parameters: 
    arg1 (folder path)
    arg2 (merged output filename)
  
    Returns: 
    None
    """     
    output_list = []
    all_files = get_all_files(folder)
    if len(all_files) > 0:
        for f in all_files:
            with open(f, "rb") as infile:
                output_list.append(json.load(infile))

        with open(mergedfile, "w", encoding="utf8") as outfile:
            json.dump(output_list, outfile)    

            
def merge_log_files(folder, mergedfile):
    """
    Summary line. 
    Merges all log data into one file. Each log file may contain more than one JSON objects and there will be only one JSON object per line.
  
    Parameters: 
    arg1 (folder path)
    arg2 (merged output filename)
  
    Returns: 
    None
    """         
    output_list = []
    all_files = get_all_files(folder)
    if len(all_files) > 0:
        for f in all_files:
            with open(f, 'r') as f:
                for line in f:
                    output_list.append(json.loads(line))

        with open(mergedfile, "w", encoding="utf8") as outfile:
            json.dump(output_list, outfile)    

            
def json_to_dataframe(infile):
    """
    Summary line. 
    Reads JSON file loads panda datafame
  
    Parameters: 
    arg1 (filename)
  
    Returns: 
    dataframe
    """         
    with open(infile) as datafile:
        data = json.load(datafile)

    df = pd.DataFrame(data)
    print("df shape {}".format(df.shape))
    return df
    
    

In [18]:
%%time
#Getting info on number of files in AWS for prefix song_data
size = sum(1 for _ in sampleDbBucket.objects.filter(Prefix="song_data"))
print(size)

14897
CPU times: user 3.43 s, sys: 24 ms, total: 3.45 s
Wall time: 10.4 s


In [19]:
%%time
#Getting info on number of files in AWS for prefix log_data
size = sum(1 for _ in sampleDbBucket.objects.filter(Prefix="log_data"))
print(size)

31
CPU times: user 14.5 ms, sys: 0 ns, total: 14.5 ms
Wall time: 98.9 ms


In [20]:
#Displaying the files
i = 0
for obj in sampleDbBucket.objects.filter(Prefix="song_data"):
    if i>5:
        break
    #print(obj.key)
    if obj.key.rsplit('/', 1)[1] != '': #condition to ignore folderPath
        print(obj.key.rsplit('/', 1))
    i+=1

['song_data/A/A/A', 'TRAAAAK128F9318786.json']
['song_data/A/A/A', 'TRAAAAV128F421A322.json']
['song_data/A/A/A', 'TRAAABD128F429CF47.json']
['song_data/A/A/A', 'TRAAACN128F9355673.json']
['song_data/A/A/A', 'TRAAAEA128F935A30D.json']


In [21]:
%%time
#This takes almost 1hr 30mins - So commenting it out
#download_files_from_s3('song_data', 20000)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.39 µs


In [22]:
%%time
#This takes almost 10Seconds - All done here, so not required to do it again
#download_files_from_s3('log_data', 50)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [23]:
#Merging Done - So commenting it out
#merge_song_files('/song_data', 'merged_song_data.json')

In [24]:
song_df = json_to_dataframe('merged_song_data.json')

df shape (14896, 10)


In [25]:
#df1 = pd.read_json('log_data/2018/11/2018-11-02-events.json', lines=True)
#df1.head()

In [26]:
#Merging Done - So commenting it out
#merge_log_files('/log_data', 'merged_log_data.json')

In [27]:
log_df = json_to_dataframe('merged_log_data.json')

df shape (8056, 18)


In [28]:
#Downloading Done - So not doing it again
#s3://udacity-dend/log_json_path.json
#No clue what is this file gonna be used for

#sampleDbBucket.download_file('log_json_path.json', 'log_json_path.json')

## Assess

### song_df dataset

In [17]:
song_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14896 entries, 0 to 14895
Data columns (total 10 columns):
artist_id           14896 non-null object
artist_latitude     5277 non-null float64
artist_location     14895 non-null object
artist_longitude    5277 non-null float64
artist_name         14896 non-null object
duration            14896 non-null float64
num_songs           14896 non-null int64
song_id             14896 non-null object
title               14896 non-null object
year                14896 non-null int64
dtypes: float64(3), int64(2), object(5)
memory usage: 1.1+ MB


In [158]:
song_df.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARTH9041187FB43E1F,,"St. Joseph, MO",,Eminem,312.2673,1,SOLXDDC12A6701FBFD,I'm Back,2000
1,ARQFUGM1187FB3E24E,,"Los Angeles, California, USA",,Black Label Society,258.95138,1,SOHVHKM12A8C13F716,Counterfeit God,2000
2,ARM2L4M1187B9B5F51,,,,FreQ Nasty,205.53098,1,SOFYPCI12A6701FF60,Fresh,2002
3,ARCQMJH1187B9B53CC,30.00875,"Smithville, TX",-97.16001,DJ Screw_ B-Legit_ Lil Keke_ Phaz_ Duke,262.89587,1,SOPEVXO12AB0185938,Screw Mix (feat. B-Legit_ Lil Keke_ Phaz_ Duke),0
4,ARNH6Q11187FB40FDD,40.65507,"Brooklyn, NY",-73.94888,Jeru The Damaja,245.18485,1,SOYZIYS12A6D4F5ED5,Revenge Of The Prophet (Part 5),1996


In [19]:
#Doesn't have latitude & longitude data(2926)
song_df[(song_df.artist_latitude.isnull()) & (song_df.artist_longitude.isnull()) & (song_df.artist_location != '')].head()
#song_df[(song_df.artist_latitude.isnull()) & (song_df.artist_longitude.isnull()) & (song_df.artist_location != '')].shape

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARTH9041187FB43E1F,,"St. Joseph, MO",,Eminem,312.2673,1,SOLXDDC12A6701FBFD,I'm Back,2000
1,ARQFUGM1187FB3E24E,,"Los Angeles, California, USA",,Black Label Society,258.95138,1,SOHVHKM12A8C13F716,Counterfeit God,2000
24,ARJ0AL61187B9A3F27,,Georgia,,Katie Melua,193.51465,1,SOFHFLK12AF72A4FB9,Mary Pickford (Used To Eat Roses),2008
27,ARJ2PMY1187FB5B563,,NY - Long Island,,Burning Spear,334.31465,1,SOVTFUO12A6310D813,The Invasion (Aka Black Wa Da Da),0
31,ARWR6RK1187FB3AB52,,"Long Island, NY",,From Autumn To Ashes,147.64363,1,SOHKXAC12A58A7F6E5,IV,2002


In [38]:
#Around 6693 rows doesn't have location & lats & long
print(song_df[(song_df.artist_location == '') & (song_df.artist_latitude.isnull()) & (song_df.artist_longitude.isnull())].shape)
song_df[(song_df.artist_location == '') & (song_df.artist_latitude.isnull()) & (song_df.artist_longitude.isnull())].head()

(6693, 10)


Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
2,ARM2L4M1187B9B5F51,,,,FreQ Nasty,205.53098,1,SOFYPCI12A6701FF60,Fresh,2002
5,AR1U3BY1187B9B136F,,,,God Lives Underwater,215.40526,1,SOPWHEV12AB017DC10,Don't Know How To Be,1995
6,AR1ILHX1187FB53892,,,,Magtens Korridorer,266.63138,1,SOUPGMR12AB01820DD,Døden Nær,2005
9,AR4K1XY1187B99479D,,,,General Degree,213.68118,1,SOETCUF12AB0182C10,Last Night,0
11,AR1GZB111C8A421C7A,,,,Andre Crom,400.8224,1,SOYXOEY12AB0180988,Warp,0


In [17]:
#Thought one artist_id could have location details and same artist_id didn't have location details. It doesn't look like that
bool_series = song_df.artist_id.duplicated()
song_df[bool_series].sort_values(by=['artist_id']).head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
11984,AR00B1I1187FB433EB,,"Stockholm, Sweden",,Eagle-Eye Cherry,207.15057,1,SONHPYP12A8C1417AE,One Good Reason,2000
5580,AR00FVC1187FB5BE3E,25.67084,"Monterrey, NL, México",-100.30953,Panda,225.82812,1,SOSIUCT12AB0182323,Conversación Casual,0
7301,AR00FVC1187FB5BE3E,25.67084,"Monterrey, NL, México",-100.30953,Panda,194.66404,1,SOIBMQP12AB017FF20,Espejismos Y Visiones,0
2859,AR00Y9I1187B999412,,,,Akercocke,467.59138,1,SOLMAZB12AB017FE90,Leviathan,2003
13904,AR00Y9I1187B999412,,,,Akercocke,256.522,1,SOQQWVQ12A8C132635,Breaking Silence,2001


In [35]:
#None needs to changed to blank
song_df[song_df.artist_location.isnull()]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
11357,AR0JBXL1187FB52810,19.40904,,-99.14977,St. Vincent,307.53914,1,SORMAXQ12A8C139224,Landmines,2007


In [37]:
#only this row had lats & long without location name
song_df[(song_df.artist_location == '') & ( (song_df.artist_longitude.notnull()) | (song_df.artist_latitude.notnull()) )].head(50)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
10229,ARZPWMP12086C175D6,31.1689,,-100.07715,Jannes,184.08444,1,SORUGLY12AB0187CC5,Laat Me Alleen,0


In [66]:
#Geo-coordinates are pointing to Salem, IL
song_df[song_df.artist_location.str.contains('originally', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
12184,ARAH4TU1187FB51D28,38.62666,"New York City (Salem, IL - originally)",-88.94561,Bill Laswell,132.67546,1,SOTTMXT12AF72A1B42,The Gary Selchie - Solas,0
12624,ARAH4TU1187FB51D28,38.62666,"New York City (Salem, IL - originally)",-88.94561,Bill Laswell,258.87302,1,SOHLWHP12A6D4F614B,Don't Lose Control,0
14699,ARAH4TU1187FB51D28,38.62666,"New York City (Salem, IL - originally)",-88.94561,Jah Wobble & Bill Laswell,463.35955,1,SOURIGQ12AB017FCA3,Subcode,0


In [18]:
song_df['num_songs'].value_counts()

1    14896
Name: num_songs, dtype: int64

In [19]:
#Location name is not in consistent format
song_df['artist_location'].value_counts()

                                    6694
London, England                      245
Los Angeles, CA                      217
New York, NY                         183
Chicago, IL                          120
California - LA                      103
NY - New York City                    99
Detroit, MI                           77
Brooklyn, NY                          74
California                            64
Philadelphia, PA                      63
Boston, MA                            63
Seattle, WA                           59
San Francisco, CA                     59
Texas                                 57
Atlanta, GA                           53
Sweden                                50
Glasgow, Scotland                     48
Manchester, England                   48
United States                         47
London                                47
Memphis, TN                           46
Kingston, Jamaica                     46
England                               45
Florida         

In [20]:
# Shows rows where any cell has null values.
hasnull_df = song_df[song_df.isnull().any(1)]
hasnull_df2 = hasnull_df[~( (hasnull_df.artist_location == '') & (hasnull_df.artist_latitude.isnull()) & (hasnull_df.artist_longitude.isnull()) )]
hasnull_df3 = hasnull_df2[~( (hasnull_df2.artist_location == '') | (hasnull_df2.artist_latitude.isnull()) | (hasnull_df2.artist_longitude.isnull()) )]
hasnull_df3

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
11357,AR0JBXL1187FB52810,19.40904,,-99.14977,St. Vincent,307.53914,1,SORMAXQ12A8C139224,Landmines,2007


In [21]:
#No duplicate records
print('All = ',song_df.shape)
bool_series = song_df.duplicated()
song_unique_df = song_df[~bool_series]
print('Unique = ',song_unique_df.shape)

All =  (14896, 10)
Unique =  (14896, 10)


### log_df dataset

In [31]:
log_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8056 entries, 0 to 8055
Data columns (total 18 columns):
artist           6820 non-null object
auth             8056 non-null object
firstName        7770 non-null object
gender           7770 non-null object
itemInSession    8056 non-null int64
lastName         7770 non-null object
length           6820 non-null float64
level            8056 non-null object
location         7770 non-null object
method           8056 non-null object
page             8056 non-null object
registration     7770 non-null float64
sessionId        8056 non-null int64
song             6820 non-null object
status           8056 non-null int64
ts               8056 non-null int64
userAgent        7770 non-null object
userId           8056 non-null object
dtypes: float64(2), int64(4), object(12)
memory usage: 1.1+ MB


In [23]:
log_df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,The Grass Roots,Logged In,Sara,F,72,Johnson,166.71302,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Let's Live For Today,200,1542153802796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
1,Stars,Logged In,Sara,F,73,Johnson,298.94485,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Time Can Never Kill The True Heart,200,1542153968796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
2,Eddie Palmieri,Logged In,Sara,F,74,Johnson,391.83628,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Nada De Ti,200,1542154266796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
3,The Bravery,Logged In,Sara,F,75,Johnson,168.14975,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Give In,200,1542154657796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
4,K.U.K.L,Logged In,Sara,F,76,Johnson,181.28934,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Anna,200,1542154825796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95


In [24]:
log_df.auth.value_counts()

Logged In     7770
Logged Out     286
Name: auth, dtype: int64

In [25]:
#log_df[(log_df.userId == '95') & (log_df.sessionId == 411)].sort_values(by=['itemInSession'])
log_df[(log_df.auth == 'Logged In')].sort_values(by=['itemInSession']).head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
3091,Chris Cornell,Logged In,Aleena,F,0,Kirby,353.69751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541023000000.0,350,Sunshower (Great Expectations Soundtrack),200,1541835258796,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,44
3081,Patrick Jumpen,Logged In,Ryan,M,0,Smith,208.87465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,480,Holiday,200,1541979540796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3083,Hoobastank,Logged In,Cierra,F,0,Finley,241.3971,free,"Richmond, VA",PUT,NextSong,1541013000000.0,132,Say The Same,200,1541808927796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",96
1325,Kid Cudi Vs Crookers,Logged In,Brantley,M,0,West,162.97751,free,"Portland-Vancouver-Hillsboro, OR-WA",PUT,NextSong,1541057000000.0,242,Day 'N' Nite,200,1541407807796,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,28
1323,Powderfinger,Logged In,Harper,M,0,Barrett,276.68853,paid,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1540685000000.0,129,My Happiness,200,1541407366796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",42


In [26]:
#Total Unique Songs
len(log_df.song.unique())

5190

In [27]:
#Status Error
# 200 = OK; 307 = Redirect; 404 = Resource not found
log_df.status.value_counts()

200    7846
307     201
404       9
Name: status, dtype: int64

In [28]:
log_df[log_df.status == 307].head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
77,,Logged In,Lily,F,1,Cooper,,free,"Columbia, SC",PUT,Logout,1541058000000.0,58,,307,1542174900796,"""Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebK...",59.0
80,,Logged Out,,,4,,,free,,PUT,Login,,58,,307,1542174942796,,
100,,Logged In,Rylan,M,2,George,,free,"Birmingham-Hoover, AL",PUT,Submit Upgrade,1541020000000.0,479,,307,1542177314796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",16.0
170,,Logged In,Chloe,F,3,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",PUT,Submit Upgrade,1540941000000.0,568,,307,1542184236796,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,49.0
285,,Logged In,Kate,F,4,Harrell,,paid,"Lansing-East Lansing, MI",PUT,Logout,1540473000000.0,558,,307,1542197300796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",97.0


In [30]:
log_df[log_df.registration.isnull()].head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
78,,Logged Out,,,2,,,free,,GET,Home,,58,,200,1542174903796,,
79,,Logged Out,,,3,,,free,,GET,Home,,58,,200,1542174941796,,
80,,Logged Out,,,4,,,free,,PUT,Login,,58,,307,1542174942796,,
286,,Logged Out,,,5,,,paid,,GET,Home,,558,,200,1542197341796,,
287,,Logged Out,,,6,,,paid,,PUT,Login,,558,,307,1542197342796,,


In [31]:
log_df[log_df.artist.isnull()].head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
10,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,551,,200,1542156103796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
15,,Logged In,Sara,F,85,Johnson,,paid,"Winston-Salem, NC",GET,Home,1540809000000.0,411,,200,1542156862796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
16,,Logged In,Sara,F,86,Johnson,,paid,"Winston-Salem, NC",GET,About,1540809000000.0,411,,200,1542157048796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
17,,Logged In,Sara,F,87,Johnson,,paid,"Winston-Salem, NC",GET,Home,1540809000000.0,411,,200,1542157182796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
18,,Logged In,Mohammad,M,0,Rodriguez,,free,"Sacramento--Roseville--Arden-Arcade, CA",GET,Home,1540512000000.0,539,,200,1542162914796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",88


### Quality  
1. log_df : Only page='NextSong' is required(not filtering in dataframe)  
2. log_df : Delete rows which has userId as null/blank(not filtering in dataframe)  
3. song_df: Around 6693 rows doesn't have location & lats & long.(No Action)  
4. song_df: Around 4762 rows have Year has 0 (No Action)  
5. song_df: Format of location is not consistent. London, England/Texas/California - LA  
6. song_df: Around 2926 rows has just the location name doesn't have latitude, longitude(handled in tidiness)  
7. song_df: artist_location has None & Blank (change none to blank) and other location corrections will be handled
8. log_df : When auth='Logged Out' it doesn't have userId. But when 'Logged In', userId is captured. Due to this quality issue, you cannot say exactly how long a user id logged in. You can just make a guess by listing the longs.(No Action)  

### Tidiness  
1. log_df : convert ts from integer to timestamp  
2. song_df: column num_songs can be removed as it has only one value ( 1 ) (No Action in dataframe)  
3. log_df : convert registration column to int(has null convert to 0 then int)  
4. song_df: Add columns : county, city, state, country  
5. song_df: Round duration column(value is seconds, it doesn't have to be in float)

## Clean

In [32]:
song_df_copy = song_df.copy()
log_df_copy = log_df.copy()

In [30]:
print('Total rows in song_df_copy = {}'.format(song_df_copy.shape[0]))
print('Total rows in log_df_copy = {}'.format(log_df_copy.shape[0]))

Total rows in song_df_copy = 14896
Total rows in log_df_copy = 8056


### Issue 1 : Location corrections
### Define
song_df: artist_location has None & Blank (change none to blank)
### Code

In [91]:
song_df_copy[song_df_copy.artist_location.isnull()]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
11357,AR0JBXL1187FB52810,19.40904,,-99.14977,St. Vincent,307.53914,1,SORMAXQ12A8C139224,Landmines,2007


In [92]:
song_df_copy.artist_location = song_df_copy.artist_location.replace(np.nan, '')

### Test

In [93]:
song_df_copy[song_df_copy.artist_location.isnull()]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


### Define
Artist Location = New York City (Salem, IL - originally) and geo-coordinates are pointing to Salem, IL not New York City. 

In [94]:
song_df_copy[song_df_copy.artist_location.str.contains('originally', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
12184,ARAH4TU1187FB51D28,38.62666,"New York City (Salem, IL - originally)",-88.94561,Bill Laswell,132.67546,1,SOTTMXT12AF72A1B42,The Gary Selchie - Solas,0
12624,ARAH4TU1187FB51D28,38.62666,"New York City (Salem, IL - originally)",-88.94561,Bill Laswell,258.87302,1,SOHLWHP12A6D4F614B,Don't Lose Control,0
14699,ARAH4TU1187FB51D28,38.62666,"New York City (Salem, IL - originally)",-88.94561,Jah Wobble & Bill Laswell,463.35955,1,SOURIGQ12AB017FCA3,Subcode,0


In [95]:
song_df_copy[song_df_copy.artist_location.str.contains('Belgica -- Namur, Namur/Ghent, East Flanders', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
1110,ARZEWUR1187FB53DC8,50.45663,"Belgica -- Namur, Namur/Ghent, East Flanders",4.87137,Enthroned,240.74404,1,SOXCWTT12AC9075756,The Vitalized Shell,2010


In [96]:
song_df_copy[song_df_copy.artist_location.str.contains('D�orf, Germany', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
251,ARI9UV41187B98AB58,,"D�orf, Germany",,Kreidler,128.54812,1,SOYWKMX12A6D4FA6A0,Hillwood,1996
1257,ARQFRTG1187FB3F003,,"D�orf, Germany",,Waldo's People,197.58975,1,SOMKOPP12A8C13BD34,U Drive Me Crazy,1998
6028,AR1ZKBE1187FB53629,,"D�orf, Germany",,Kraftwerk,272.03873,1,SOXOUJH12A6D4FC39B,Pocket Calculator (2009 Digital Remaster),0
7095,AROZOP31187FB5A969,50.7266,"D�orf, Germany",6.51747,Propaganda,395.31057,1,SOPZREU12A58A79EAB,Das Testament Des Dr Mabuse,0
8746,AR1ZKBE1187FB53629,,"D�orf, Germany",,Kraftwerk,201.09016,1,SOXVRHB12A8C13F14B,Titanium (2009 Digital Remaster),0
12218,AR1ZKBE1187FB53629,,"D�orf, Germany",,Kraftwerk,399.33342,1,SORWDCM12A6D4FC397,Computer Love (2009 Digital Remaster),0
12585,AR1ZKBE1187FB53629,,"D�orf, Germany",,Kraftwerk,54.69995,1,SOPJCER12A6D4FCFCE,The Voice Of Energy (2009 Digital Remaster),0
12830,ARHJMEF1187FB415AE,,"D�orf, Germany",,Fehlfarben,291.13424,1,SOJXNPU12A8C13170B,Magnificent Obsession,1983
12902,ARD3OO31187FB4F937,50.7266,"D�orf, Germany",6.51747,Mouse On Mars,316.76036,1,SOKUJRJ12A6D4FA026,First : Break,2001
13197,AROZOP31187FB5A969,50.7266,"D�orf, Germany",6.51747,Propaganda,311.2224,1,SORQPVK12A6D4F7F78,Heaven Give Me Words,0


In [97]:
song_df_copy[song_df_copy.artist_location.str.contains('�gelholm, Sweden', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
830,ARNUU2R1187B9A316C,62.19845,"�gelholm, Sweden",17.55142,Sounds Like Violence,266.4224,1,SOJIUNI12A6D4FCA34,The Greatest,2007


In [98]:
song_df_copy[song_df_copy.artist_location.str.contains('Dubioza kolektiv', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
461,ARTHJGQ1187FB42F1F,43.8586,Dubioza kolektiv (aka Dubioza as referred to b...,18.4295,Dubioza kolektiv,318.85016,1,SOLPTHV12AC9073057,Marijuana,2006


In [99]:
song_df_copy[song_df_copy.artist_location.str.contains('href', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
5785,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,287.92118,1,SOVYXYL12AF72A3373,Rebel Yell (1999 Digital Remaster),1983
7451,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,247.53587,1,SOLQYSZ12AB0181F97,Mony Mony (Live),1987
9049,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,443.14077,1,SOIEXLS12A6D4F792F,Shangrila,1993
10749,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,237.06077,1,SOETDBF12A81C20BC0,Body Snatcher,2005
11505,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,233.22077,1,SOVIYJY12AF72A4B00,The Dead Next Door (Digitally Remastered 99),1983


In [100]:
#artist_location has integer values
for index, row in song_df_copy.iterrows():
    if(type(row['artist_location']) == int):
        print(row.values)

['ARPPJJO1187B99D171' 25.03512 100 121.5152 'Michael Stearns' 427.59791 1
 'SOEBWIE12A8C14380F' 'Sacred Site Soundtrack' 1993]
['ARBZIN01187FB362CC' 1.32026 27 103.78871 'Paris Hilton' 192.28689 1
 'SOERIDA12A6D4F8506' 'I Want You (Album Version)' 2006]
['ARC0IOF1187FB3F6E6' nan 108 nan 'Huey Lewis & The News' 286.4322 1
 'SOHDWWH12A6D4F7F6A' 'I Want A New Drug' 1983]
['ARZ4DLQ1187B9B9106' nan 99 nan 'Frankie Bones' 196.0224 1
 'SOPNQYA12AB018A04E' 'Circus Plate #1' 2005]
['ARC0IOF1187FB3F6E6' nan 108 nan 'Huey Lewis And The News' 216.11057 1
 'SOTIFMS12A5891F8D1' "It's Alright (LP Version)" 2005]
['ARC0IOF1187FB3F6E6' nan 108 nan 'Huey Lewis And The News' 345.96526 1
 'SOHVMFI12A8C13C63E' 'Thank You #19 (LP Version)' 2005]
['ARBZIN01187FB362CC' 1.32026 27 103.78871 'Paris Hilton' 274.05016 1
 'SOYEDMV12A58A7AF6D' "Do Ya Think I'm Sexy (Album Version)" 2006]
['ARV36QK1187FB52260' nan 104 nan 'Charlie Hunter Trio' 278.83057 1
 'SOOIAVT12A8C13B7F9' 'Drop A Dime' 2007]
['ARZ8MVL1187FB41D1

In [101]:
for index, row in song_df_copy.iterrows():
    if(type(row['artist_location']) == int):
        song_df_copy.at[index,'artist_location']=''

In [102]:
#artist_location has href links
for index, row in song_df_copy.iterrows():
    if('href' in row.artist_location):
        print(row['artist_location'])

<a href="http://billyidol.net" onmousedown='UntrustedLink.bootstrap($(this), "fc44f8f60d13ab68c56b3c6709c6d670", event)' target="_blank" rel="nofollow">http://billyidol.net</a>
<a href="http://billyidol.net" onmousedown='UntrustedLink.bootstrap($(this), "fc44f8f60d13ab68c56b3c6709c6d670", event)' target="_blank" rel="nofollow">http://billyidol.net</a>
<a href="http://billyidol.net" onmousedown='UntrustedLink.bootstrap($(this), "fc44f8f60d13ab68c56b3c6709c6d670", event)' target="_blank" rel="nofollow">http://billyidol.net</a>
<a href="http://billyidol.net" onmousedown='UntrustedLink.bootstrap($(this), "fc44f8f60d13ab68c56b3c6709c6d670", event)' target="_blank" rel="nofollow">http://billyidol.net</a>
<a href="http://billyidol.net" onmousedown='UntrustedLink.bootstrap($(this), "fc44f8f60d13ab68c56b3c6709c6d670", event)' target="_blank" rel="nofollow">http://billyidol.net</a>


In [103]:
song_df_copy[song_df_copy.artist_location.str.contains('DA BEAN', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
5161,ARAQMES1187FB4D46A,28.64334,"I AM DA BEAN!!! YIMEAN!!! 3S4LIF3!!!!, Florida",-81.23258,Benzino / The Outlawz,228.96281,1,SOHJBCO12AB01889CD,Feel Your Pain,0
11435,ARAQMES1187FB4D46A,28.64334,"I AM DA BEAN!!! YIMEAN!!! 3S4LIF3!!!!, Florida",-81.23258,Benzino / K.T. / Prince / Smoke / Luv' / Weird...,260.8322,1,SOSHXLZ12AB0185894,Throw Them 3's (Bouston Ni****),0


In [104]:
song_df_copy[song_df_copy.artist_location.str.contains('YAHD!', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
1725,ARZLVVQ11F4C8421BA,39.49974,YAHD!,-111.54732,T.O.K,188.1073,1,SOTQSHS12A8AE46A4A,Money Maker,0
11363,ARFH6FT1187FB5BF47,39.49974,YAHD!,-111.54732,T.O.K.,225.72363,1,SOFLPJR12A8C134166,Saturday,2001


In [105]:
song_df_copy[song_df_copy.artist_location.str.contains('Superdupont', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
10725,AR107SB1187B9A79F9,,"Bézu, comme Superdupont, ne connait qu&#039;un...",,Bézu,184.0322,1,SOQJWZI12A8C140181,La bite du plombier,0


In [106]:
song_df_copy[song_df_copy.artist_location.str.contains('&', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
26,ARX2R5A1187FB5B85B,33.74831,"Joliet, IL &amp; Atlanta, GA",-84.39111,Ann Nesby,243.01669,1,SOUEARP12A58A7D643,Tables Turn,0
33,AR6TW8G119B8668BA1,54.50292,"Derry &amp; Dungannon, N.Ireland",-6.76687,The Japanese Popstars,235.83302,1,SODJQMR12A6D4FD321,Sample Whore,2008
5780,ART4QZC1187FB51612,,Los Angeles &amp; New York,,Janet Jackson,67.10812,1,SOBKMDD12A58A7F476,Interlude: Livin'...In Complete Darkness,0
10236,ARNHJW31187B9A2113,38.04859,"Lexington, KY &amp; Atlanta, GA",-84.50032,Donnie,310.96118,1,SOJZGKZ12A6D4F84E8,Cloud 9,2002
10725,AR107SB1187B9A79F9,,"Bézu, comme Superdupont, ne connait qu&#039;un...",,Bézu,184.0322,1,SOQJWZI12A8C140181,La bite du plombier,0
12426,AR548LF1187FB4ADDF,42.50382,Italy &amp; London,12.57347,Italoboyz,477.83138,1,SOAHYWV12A58A7AA0C,L'Anagramme,0
13842,AR72HPG1187B9A1496,51.84005,"HERTS,SG, Producer, song writter & rapper, UK",-0.2751,The Mission,229.98159,1,SOCKMDZ12A6D4F6A79,Hands Across The Ocean,1990


In [107]:
# 1)convert &amp; to & 2)Pick one of two locations(2 from array)
for index, row in song_df_copy.iterrows():
    if('&' in row.artist_location):
        print(row['artist_location'],'|', html.unescape(row.artist_location).split('&')[-1])

Joliet, IL &amp; Atlanta, GA |  Atlanta, GA
Derry &amp; Dungannon, N.Ireland |  Dungannon, N.Ireland
Los Angeles &amp; New York |  New York
Lexington, KY &amp; Atlanta, GA |  Atlanta, GA
Bézu, comme Superdupont, ne connait qu&#039;un pays : la France ! | Bézu, comme Superdupont, ne connait qu'un pays : la France !
Italy &amp; London |  London
HERTS,SG, Producer, song writter & rapper, UK |  rapper, UK


In [108]:
song_df_copy[song_df_copy.artist_location.str.contains('born', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
10825,ARQF61A1187FB4A61F,59.33217,"born 19 July 1976 (age 33) in Täby, Stockholm,...",18.06243,Eric Prydz,360.46322,1,SOYZIFC12AB0186300,Pjanoo (High Contrast Remix),2008


In [109]:
song_df_copy[song_df_copy.artist_location.str.contains('slightly', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
7667,AR7WSQE1187FB4CF08,,"Copenhagen, Denmark (With a slightly touch of ...",,Aqua,239.3073,1,SOOLCUY12A6D4F9119,Goodbye To The Circus,0


In [110]:
song_df_copy[song_df_copy.artist_location.str.contains('Rossville, Tennessee', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
7405,AR06M3F1187FB38BED,34.51042,"Rossville, Tennessee (Lived in Como, Mississippi)",-89.94049,Mississippi Fred McDowell,226.37669,1,SOWKEYU12A8C13C384,You Gotta Move,1999


In [111]:
song_df_copy.artist_location = song_df_copy.artist_location.replace('New York City (Salem, IL - originally)','Salem, IL')
song_df_copy.artist_location = song_df_copy.artist_location.replace('Belgica -- Namur, Namur/Ghent, East Flanders','Namur, Namur')
song_df_copy.artist_location = song_df_copy.artist_location.replace('D�orf, Germany','Dorf, Germany')
song_df_copy.artist_location = song_df_copy.artist_location.replace('�gelholm, Sweden','angelholm, Sweden')
song_df_copy.artist_location = song_df_copy.artist_location.replace('Dubioza kolektiv (aka Dubioza as referred to by some fans) is a Bosnian band from Sarajevo','Sarajevo')
song_df_copy.artist_location = song_df_copy.artist_location.replace('YAHD!','')

ix = int(song_df_copy[song_df_copy.artist_location.str.contains('Superdupont', na=False)].index.values)
song_df_copy.at[ix,'artist_location']='France'

#song_df_copy.artist_location = song_df_copy.artist_location.replace("&",'')
#Replace integer value with string
#song_df_copy['artist_location'] = song_df_copy['artist_location'].map({27:'', 100:''}) 

In [112]:
for index, row in song_df_copy.iterrows():
    if(type(row['artist_location']) == int):
        song_df_copy.at[index,'artist_location']=''
        
    if('href' in row.artist_location):
        song_df_copy.at[index,'artist_location']=''
    if('DA BEAN' in row.artist_location):
        song_df_copy.at[index,'artist_location']=''        
    if('Superdupont' in row.artist_location):
        song_df_copy.at[index,'artist_location']=''     
    if('&' in row.artist_location):
        song_df_copy.at[index,'artist_location']=html.unescape(row.artist_location).split('&')[-1]
        
    if('born' in row.artist_location):
        song_df_copy.at[index,'artist_location']=''             
    if('slightly' in row.artist_location):
        song_df_copy.at[index,'artist_location']='Copenhagen, Denmark'
    if('Rossville, Tennessee' in row.artist_location):
        song_df_copy.at[index,'artist_location']='Rossville, Tennessee'


In [113]:
#print(''.join(row['artist_location']))
for index, row in song_df_copy.iterrows():
    if(type(row['artist_location']) == tuple):
        print(row.values)

In [114]:
#print(''.join(row['artist_location']))
for index, row in song_df_copy.iterrows():
    if(type(row['artist_location']) == tuple):
        song_df_copy.at[index,'artist_location']=''.join(row['artist_location'])                

### Test

In [115]:
#print(''.join(row['artist_location']))
for index, row in song_df_copy.iterrows():    
    if(type(row['artist_location']) == tuple):
        print(row.values)

In [116]:
song_df_copy[song_df_copy.artist_location.str.contains('originally', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


In [117]:
song_df_copy[song_df_copy.artist_location.str.contains('Belgica -- Namur, Namur/Ghent, East Flanders', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


In [118]:
song_df_copy[song_df_copy.artist_location.str.contains('D�orf, Germany', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


In [119]:
song_df_copy[song_df_copy.artist_location.str.contains('�gelholm, Sweden', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


In [120]:
song_df_copy[song_df_copy.artist_location.str.contains('Dubioza kolektiv', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


In [121]:
#artist_location has integer values
for index, row in song_df_copy.iterrows():
    if(type(row['artist_location']) == int):
        print(row.values)

In [122]:
#artist_location has href links
for index, row in song_df_copy.iterrows():
    if('href' in row.artist_location):
        print(row['artist_location'])

In [123]:
song_df_copy[song_df_copy.artist_location.str.contains('Superdupont', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


In [124]:
song_df_copy[song_df_copy.artist_location.str.contains('&', na=False)]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year


### Issue 2 : Convert ts(integer) to timestamp
### Define
log_df : convert ts from integer to timestamp  
### Code

In [125]:
log_df_copy.head(2)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,The Grass Roots,Logged In,Sara,F,72,Johnson,166.71302,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Let's Live For Today,200,1542153802796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
1,Stars,Logged In,Sara,F,73,Johnson,298.94485,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Time Can Never Kill The True Heart,200,1542153968796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95


In [126]:
log_df_copy['ts'] = pd.to_datetime(log_df_copy['ts'], unit='ms')

### Test

In [127]:
log_df_copy.head(2)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,The Grass Roots,Logged In,Sara,F,72,Johnson,166.71302,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Let's Live For Today,200,2018-11-14 00:03:22.796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
1,Stars,Logged In,Sara,F,73,Johnson,298.94485,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,411,Time Can Never Kill The True Heart,200,2018-11-14 00:06:08.796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95


In [128]:
log_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8056 entries, 0 to 8055
Data columns (total 18 columns):
artist           6820 non-null object
auth             8056 non-null object
firstName        7770 non-null object
gender           7770 non-null object
itemInSession    8056 non-null int64
lastName         7770 non-null object
length           6820 non-null float64
level            8056 non-null object
location         7770 non-null object
method           8056 non-null object
page             8056 non-null object
registration     7770 non-null float64
sessionId        8056 non-null int64
song             6820 non-null object
status           8056 non-null int64
ts               8056 non-null datetime64[ns]
userAgent        7770 non-null object
userId           8056 non-null object
dtypes: datetime64[ns](1), float64(2), int64(3), object(12)
memory usage: 1.1+ MB


### Issue 3 : Convert registration column to int
### Define
log_df : convert registration column to int(has null convert to 0 then int)  
### Code

In [129]:
log_df_copy[log_df_copy.registration.isnull()].head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
78,,Logged Out,,,2,,,free,,GET,Home,,58,,200,2018-11-14 05:55:03.796,,
79,,Logged Out,,,3,,,free,,GET,Home,,58,,200,2018-11-14 05:55:41.796,,
80,,Logged Out,,,4,,,free,,PUT,Login,,58,,307,2018-11-14 05:55:42.796,,
286,,Logged Out,,,5,,,paid,,GET,Home,,558,,200,2018-11-14 12:09:01.796,,
287,,Logged Out,,,6,,,paid,,PUT,Login,,558,,307,2018-11-14 12:09:02.796,,


In [130]:
log_df_copy.registration = log_df_copy.registration.replace(np.nan, 0)
log_df_copy['registration'] = log_df_copy.registration.astype(int)

### Test

In [131]:
log_df_copy[log_df_copy.registration.isnull()].head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId


In [34]:
log_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8056 entries, 0 to 8055
Data columns (total 18 columns):
artist           6820 non-null object
auth             8056 non-null object
firstName        7770 non-null object
gender           7770 non-null object
itemInSession    8056 non-null int64
lastName         7770 non-null object
length           6820 non-null float64
level            8056 non-null object
location         7770 non-null object
method           8056 non-null object
page             8056 non-null object
registration     7770 non-null float64
sessionId        8056 non-null int64
song             6820 non-null object
status           8056 non-null int64
ts               8056 non-null int64
userAgent        7770 non-null object
userId           8056 non-null object
dtypes: float64(2), int64(4), object(12)
memory usage: 1.1+ MB


### Issue 4 : Round duration column
### Define
song_df: Round duration column(value is seconds, it doesn't have to be in float)

### Code

In [133]:
song_df_copy['duration'] = song_df_copy['duration'].astype('int64')

### Test

In [134]:
song_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14896 entries, 0 to 14895
Data columns (total 10 columns):
artist_id           14896 non-null object
artist_latitude     5277 non-null float64
artist_location     14896 non-null object
artist_longitude    5277 non-null float64
artist_name         14896 non-null object
duration            14896 non-null int64
num_songs           14896 non-null int64
song_id             14896 non-null object
title               14896 non-null object
year                14896 non-null int64
dtypes: float64(2), int64(3), object(5)
memory usage: 1.1+ MB


In [135]:
song_df_copy.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARTH9041187FB43E1F,,"St. Joseph, MO",,Eminem,312,1,SOLXDDC12A6701FBFD,I'm Back,2000
1,ARQFUGM1187FB3E24E,,"Los Angeles, California, USA",,Black Label Society,258,1,SOHVHKM12A8C13F716,Counterfeit God,2000
2,ARM2L4M1187B9B5F51,,,,FreQ Nasty,205,1,SOFYPCI12A6701FF60,Fresh,2002
3,ARCQMJH1187B9B53CC,30.00875,"Smithville, TX",-97.16001,DJ Screw_ B-Legit_ Lil Keke_ Phaz_ Duke,262,1,SOPEVXO12AB0185938,Screw Mix (feat. B-Legit_ Lil Keke_ Phaz_ Duke),0
4,ARNH6Q11187FB40FDD,40.65507,"Brooklyn, NY",-73.94888,Jeru The Damaja,245,1,SOYZIYS12A6D4F5ED5,Revenge Of The Prophet (Part 5),1996


### Issue 5 : Add columns county, city, state, country, country_code to song_df
### Define
a. Fix some artist_location names  
b. Add columns county, city, state, country, country_code

### Code

In [136]:
print('All = ',song_df_copy.shape[0])
print('Locations = ',song_df_copy[(song_df_copy.artist_location != '')].shape[0])
print('Blank = ',song_df_copy[song_df_copy.artist_location.str.strip() == ''].shape[0])
print('LatLon = ',song_df_copy[(song_df_copy.artist_location.str.strip() == '') & ( (song_df_copy.artist_longitude.notnull()) & (song_df_copy.artist_latitude.notnull()) )].shape[0])

All =  14896
Locations =  8179
Blank =  6725
LatLon =  18


In [137]:
#song_df with locations
song_df_wlocations = song_df_copy[(song_df_copy.artist_location.str.strip() != '')]
song_df_wblank = song_df_copy[(song_df_copy.artist_location.str.strip() == '')]
song_df_latlon = song_df_copy[(song_df_copy.artist_location.str.strip() == '') & ( (song_df_copy.artist_longitude.notnull()) & (song_df_copy.artist_latitude.notnull()) )]

#Getting the true blanks(no locations & no coordinates)
song_df_tblank = pd.merge(song_df_wblank,song_df_latlon, how='outer', indicator=True)
song_df_wblank = song_df_tblank[song_df_tblank._merge =='left_only'].copy()
#song_df_wblank.drop(['_merge'], axis=1)
print(song_df_wblank.shape)

#song_df_wlocations.reset_index(drop=True)
#song_df_wblank.reset_index(drop=True)
#song_df_latlon.reset_index(drop=True)

print('All = ',song_df_copy.shape[0])
print('Locations = ',song_df_wlocations.shape[0])
print('Blank = ',song_df_wblank.shape[0])
print('LatLon= ',song_df_latlon.shape[0])

(6707, 11)
All =  14896
Locations =  8171
Blank =  6707
LatLon=  18


In [138]:
song_df_wblank.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge
0,ARM2L4M1187B9B5F51,,,,FreQ Nasty,205,1,SOFYPCI12A6701FF60,Fresh,2002,left_only
1,AR1U3BY1187B9B136F,,,,God Lives Underwater,215,1,SOPWHEV12AB017DC10,Don't Know How To Be,1995,left_only
2,AR1ILHX1187FB53892,,,,Magtens Korridorer,266,1,SOUPGMR12AB01820DD,Døden Nær,2005,left_only
3,AR4K1XY1187B99479D,,,,General Degree,213,1,SOETCUF12AB0182C10,Last Night,0,left_only
4,AR1GZB111C8A421C7A,,,,Andre Crom,400,1,SOYXOEY12AB0180988,Warp,0,left_only


In [139]:
song_df_wlocations_artist = song_df_wlocations["artist_name"].tolist()
song_df_wblank_artist = song_df_wblank['artist_name'].tolist()
print(len(song_df_wlocations_artist))
print(len(song_df_wblank_artist))

artist_found = []
for s in song_df_wblank_artist:
    if(s in song_df_wlocations_artist):
        #print(s)
        artist_found.append(s)
        
        
#song_df_wlocations[song_df_wlocations.artist_name =='Billy Idol']

8171
6707


In [140]:
print('song_df_wblank = ', song_df_wblank.shape)

song_df_wblank =  (6707, 11)


In [141]:
#Its found certain known artists locations(song_df_wlocations) are not filled in song_df_wblank. 
#updating song_df_blank with location details
index_list = []
count = 0
song_df_new_rows = pd.DataFrame()

for index, row in song_df_wblank.iterrows():
    #    print(index)
    artist_name = row['artist_name']
    artist_location = ''
    lrow = song_df_wlocations[song_df_wlocations.artist_name == artist_name]
    if(lrow.shape[0] != 0):
        count+=1
        #print(lrow.shape)
        artist_location = lrow.iloc[0]['artist_location']
        song_df_wblank.at[index,'artist_location']=artist_location
        index_list.append(index)
        song_df_new_rows = song_df_new_rows.append(lrow)
        
        #print(index, ' - ', artist_location)
        
    #a = round(song_df_latlon[song_df_latlon.artist_name =='Billy Idol'].iloc[0]['artist_latitude'], 4)
    
print('Count = ', count)
print(index_list)

Count =  58
[79, 97, 125, 399, 439, 487, 510, 528, 639, 1061, 1206, 1416, 1564, 1609, 1719, 1776, 2023, 2118, 2204, 2359, 2841, 2973, 2993, 3006, 3120, 3317, 3514, 3537, 3729, 3748, 3841, 4058, 4081, 4332, 4442, 4457, 4527, 4531, 4657, 4719, 4907, 5091, 5166, 5224, 5246, 5254, 5331, 5427, 5484, 5571, 5682, 5924, 6181, 6182, 6233, 6384, 6621, 6622]


In [142]:
#song_df_new_rows.reset_index(drop=True)
print(song_df_new_rows.shape)
print(len(index_list))
print(len(artist_found))
#song_df_new_rows[song_df_new_rows.artist_location.str.strip() == '']

(77, 10)
58
58


In [143]:
print(song_df_wlocations.shape)
print(song_df_new_rows.shape)

(8171, 10)
(77, 10)


In [144]:
#Moving artist_location updated rows from song_df_wblank to song_df_wlocations
print('Before song_df_wblank = ',song_df_wblank.shape)
print('Before song_df_wlocations = ',song_df_wlocations.shape)
song_df_new = song_df_wblank[song_df_wblank.artist_location.str.strip() !=''].copy()
song_df_wblank = song_df_wblank[song_df_wblank.artist_location.str.strip() ==''].copy()

#Commenting below due to ValueError: cannot reindex from a duplicate axis
#song_df_new = song_df_new[['artist_id', 'artist_latitude', 'artist_location', 'artist_latitude', 'artist_name', 'duration', 'num_songs', 'song_id', 'title', 'year']]
#song_df_wlocations = song_df_wlocations[['artist_id', 'artist_latitude', 'artist_location', 'artist_latitude', 'artist_name', 'duration', 'num_songs', 'song_id', 'title', 'year']]

song_df_wlocations = song_df_wlocations.append(song_df_new, sort=False)
#song_df_wlocations = pd.concat([song_df_wlocations,song_df_new], ignore_index=True)
print('After song_df_wblank = ',song_df_wblank.shape)
print('After song_df_wlocations = ',song_df_wlocations.shape)

Before song_df_wblank =  (6707, 11)
Before song_df_wlocations =  (8171, 10)
After song_df_wblank =  (6649, 11)
After song_df_wlocations =  (8229, 11)


In [145]:
#Write song_df_wblank to CSV
song_df_wblank.to_csv('song_df_wblank.csv', encoding='utf-8', index=False)

In [146]:
#Moving rows with lats & longs to song_df_latlon
print('Before song_df_latlon = ',song_df_latlon.shape)
print('Before song_df_wlocations = ',song_df_wlocations.shape)
temp = song_df_wlocations[song_df_wlocations.artist_latitude.notnull()].copy()
song_df_wlocations = song_df_wlocations[song_df_wlocations.artist_latitude.isnull()].copy()
song_df_latlon = song_df_latlon.append(temp, sort=False)
print('After song_df_latlon = ',song_df_latlon.shape)
print('After song_df_wlocations = ',song_df_wlocations.shape)

Before song_df_latlon =  (18, 10)
Before song_df_wlocations =  (8229, 11)
After song_df_latlon =  (5277, 11)
After song_df_wlocations =  (2970, 11)


In [73]:
#Adding extra columns to song_df_latlon for processing.
'''
song_df_latlon_copy = song_df_latlon.copy()

song_df_latlon_copy['process'] = np.NaN
song_df_latlon_copy['process'] = song_df_latlon_copy.process.astype(str)
song_df_latlon_copy['corrected_location'] = np.NaN
song_df_latlon_copy['corrected_location'] = song_df_latlon_copy.corrected_location.astype(str)
song_df_latlon_copy['county'] = np.NaN
song_df_latlon_copy['county'] = song_df_latlon_copy.county.astype(str)
song_df_latlon_copy['city'] = np.NaN
song_df_latlon_copy['city'] = song_df_latlon_copy.city.astype(str)
song_df_latlon_copy['state'] = np.NaN
song_df_latlon_copy['state'] = song_df_latlon_copy.state.astype(str)
song_df_latlon_copy['country'] = np.NaN
song_df_latlon_copy['country'] = song_df_latlon_copy.country.astype(str)
song_df_latlon_copy['country_code'] = np.NaN
song_df_latlon_copy['country_code'] = song_df_latlon_copy.country_code.astype(str)
'''
#Don't uncomment will override the existing worked up file
#song_df_latlon_copy.to_csv('song_df_latlon.csv', encoding='utf-8', index=False)

### Query geopy and get data for latitude & longitude by running GatherLocationDetails.ipynb

In [147]:
song_df_latlon = pd.read_csv("song_df_latlon.csv")
print('song_df_latlon.csv = ',song_df_latlon.shape)

song_df_latlon.csv =  (5277, 18)


In [74]:
#Geo Coordinates are found to be wrong for these countries as per geopy.
#Verified (16.855760, -69.468540) and seems to be middle of nowhere, ocean between Jamica & British Virgin Islands
song_df_latlon[song_df_latlon.process.isnull()]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
761,ARD2GXE1187B9A2E26,16.85576,"Guyana, West Indies",-69.46854,Mad Professor,266,1,SOUIPHX12A6D4F8AB0,Schizophrenic Dub,1986,,,,,,,,
3770,AR40FJJ1187FB3C72C,16.85576,West Indies,-69.46854,Osibisa,233,1,SOTENWL12A6D4FCAED,Abele,1995,,,,,,,,
5167,AR40FJJ1187FB3C72C,16.85576,West Indies,-69.46854,Osibisa,320,1,SOSLAOP12A6D4FCAE2,Get Up,1995,,,,,,,,
5252,ARMDBW21187FB37126,16.85576,"Barbados, West Indies",-69.46854,Grandmaster Flash,197,1,SOAYSDV12AB0186EE0,The King,0,,,,,,,,


In [148]:
print('Total = {}'.format(song_df_latlon.shape))
print('Null  = {}'.format(song_df_latlon[song_df_latlon.process.isnull()].shape))
print('Values= {}'.format(song_df_latlon[song_df_latlon.process.notnull()].shape))

Total = (5277, 18)
Null  = (4, 18)
Values= (5273, 18)


In [149]:
#Moving rows from song_df_latlon to song_df_wlocations due to wrong geo-coordinates
print('Before song_df_latlon = ',song_df_latlon.shape)
print('Before song_df_wlocations = ',song_df_wlocations.shape)

song_df_wlocations_copy = song_df_wlocations.copy()
temp = song_df_latlon[song_df_latlon.process.isnull()].copy()
song_df_latlon = song_df_latlon[song_df_latlon.process.notnull()].copy()
song_df_wlocations = song_df_wlocations.append(temp, sort=False)

print('After song_df_latlon = ',song_df_latlon.shape)
print('After song_df_wlocations = ',song_df_wlocations.shape)

Before song_df_latlon =  (5277, 18)
Before song_df_wlocations =  (2970, 11)
After song_df_latlon =  (5273, 18)
After song_df_wlocations =  (2974, 18)


In [77]:
def correct_spelling(dictSpell, istr):
    outArray = [];
    location = 'Jping, Sweden'
    #print('Input = ',istr)
    array = istr.split(',')
    #print('Array = ',array)
    output = ''
    for s in array:
        s = s.strip()
        #print(s)
        if(spelling_corrections.get(s) != None):
            if(len(spelling_corrections[s].strip()) != 0):
                outArray.append(spelling_corrections[s])
        else:
            outArray.append(s)
        #print('Val = ',output)

    #print('outArray = ', ", ".join(outArray))
    output = ", ".join(outArray)    
    #print('Output = ',output)    
    return output

In [78]:
spelling_corrections = {
    'Jping' : 'Koping'
    ,'Trollhan' : 'Trollhattan'
    ,'Montr' : 'Montreal'
    ,'Saverene' : 'Saverne'
    ,'Brookyln' : 'Brooklyn'
    ,'Halifaz' : 'Halifax'
    ,'Massachusettes' : 'Massachusetts'
    ,'Bundundu' : 'Bandundu'
    ,'Upstate NY' : ''
}
#print(spelling_corrections['Jpingg'])
print(spelling_corrections.get('Jpingg'))
location = 'Jping, Sweden'
print('correct_spelling = ',correct_spelling(spelling_corrections, location))
location = 'Trollhan, Sweden'
print('correct_spelling = ',correct_spelling(spelling_corrections, location))
location = 'Vancouver, Canada'
print('correct_spelling = ',correct_spelling(spelling_corrections, location))


print('http' in 'http://www.sneakerpimps.net')

None
correct_spelling =  Koping, Sweden
correct_spelling =  Trollhattan, Sweden
correct_spelling =  Vancouver, Canada
True


In [79]:
#Adding extra columns to song_df_wlocations for processing.
song_df_wlocations_copy = song_df_wlocations.copy()

song_df_wlocations_copy['process'] = np.NaN
song_df_wlocations_copy['process'] = song_df_wlocations_copy.process.astype(str)
song_df_wlocations_copy['corrected_location'] = np.NaN
song_df_wlocations_copy['corrected_location'] = song_df_wlocations_copy.corrected_location.astype(str)
song_df_wlocations_copy['county'] = np.NaN
song_df_wlocations_copy['county'] = song_df_wlocations_copy.county.astype(str)
song_df_wlocations_copy['city'] = np.NaN
song_df_wlocations_copy['city'] = song_df_wlocations_copy.city.astype(str)
song_df_wlocations_copy['state'] = np.NaN
song_df_wlocations_copy['state'] = song_df_wlocations_copy.state.astype(str)
song_df_wlocations_copy['country'] = np.NaN
song_df_wlocations_copy['country'] = song_df_wlocations_copy.country.astype(str)
song_df_wlocations_copy['country_code'] = np.NaN
song_df_wlocations_copy['country_code'] = song_df_wlocations_copy.country_code.astype(str)

#to be added after spelling correction
#song_df_wlocations_copy.to_csv('song_df_wlocations.csv', encoding='utf-8', index=False)

In [160]:
spelling_corrections = {
    'Jping' : 'Koping'
    ,'Trollhan' : 'Trollhattan'
    ,'Montr' : 'Montreal'
    ,'Saverene' : 'Saverne'
    ,'Brookyln' : 'Brooklyn'
    ,'Halifaz' : 'Halifax'
    ,'Massachusettes' : 'Massachusetts'
    ,'Bundundu' : 'Bandundu'
    ,'Hell' : ''
    ,'Upstate NY' : ''
    ,'East End de Londres' : 'East End of London'
}

for index, row in song_df_wlocations_copy.iterrows():
    #print('Location : ',len(row['artist_location'].split(',')),row['artist_location'])
    location = row['artist_location']
    str1 = ''
    #Handling square brackets []
    pos1 = re.search('\[.*', location)
    #print(str1, pos1)
    if(pos1 != None):
        pos1 = pos1.start() + 1
        str1 = location[pos1:-1]        
        array = location.split(',')
        str1 = str1 + ', ' + array[0]
        #print('1. ',str1)        
        
    if((', Can' in location) or (', Cana' in location)):
        #print(row.artist_location)
        array = location.split(',')
        if(len(array) == 3):
            str1 = array[0].strip()+', '+array[1].strip()+', Canada'
        if(len(array) == 2):
            str1 = array[0].strip()+', Canada'

        #print('2. ',str1)
        #df3_lo.at[index,'loc1']=str1        

    if(', Eng' in location):
        #print(row.artist_location)
        array = row['artist_location'].split(',')
        if(len(array) == 3):
            str1 = array[0].strip()+', '+array[1].strip()+', England'
        if(len(array) == 2):
            str1 = array[0].strip()+', England'
        
        #print('3. ',str1)
        #df3_lo.at[index,'loc1']=str1        

    if(', Lo' in location):
        #print(row.artist_location)
        array = row['artist_location'].split(',')
        if(len(array) == 3):
            str1 = array[0].strip()+', '+array[1].strip()+', London'
        if(len(array) == 2):
            str1 = array[0].strip()+', London'

    if(len(str1) == 0):
        str1 = location
                               
    if('http' in str1):
        str1 = ''
            
    if('/' in str1):
        str1 = str1.replace('/',', ')
        
    if(' - ' in str1):
        str1 = str1.replace(' - ',', ')        
        
    #Assumption is location to be on format city, state, country
    array = str1.split(',')
    if(len(array) == 3):
        str1 = array[0].strip()+', '+array[2].strip()

    #print('4. ',str1)
    #df3_lo.at[index,'loc1']=str1        

    #Remove non-alphanumeric characters
    str2 = re.sub(r'[^a-zA-Z, \.]+', '', str1)
    str1 = correct_spelling(spelling_corrections, str2)
    #print('Final = ', str1)
    if(len(location) != len(str1)):
        print('{}. Analyzing {} and result is {} '.format(index, location, str1))
    song_df_wlocations_copy.at[index,'corrected_location']=str1


1. Analyzing Los Angeles, California, USA and result is Los Angeles, USA 
27. Analyzing NY - Long Island and result is NY, Long Island 
143. Analyzing Vancouver, British Columbia, Cana and result is Vancouver, Canada 
147. Analyzing California - LA and result is California, LA 
152. Analyzing New York, NY [Queens] and result is Queens, New York 
301. Analyzing Braintree, Essex, England and result is Braintree, England 
302. Analyzing İngiltere and result is ngiltere 
331. Analyzing Bangor, County Down, Ireland and result is Bangor, Ireland 
387. Analyzing Paddington, London, England and result is Paddington, London 
427. Analyzing NY - New York City and result is NY, New York City 
443. Analyzing Kingston, Ontario, Canada and result is Kingston, Canada 
480. Analyzing Wellington, Aotearoa, New Zealand and result is Wellington, New Zealand 
481. Analyzing California - LA and result is California, LA 
496. Analyzing Lisboa, Portugal  and result is Lisboa, Portugal 
569. Analyzing Sheffie

In [176]:
print('All = {}'.format(song_df_wlocations_copy.shape))
print('corrected_location not null = {}'.format(song_df_wlocations_copy[song_df_wlocations_copy.corrected_location.notnull()].shape))

All = (2974, 18)
corrected_location not null = (2974, 18)


In [180]:
song_df_wlocations_copy.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
0,ARTH9041187FB43E1F,,"St. Joseph, MO",,Eminem,312,1,SOLXDDC12A6701FBFD,I'm Back,2000,,,"St. Joseph, MO",,,,,
1,ARQFUGM1187FB3E24E,,"Los Angeles, California, USA",,Black Label Society,258,1,SOHVHKM12A8C13F716,Counterfeit God,2000,,,"Los Angeles, USA",,,,,
24,ARJ0AL61187B9A3F27,,Georgia,,Katie Melua,193,1,SOFHFLK12AF72A4FB9,Mary Pickford (Used To Eat Roses),2008,,,Georgia,,,,,
27,ARJ2PMY1187FB5B563,,NY - Long Island,,Burning Spear,334,1,SOVTFUO12A6310D813,The Invasion (Aka Black Wa Da Da),0,,,"NY, Long Island",,,,,
31,ARWR6RK1187FB3AB52,,"Long Island, NY",,From Autumn To Ashes,147,1,SOHKXAC12A58A7F6E5,IV,2002,,,"Long Island, NY",,,,,


In [181]:
#song_df_wlocations_copy.to_csv('song_df_wlocations.csv', encoding='utf-8', index=False)

In [182]:
#song_df_wlocations_copy.to_csv('song_df_wlocations-backup.csv', encoding='utf-8', index=False)

### Query geopy and get data for artist_location by running GatherLocationDetails.ipynb

In [111]:
song_df_wlocations_copy = pd.read_csv("song_df_wlocations.csv")
print('song_df_wlocations.csv = ',song_df_wlocations_copy.shape)

song_df_wlocations.csv =  (2974, 18)


In [112]:
song_df_wlocations_copy[song_df_wlocations_copy.process != 'Y'].shape

(21, 18)

In [87]:
song_df_wlocations_copy[song_df_wlocations_copy.process != 'Y']

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
228,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,1001,251,1,SOYAAAT12AB018C947,Underground,0,,,"Leyton, East End of London",,,,,
293,AREQFUK11F94B58515,,RSC Slauson Boy : CA,,Nipsey Hussle,200,1,SOSPMPH12AB01849F5,One Take Freestyle,0,,,RSC Slauson Boy CA,,,,,
338,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,121,1,SOPMCEZ12A8C136DCD,A Dreamy Day Of Daydreaming Of You,2001,,,Sunlandia,,,,,
507,ARQEFHV1187FB42762,,"Penarth, Wales to Los Angeles",,Jem,282,1,SOSIWPS12A58A7ED14,Maybe I'm Amazed (Album Version),0,,,"Penarth, Wales to Los Angeles",,,,,
822,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,138,1,SODAZBR12A6D4F6FFA,Requiem for O.M.M. (Album Version),0,,,Sunlandia,,,,,
1015,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,201,1,SOZCITS12A8C13C65D,Oslo in the Summertime,2005,,,Sunlandia,,,,,
1277,ARIDCB61187B98952E,,por el mundo cantando.,,Arcangel La Maravilla,188,1,SOARXRJ12AF72AACE9,En el Callejon,0,,,por el mundo cantando.,,,,,
1353,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,Iron Maiden,410,1,SOJKLHZ12A6D4F90FC,These Colours Don't Run,2006,,,"Leyton, East End of London",,,,,
1365,ARO3SS31187FB3729C,,Antarctica,,GWAR,159,1,SOUBXTR12AB0186DF5,Slap U Around,1994,,,Antarctica,,,,,
1432,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,173,1,SODUMOP12A8C136DCB,Penelope,2001,,,Sunlandia,,,,,


In [119]:
#Observation found that is county, state, country is filled but not the corrected_location. So just filling that up
#song_df_wlocations_copy[song_df_wlocations_copy.artist_location.str.contains('http', na=False)]
song_df_wlocations_copy[song_df_wlocations_copy.corrected_location.isnull()]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
2779,ART9KTA1187B9A3956,34.22678,http://www.sneakerpimps.net,70.396426,Sneaker Pimps,298,1,SOQFCCN12AB0182A43,Walk The Rain,1997,,Y,,Chaparhar,,Nangarhar,Afghanistan,af


In [122]:
ix = int(song_df_wlocations_copy[song_df_wlocations_copy.corrected_location.isnull()].index.values)
print(ix)
song_df_wlocations_copy.at[ix,'corrected_location']='Jalalabad Afghanistan'
song_df_wlocations_copy.at[ix,'process']='N'

2779


In [123]:
temp = song_df_wlocations_copy[song_df_wlocations_copy.process != 'Y'].copy()

In [124]:
temp

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
2779,ART9KTA1187B9A3956,34.22678,http://www.sneakerpimps.net,70.396426,Sneaker Pimps,298,1,SOQFCCN12AB0182A43,Walk The Rain,1997,,N,Jalalabad Afghanistan,Chaparhar,,Nangarhar,Afghanistan,af
2953,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,1001,251,1,SOYAAAT12AB018C947,Underground,0,,N,"Leyton, London",,,,,
2954,AREQFUK11F94B58515,,RSC Slauson Boy : CA,,Nipsey Hussle,200,1,SOSPMPH12AB01849F5,One Take Freestyle,0,,N,CA,,,,,
2955,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,121,1,SOPMCEZ12A8C136DCD,A Dreamy Day Of Daydreaming Of You,2001,,N,"Athens, GA",,,,,
2956,ARQEFHV1187FB42762,,"Penarth, Wales to Los Angeles",,Jem,282,1,SOSIWPS12A58A7ED14,Maybe I'm Amazed (Album Version),0,,N,Los Angeles,,,,,
2957,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,138,1,SODAZBR12A6D4F6FFA,Requiem for O.M.M. (Album Version),0,,N,"Athens, GA",,,,,
2958,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,201,1,SOZCITS12A8C13C65D,Oslo in the Summertime,2005,,N,"Athens, GA",,,,,
2959,ARIDCB61187B98952E,,por el mundo cantando.,,Arcangel La Maravilla,188,1,SOARXRJ12AF72AACE9,En el Callejon,0,,N,por el mundo cantando.,,,,,
2960,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,Iron Maiden,410,1,SOJKLHZ12A6D4F90FC,These Colours Don't Run,2006,,N,"Leyton, London",,,,,
2961,ARO3SS31187FB3729C,,Antarctica,,GWAR,159,1,SOUBXTR12AB0186DF5,Slap U Around,1994,,N,Antarctica,,,,,


In [125]:
#Manually updating locations which was missed in earlier correction
temp.corrected_location = temp.corrected_location.replace('Sunlandia','Athens, GA')
temp.corrected_location = temp.corrected_location.replace('RSC Slauson Boy : CA','CA')
temp.corrected_location = temp.corrected_location.replace('Penarth, Wales to Los Angeles','Los Angeles')
temp.corrected_location = temp.corrected_location.replace('Leyton, East End de Londres','Leyton, London')
temp.corrected_location = temp.corrected_location.replace('Knowle West, Bristol, Avon, Engla','Knowle West, England')
temp.corrected_location = temp.corrected_location.replace('Hillside of Vallejo, CA','Vallejo, CA')
temp.corrected_location = temp.corrected_location.replace('Maynardsville, TN','Maynardville, TN')
temp.corrected_location = temp.corrected_location.replace('Baimorto, La Coruna, Spain','La Coruna, Spain')
temp.corrected_location = temp.corrected_location.replace('Guyana, West Indies','Guyana')
temp.process = 'N'

In [126]:
temp

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
2779,ART9KTA1187B9A3956,34.22678,http://www.sneakerpimps.net,70.396426,Sneaker Pimps,298,1,SOQFCCN12AB0182A43,Walk The Rain,1997,,N,Jalalabad Afghanistan,Chaparhar,,Nangarhar,Afghanistan,af
2953,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,1001,251,1,SOYAAAT12AB018C947,Underground,0,,N,"Leyton, London",,,,,
2954,AREQFUK11F94B58515,,RSC Slauson Boy : CA,,Nipsey Hussle,200,1,SOSPMPH12AB01849F5,One Take Freestyle,0,,N,CA,,,,,
2955,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,121,1,SOPMCEZ12A8C136DCD,A Dreamy Day Of Daydreaming Of You,2001,,N,"Athens, GA",,,,,
2956,ARQEFHV1187FB42762,,"Penarth, Wales to Los Angeles",,Jem,282,1,SOSIWPS12A58A7ED14,Maybe I'm Amazed (Album Version),0,,N,Los Angeles,,,,,
2957,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,138,1,SODAZBR12A6D4F6FFA,Requiem for O.M.M. (Album Version),0,,N,"Athens, GA",,,,,
2958,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,201,1,SOZCITS12A8C13C65D,Oslo in the Summertime,2005,,N,"Athens, GA",,,,,
2959,ARIDCB61187B98952E,,por el mundo cantando.,,Arcangel La Maravilla,188,1,SOARXRJ12AF72AACE9,En el Callejon,0,,N,por el mundo cantando.,,,,,
2960,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,Iron Maiden,410,1,SOJKLHZ12A6D4F90FC,These Colours Don't Run,2006,,N,"Leyton, London",,,,,
2961,ARO3SS31187FB3729C,,Antarctica,,GWAR,159,1,SOUBXTR12AB0186DF5,Slap U Around,1994,,N,Antarctica,,,,,


In [127]:
#Replace non-processed corrected_location rows from song_df_wlocations_copy2 with newly corrected location rows
print('Before temp = ',temp.shape)
print('Before song_df_wlocations_copy = ',song_df_wlocations_copy.shape)

song_df_wlocations_copy2 = song_df_wlocations_copy.copy()
song_df_wlocations_copy2 = song_df_wlocations_copy2[song_df_wlocations_copy2.process == 'Y'].copy()
song_df_wlocations_copy2 = song_df_wlocations_copy2.append(temp, sort=False)

print('After temp = ',temp.shape)
print('Afer song_df_wlocations_copy2 = ',song_df_wlocations_copy2.shape)

Before temp =  (22, 18)
Before song_df_wlocations_copy =  (2974, 18)
After temp =  (22, 18)
Afer song_df_wlocations_copy2 =  (2974, 18)


In [128]:
song_df_wlocations_copy2[song_df_wlocations_copy2.process == 'N']

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
2779,ART9KTA1187B9A3956,34.22678,http://www.sneakerpimps.net,70.396426,Sneaker Pimps,298,1,SOQFCCN12AB0182A43,Walk The Rain,1997,,N,Jalalabad Afghanistan,Chaparhar,,Nangarhar,Afghanistan,af
2953,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,1001,251,1,SOYAAAT12AB018C947,Underground,0,,N,"Leyton, London",,,,,
2954,AREQFUK11F94B58515,,RSC Slauson Boy : CA,,Nipsey Hussle,200,1,SOSPMPH12AB01849F5,One Take Freestyle,0,,N,CA,,,,,
2955,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,121,1,SOPMCEZ12A8C136DCD,A Dreamy Day Of Daydreaming Of You,2001,,N,"Athens, GA",,,,,
2956,ARQEFHV1187FB42762,,"Penarth, Wales to Los Angeles",,Jem,282,1,SOSIWPS12A58A7ED14,Maybe I'm Amazed (Album Version),0,,N,Los Angeles,,,,,
2957,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,138,1,SODAZBR12A6D4F6FFA,Requiem for O.M.M. (Album Version),0,,N,"Athens, GA",,,,,
2958,ARYMCJZ1187FB513DA,,Sunlandia,,Of Montreal,201,1,SOZCITS12A8C13C65D,Oslo in the Summertime,2005,,N,"Athens, GA",,,,,
2959,ARIDCB61187B98952E,,por el mundo cantando.,,Arcangel La Maravilla,188,1,SOARXRJ12AF72AACE9,En el Callejon,0,,N,por el mundo cantando.,,,,,
2960,ARHTAPO1187FB5AADE,,"Leyton, East End de Londres",,Iron Maiden,410,1,SOJKLHZ12A6D4F90FC,These Colours Don't Run,2006,,N,"Leyton, London",,,,,
2961,ARO3SS31187FB3729C,,Antarctica,,GWAR,159,1,SOUBXTR12AB0186DF5,Slap U Around,1994,,N,Antarctica,,,,,


In [129]:
#song_df_wlocations_copy2.to_csv('song_df_wlocations.csv', encoding='utf-8', index=False)

### Rerun geopy in GatherLocationDetails.ipynb to find locations of locations PROCESS='N'

In [72]:
song_df_wlocations_copy = pd.read_csv("song_df_wlocations.csv")
print('song_df_wlocations.csv = ',song_df_wlocations_copy.shape)

song_df_wlocations.csv =  (2974, 18)


In [65]:
song_df_wlocations_copy[song_df_wlocations_copy.process != 'Y']

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
2959,ARIDCB61187B98952E,,por el mundo cantando.,,Arcangel La Maravilla,188,1,SOARXRJ12AF72AACE9,En el Callejon,0,,N,por el mundo cantando.,,,,,
2961,ARO3SS31187FB3729C,,Antarctica,,GWAR,159,1,SOUBXTR12AB0186DF5,Slap U Around,1994,,N,Antarctica,,,,,
2963,ARLS8UK1187B9898D8,,Wiädikä,,Phenomden,229,1,SOXUVEB12A8C13B65F,Wänn Lärnemer,0,,N,Wiädikä,,,,,
2965,ARO3SS31187FB3729C,,Antarctica,,GWAR,261,1,SORGVTY12AB018349A,Surf of Syn,1995,,N,Antarctica,,,,,
2966,ARLS8UK1187B9898D8,,Wiädikä,,Phenomden,252,1,SONOTHB12A8C13EFFF,Jetz isch Ziit,0,,N,Wiädikä,,,,,
2967,ARO3SS31187FB3729C,,Antarctica,,GWAR,189,1,SOCAEYG12AB01849C0,Let's Blame The Lightman,1997,,N,Antarctica,,,,,


In [132]:
song_df_wlocations_copy[song_df_wlocations_copy.corrected_location.isnull()]

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code


In [80]:
song_df_wblank.shape

(6707, 11)

In [150]:
song_df_latlon = pd.read_csv("song_df_latlon.csv")
song_df_wlocations = pd.read_csv("song_df_wlocations.csv")
song_df_wblank = pd.read_csv("song_df_wblank.csv")

In [None]:
print('All = ',song_df_copy.shape)
print('Locations = ',song_df_wlocations.shape)
print('Blank = ',song_df_wblank.shape)
print('LatLon= ',song_df_latlon.shape)

In [154]:
#song_df_wblank doesn't have those extra columns
song_df_wblank['process'] = np.NaN
song_df_wblank['process'] = song_df_wblank.process.astype(str)
song_df_wblank['corrected_location'] = np.NaN
song_df_wblank['corrected_location'] = song_df_wblank.corrected_location.astype(str)
song_df_wblank['county'] = np.NaN
song_df_wblank['county'] = song_df_wblank.county.astype(str)
song_df_wblank['city'] = np.NaN
song_df_wblank['city'] = song_df_wblank.city.astype(str)
song_df_wblank['state'] = np.NaN
song_df_wblank['state'] = song_df_wblank.state.astype(str)
song_df_wblank['country'] = np.NaN
song_df_wblank['country'] = song_df_wblank.country.astype(str)
song_df_wblank['country_code'] = np.NaN
song_df_wblank['country_code'] = song_df_wblank.country_code.astype(str)

In [163]:
print('All = ',song_df_copy.shape)
print('Locations = ',song_df_wlocations.shape)
print('Blank = ',song_df_wblank.shape)
print('LatLon= ',song_df_latlon.shape)

All =  (14896, 10)
Locations =  (2974, 18)
Blank =  (6649, 18)
LatLon=  (5277, 18)


In [164]:
#Merging dataframes
song_df_clean = pd.DataFrame()
song_df_clean = song_df_clean.append(song_df_wlocations)
song_df_clean = song_df_clean.append(song_df_wblank)
song_df_clean = song_df_clean.append(song_df_latlon)
print(song_df_clean.shape)

(14900, 18)


In [165]:
#No duplicate records, but there is this 4 extra records(God damnit!).
print('All = ',song_df_clean.shape)
bool_series = song_df_clean.duplicated()
song_unique_df = song_df_clean[~bool_series]
print('Unique = ',song_unique_df.shape)

All =  (14900, 18)
Unique =  (14900, 18)


### Issue 6 : Convert log_df : userId column from float64 to Int

#### Define
Convert userId column from float64 to Int

### Code

In [47]:
log_df_clean = pd.read_csv("log_df_clean-original.csv")

In [48]:
log_df_clean.userId = log_df_clean.userId.replace(np.nan, 0)
log_df_clean['userId'] = log_df_clean.userId.astype(int)

### Test

In [49]:
log_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8056 entries, 0 to 8055
Data columns (total 18 columns):
artist           6820 non-null object
auth             8056 non-null object
firstName        7770 non-null object
gender           7770 non-null object
itemInSession    8056 non-null int64
lastName         7770 non-null object
length           6820 non-null float64
level            8056 non-null object
location         7770 non-null object
method           8056 non-null object
page             8056 non-null object
registration     8056 non-null int64
sessionId        8056 non-null int64
song             6820 non-null object
status           8056 non-null int64
ts               8056 non-null object
userAgent        7770 non-null object
userId           8056 non-null int64
dtypes: float64(1), int64(5), object(12)
memory usage: 1.1+ MB


In [50]:
log_df_clean[log_df_clean.userId.isnull()]

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId


In [45]:
#log_df_clean = log_df_clean.head(2).copy()

In [51]:
log_df_clean.to_csv('log_df_clean.csv', encoding='utf-8', index=False)

### Issue 7 : song_df removing column _merge
#### Define
song_df : Remove column _merge

#### Code

In [61]:
song_df_clean = pd.read_csv("song_df_clean-original.csv")

In [62]:
song_df_clean.drop(['_merge'], axis=1, inplace=True)

In [63]:
song_df_clean.to_csv('song_df_clean.csv', encoding='utf-8', index=False)

#### Test

In [64]:
song_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14900 entries, 0 to 14899
Data columns (total 17 columns):
artist_id             14900 non-null object
artist_latitude       8245 non-null float64
artist_location       8241 non-null object
artist_longitude      8245 non-null float64
artist_name           14900 non-null object
duration              14900 non-null int64
num_songs             14900 non-null int64
song_id               14900 non-null object
title                 14900 non-null object
year                  14900 non-null int64
process               8247 non-null object
corrected_location    2974 non-null object
county                7134 non-null object
city                  6037 non-null object
state                 8002 non-null object
country               8241 non-null object
country_code          8241 non-null object
dtypes: float64(2), int64(3), object(12)
memory usage: 1.9+ MB


### Writing clean datasets

Commenting the below writes so it does not get overwritten by mistake

In [8]:
#song_df_clean.to_csv('song_df_clean.csv', encoding='utf-8', index=False)
#log_df_copy.to_csv('log_df_clean.csv', encoding='utf-8', index=False)

In [54]:
log_df_clean = pd.read_csv("log_df_clean.csv")
song_df_clean = pd.read_csv("song_df_clean.csv")

In [55]:
print('Total rows in log_df_copy = {}'.format(log_df_clean.shape[0]))
print('Total rows in song_df_copy = {}'.format(song_df_clean.shape[0]))

Total rows in log_df_copy = 8056
Total rows in song_df_copy = 14900


In [53]:
log_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8056 entries, 0 to 8055
Data columns (total 18 columns):
artist           6820 non-null object
auth             8056 non-null object
firstName        7770 non-null object
gender           7770 non-null object
itemInSession    8056 non-null int64
lastName         7770 non-null object
length           6820 non-null float64
level            8056 non-null object
location         7770 non-null object
method           8056 non-null object
page             8056 non-null object
registration     8056 non-null int64
sessionId        8056 non-null int64
song             6820 non-null object
status           8056 non-null int64
ts               8056 non-null object
userAgent        7770 non-null object
userId           8056 non-null int64
dtypes: float64(1), int64(5), object(12)
memory usage: 1.1+ MB


In [52]:
log_df_clean.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,The Grass Roots,Logged In,Sara,F,72,Johnson,166.71302,paid,"Winston-Salem, NC",PUT,NextSong,1540809153796,411,Let's Live For Today,200,2018-11-14 00:03:22.796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
1,Stars,Logged In,Sara,F,73,Johnson,298.94485,paid,"Winston-Salem, NC",PUT,NextSong,1540809153796,411,Time Can Never Kill The True Heart,200,2018-11-14 00:06:08.796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
2,Eddie Palmieri,Logged In,Sara,F,74,Johnson,391.83628,paid,"Winston-Salem, NC",PUT,NextSong,1540809153796,411,Nada De Ti,200,2018-11-14 00:11:06.796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
3,The Bravery,Logged In,Sara,F,75,Johnson,168.14975,paid,"Winston-Salem, NC",PUT,NextSong,1540809153796,411,Give In,200,2018-11-14 00:17:37.796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
4,K.U.K.L,Logged In,Sara,F,76,Johnson,181.28934,paid,"Winston-Salem, NC",PUT,NextSong,1540809153796,411,Anna,200,2018-11-14 00:20:25.796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95


In [13]:
song_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14900 entries, 0 to 14899
Data columns (total 18 columns):
artist_id             14900 non-null object
artist_latitude       8245 non-null float64
artist_location       8241 non-null object
artist_longitude      8245 non-null float64
artist_name           14900 non-null object
duration              14900 non-null int64
num_songs             14900 non-null int64
song_id               14900 non-null object
title                 14900 non-null object
year                  14900 non-null int64
_merge                6707 non-null object
process               8247 non-null object
corrected_location    2974 non-null object
county                7134 non-null object
city                  6037 non-null object
state                 8002 non-null object
country               8241 non-null object
country_code          8241 non-null object
dtypes: float64(2), int64(3), object(13)
memory usage: 2.0+ MB


In [14]:
song_df_clean.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year,_merge,process,corrected_location,county,city,state,country,country_code
0,ARTH9041187FB43E1F,39.76861,"St. Joseph, MO",-94.846704,Eminem,312,1,SOLXDDC12A6701FBFD,I'm Back,2000,,Y,"St. Joseph, MO",Buchanan County,St. Joseph,Missouri,USA,us
1,ARQFUGM1187FB3E24E,34.053696,"Los Angeles, California, USA",-118.242921,Black Label Society,258,1,SOHVHKM12A8C13F716,Counterfeit God,2000,,Y,"Los Angeles, USA",Los Angeles County,Los Angeles,California,United States of America,us
2,ARJ0AL61187B9A3F27,41.710404,Georgia,44.031081,Katie Melua,193,1,SOFHFLK12AF72A4FB9,Mary Pickford (Used To Eat Roses),2008,,Y,Georgia,Tsalka Municipality,,Lower Kartli,Georgia,ge
3,ARJ2PMY1187FB5B563,40.851723,NY - Long Island,-73.099233,Burning Spear,334,1,SOVTFUO12A6310D813,The Invasion (Aka Black Wa Da Da),0,,Y,"NY, Long Island",Suffolk County,,New York,USA,us
4,ARWR6RK1187FB3AB52,40.851723,"Long Island, NY",-73.099233,From Autumn To Ashes,147,1,SOHKXAC12A58A7F6E5,IV,2002,,Y,"Long Island, NY",Suffolk County,,New York,USA,us
