<h1> Importing dependencies </h1>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from keras.layers import *
from keras.models import Model, Sequential, load_model
from keras.optimizers import *
from keras.callbacks import ModelCheckpoint
from keras.activations import *
from keras.layers.advanced_activations import *
from keras import regularizers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Load the training and test splits from the local data/ directory.
train = pd.read_csv('data/Data_Train.csv')
test = pd.read_csv('data/Data_Test.csv')

# Preview the first rows of the training set.
train.head()

Unnamed: 0,Unique_ID,Name,Genre,Country,Song_Name,Timestamp,Views,Comments,Likes,Popularity,Followers
0,413890,Hardstyle,danceedm,AU,N-Vitral presents BOMBSQUAD - Poison Spitter (...,2018-03-30 15:24:45.000000,14017,4,499,97,119563
1,249453,Dj Aladdin,danceedm,AU,Dj Aladdin - Old School Hip Hop Quick Mix,2016-06-20 05:58:52.000000,1918,17,49,17,2141
2,681116,Maxximize On Air,danceedm,AU,Maxximize On Air - Mixed by Blasterjaxx - Epis...,2015-05-08 17:45:59.000000,9668,11,312,91,22248
3,387253,GR6 EXPLODE,rbsoul,AU,MC Yago - Tenho Compromisso (DJ R7),2017-06-08 23:50:03.000000,113036,2,2400,76,393655
4,1428029,Tritonal,danceedm,AU,Escape (feat. Steph Jones),2016-09-17 20:50:19.000000,110024,81,3031,699,201030


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78458 entries, 0 to 78457
Data columns (total 11 columns):
Unique_ID     78458 non-null int64
Name          78458 non-null object
Genre         78458 non-null object
Country       78458 non-null object
Song_Name     78457 non-null object
Timestamp     78458 non-null object
Views         78458 non-null int64
Comments      78458 non-null int64
Likes         78458 non-null object
Popularity    78458 non-null object
Followers     78458 non-null int64
dtypes: int64(4), object(7)
memory usage: 6.6+ MB


* Timestamp is stored as object dtype and should be converted to a datetime dtype.
* Likes and Popularity should be numeric. They are object dtype in the given dataset, which requires additional
  investigation.

In [4]:
train.Country.value_counts()

AU    78458
Name: Country, dtype: int64

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19615 entries, 0 to 19614
Data columns (total 10 columns):
Unique_ID     19615 non-null int64
Name          19615 non-null object
Genre         19615 non-null object
Country       19615 non-null object
Song_Name     19615 non-null object
Timestamp     19615 non-null object
Comments      19615 non-null int64
Likes         19615 non-null object
Popularity    19615 non-null object
Followers     19615 non-null int64
dtypes: int64(3), object(7)
memory usage: 1.5+ MB


In [6]:
test.Country.value_counts()

AU    19615
Name: Country, dtype: int64

* The train and test sets have identical dtypes (the test set only lacks the Views target), so both sets are combined for ease of manipulation.
* All the songs are from Australia (AU), so the Country feature carries no information for predicting the target.

In [7]:
# Pull the regression target out of the training frame (removes the column in place).
target = train.pop('Views')
print(len(train), len(test))

# Stack train on top of test so both go through the same feature pipeline,
# and discard Country, which holds a single value ('AU') in both sets.
df = pd.concat([train, test]).drop(columns=['Country'])
df.shape

78458 19615


(98073, 9)

In [8]:
df.columns

Index(['Unique_ID', 'Name', 'Genre', 'Song_Name', 'Timestamp', 'Comments',
       'Likes', 'Popularity', 'Followers'],
      dtype='object')

In [9]:
features = ['Name', 'Genre', 'Song_Name', 'Timestamp', 'Comments', 'Likes', 'Popularity', 'Followers']

In [10]:
# Columns that were read as strings although they hold counts.
obj_col = ['Likes', 'Popularity']
for col in obj_col:
    # Rows whose value contains at least one ASCII letter.
    has_letter = df[col].str.contains(pat='[a-zA-Z]', regex=True)
    # Last character of those values once surrounding whitespace is removed.
    trailing_chars = df.loc[has_letter, col].str.strip().str[-1]
    print(f'String values in the {col} column are:', trailing_chars.unique())

String values in the Likes column are: ['K' 'M']
String values in the Popularity column are: ['K']


<h3>The Likes and Popularity columns are object dtype because some values use the suffixes 'K' and 'M' to indicate 10**3 and 10**6.</h3>

In [11]:
def col_cleaner(x):
    """Convert a count written as text (e.g. '1,234', '1.5K', '2M') to a float.

    Thousands separators (',') are removed and a trailing 'K' or 'M'
    (case-insensitive) scales the number by 10**3 or 10**6. Both can occur
    in the same value, e.g. '1,200K' -> 1200000.0.

    Parameters
    ----------
    x : str or number
        Raw cell value. Non-string input is passed straight to ``float`` so
        that running the cleaner again on an already-converted column is harmless.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if not isinstance(x, str):
        return float(x)

    # Strip whitespace first so the suffix really is the last character,
    # and drop separators independently of any suffix handling.
    text = x.strip().replace(',', '')

    multipliers = {'K': 10**3, 'M': 10**6}
    suffix = text[-1:].upper()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)

In [12]:
# Convert every string-typed count column to floats, one value at a time.
for col in obj_col:
    df[col] = df[col].map(col_cleaner)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98073 entries, 0 to 19614
Data columns (total 9 columns):
Unique_ID     98073 non-null int64
Name          98073 non-null object
Genre         98073 non-null object
Song_Name     98072 non-null object
Timestamp     98073 non-null object
Comments      98073 non-null int64
Likes         98073 non-null float64
Popularity    98073 non-null float64
Followers     98073 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 7.5+ MB


In [14]:
# Parse the Timestamp strings into datetime64 values so the .dt accessor can be used.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [15]:
# Replace missing song names with a dummy placeholder.
# The null mask is captured before filling so the same rows can be shown
# afterwards, instead of relying on a hard-coded row position.
missing_song_name = df.Song_Name.isnull()
print(df[missing_song_name])
df.Song_Name = df.Song_Name.fillna('xxxxx')
df[missing_song_name]

       Unique_ID      Name       Genre Song_Name           Timestamp  \
31398    1240617  San Holo  electronic       NaN 2015-01-17 23:57:50   

       Comments  Likes  Popularity  Followers  
31398       540  705.0       178.0     440088  


Unnamed: 0,Unique_ID,Name,Genre,Song_Name,Timestamp,Comments,Likes,Popularity,Followers
31398,1240617,San Holo,electronic,xxxxx,2015-01-17 23:57:50,540,705.0,178.0,440088


In [16]:
col_name = ['Name', 'Genre']
# Overwrite each text column with integer codes; sort=True assigns the codes
# in sorted order of the unique values.
for text_col in col_name:
    df[text_col] = pd.factorize(df[text_col], sort=True)[0]

In [17]:
# Summary statistics to compute for each Name group.
full_stats = ['mean', 'min', 'max', 'sum', 'median']
agg_func = {column: list(full_stats) for column in ('Comments', 'Likes', 'Popularity')}
agg_func['Followers'] = ['mean', 'min', 'max', 'sum']  # no median for Followers

agg_df = df.groupby(by='Name').agg(agg_func)
# Flatten the (column, statistic) pairs into names such as 'Likes_mean'.
agg_df.columns = [f'{source}_{stat}' for source, stat in agg_df.columns]
# Attach the group-level statistics to every row of the matching Name.
df = df.merge(agg_df, how='left', on='Name')

In [18]:
# Calendar features derived from Timestamp: new column name -> .dt attribute.
# NOTE(review): 'Date' takes .dt.date (a full calendar date), not the day of
# the month; confirm this is the intended feature.
timestamp_parts = df.Timestamp.dt
date_features = {
    'Year': 'year',
    'Month': 'month',
    'Date': 'date',
    'Hour': 'hour',
    'Minute': 'minute',
    'Week': 'weekofyear',
    'Dayofweek': 'dayofweek',
}
for feature_name, accessor_attr in date_features.items():
    df[feature_name] = getattr(timestamp_parts, accessor_attr)

In [19]:
from datetime import datetime

# Age of each track in whole days, measured from the moment this cell runs.
# NOTE(review): the values therefore change between runs; pin a fixed
# reference date if reproducible features are needed.
df['Days_Since_Release'] = (datetime.now() - df.Timestamp).dt.days

# Weekend flag: dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are
# Saturday/Sunday. Previously this column was just a copy of the raw
# dayofweek value, duplicating 'Dayofweek'.
df['Weekend'] = (df.Timestamp.dt.dayofweek >= 5).astype(int)

In [21]:
# Ratio features between the engagement counts.
# Every denominator gets the same small offset so a zero count cannot
# produce inf/NaN; previously the Followers denominators were unguarded.
eps = 1e-5

df['Likes_per_Follower'] = df.Likes / (df.Followers + eps)
df['Likes_per_Comment'] = df.Likes / (df.Comments + eps)
df['Likes_per_Popularity'] = df.Likes / (df.Popularity + eps)

df['Comments_per_Follower'] = df.Comments / (df.Followers + eps)
df['Comments_per_Popularity'] = df.Comments / (df.Popularity + eps)

df['Popularity_per_Follower'] = df.Popularity / (df.Followers + eps)
# Total interaction count per track.
df['Total_Engagements'] = df.Comments + df.Likes

In [143]:
#from collections import Counter
#Counter(' '.join(df['Song_Name'].str.replace("[^a-zA-Z0-9]"," ")).split()).most_common(50)

[('Remix', 12051),
 ('The', 8499),
 ('feat', 8426),
 ('DJ', 6678),
 ('MC', 6230),
 ('Mix', 5317),
 ('Premiere', 3289),
 ('I', 2761),
 ('A', 2679),
 ('2018', 2648),
 ('ft', 2621),
 ('You', 2542),
 ('Radio', 2361),
 ('Me', 2289),
 ('x', 2283),
 ('OUT', 2263),
 ('NOW', 2245),
 ('Prod', 2240),
 ('Original', 2227),
 ('s', 2106),
 ('Ft', 1934),
 ('Of', 1920),
 ('by', 1833),
 ('In', 1796),
 ('It', 1679),
 ('e', 1658),
 ('Beat', 1641),
 ('Free', 1527),
 ('FREE', 1522),
 ('2017', 1513),
 ('Love', 1504),
 ('of', 1466),
 ('Preview', 1433),
 ('Official', 1421),
 ('t', 1411),
 ('On', 1360),
 ('X', 1355),
 ('By', 1351),
 ('S', 1305),
 ('Feat', 1305),
 ('De', 1294),
 ('Version', 1291),
 ('My', 1286),
 ('2019', 1263),
 ('the', 1262),
 ('o', 1260),
 ('Live', 1244),
 ('La', 1227),
 ('To', 1194),
 ('No', 1178)]

In [177]:
df.groupby('Name')['Days_Since_Release'].shift(0)

0         683
1        1332
2        1740
3         978
4        1242
         ... 
98068     896
98069    1481
98070     838
98071     895
98072    1657
Name: Days_Since_Release, Length: 98073, dtype: int64

In [179]:
df.loc[df.Name == 0, 'Days_Since_Release']

206      1469
256       991
324      1823
2305     1966
2964      522
         ... 
95823    2029
96727    1823
96785    1665
96904    1930
96910     883
Name: Days_Since_Release, Length: 221, dtype: int64

In [192]:
# Order rows by Name code, then chronologically within each Name.
df = df.sort_values(by=['Name', 'Timestamp'])
# For Name code 1: difference in Days_Since_Release between each row and the
# row after it (the last row has no successor and yields NaN).
name1_days = df.loc[df.Name == 1, 'Days_Since_Release']
name1_days.diff(periods=-1)

24578     41.0
23814     56.0
7731      75.0
74940     71.0
80259      2.0
79497    100.0
83987     16.0
44893     57.0
90660      NaN
Name: Days_Since_Release, dtype: float64

In [204]:
# Exploratory table for Name code 1, one row each for: the current
# Days_Since_Release, the following row's value, and their difference.
# Slots with no following row are filled with 0.
name1_days = df.loc[df.Name == 1, 'Days_Since_Release']
following_days = name1_days.shift(-1)
nextal = pd.DataFrame([name1_days, following_days, name1_days - following_days]).fillna(0)
nextal

Unnamed: 0,24578,23814,7731,74940,80259,79497,83987,44893,90660
Days_Since_Release,779.0,738.0,682.0,607.0,536.0,534.0,434.0,418.0,361.0
Days_Since_Release,738.0,682.0,607.0,536.0,534.0,434.0,418.0,361.0,0.0
Days_Since_Release,41.0,56.0,75.0,71.0,2.0,100.0,16.0,57.0,0.0


In [207]:
# Exploratory table for Name code 1, one row each for: the current
# Days_Since_Release, the preceding row's value, and (preceding - current).
# Slots with no preceding row are filled with 0, then every value is negated.
name1_days = df.loc[df.Name == 1, 'Days_Since_Release']
preceding_days = name1_days.shift(1)
preval = pd.DataFrame([name1_days, preceding_days, preceding_days - name1_days]).fillna(0) * -1
preval

Unnamed: 0,24578,23814,7731,74940,80259,79497,83987,44893,90660
Days_Since_Release,-779.0,-738.0,-682.0,-607.0,-536.0,-534.0,-434.0,-418.0,-361.0
Days_Since_Release,-0.0,-779.0,-738.0,-682.0,-607.0,-536.0,-534.0,-434.0,-418.0
Days_Since_Release,-0.0,-41.0,-56.0,-75.0,-71.0,-2.0,-100.0,-16.0,-57.0
