<h1> Importing dependencies </h1>

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from keras.layers import *
from keras.models import Model, Sequential, load_model
from keras.optimizers import *
from keras.callbacks import ModelCheckpoint
from keras.activations import *
from keras.layers.advanced_activations import *
from keras import regularizers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
# Load the training and test CSVs from the relative data/ directory.
train = pd.read_csv('data/Data_Train.csv')
test = pd.read_csv('data/Data_Test.csv')

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Unique_ID,Name,Genre,Country,Song_Name,Timestamp,Views,Comments,Likes,Popularity,Followers
0,413890,Hardstyle,danceedm,AU,N-Vitral presents BOMBSQUAD - Poison Spitter (...,2018-03-30 15:24:45.000000,14017,4,499,97,119563
1,249453,Dj Aladdin,danceedm,AU,Dj Aladdin - Old School Hip Hop Quick Mix,2016-06-20 05:58:52.000000,1918,17,49,17,2141
2,681116,Maxximize On Air,danceedm,AU,Maxximize On Air - Mixed by Blasterjaxx - Epis...,2015-05-08 17:45:59.000000,9668,11,312,91,22248
3,387253,GR6 EXPLODE,rbsoul,AU,MC Yago - Tenho Compromisso (DJ R7),2017-06-08 23:50:03.000000,113036,2,2400,76,393655
4,1428029,Tritonal,danceedm,AU,Escape (feat. Steph Jones),2016-09-17 20:50:19.000000,110024,81,3031,699,201030


In [4]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78458 entries, 0 to 78457
Data columns (total 11 columns):
Unique_ID     78458 non-null int64
Name          78458 non-null object
Genre         78458 non-null object
Country       78458 non-null object
Song_Name     78457 non-null object
Timestamp     78458 non-null object
Views         78458 non-null int64
Comments      78458 non-null int64
Likes         78458 non-null object
Popularity    78458 non-null object
Followers     78458 non-null int64
dtypes: int64(4), object(7)
memory usage: 6.6+ MB


* Timestamp is stored as object (string) dtype and should be converted to a datetime dtype.
* Likes and Popularity should be numeric, but they are object dtype in the given dataset; this requires
  additional investigation.

In [5]:
# Count how often each Country value occurs in the training frame.
train.Country.value_counts()

AU    78458
Name: Country, dtype: int64

In [6]:
# Summarise the test frame: column dtypes, non-null counts and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19615 entries, 0 to 19614
Data columns (total 10 columns):
Unique_ID     19615 non-null int64
Name          19615 non-null object
Genre         19615 non-null object
Country       19615 non-null object
Song_Name     19615 non-null object
Timestamp     19615 non-null object
Comments      19615 non-null int64
Likes         19615 non-null object
Popularity    19615 non-null object
Followers     19615 non-null int64
dtypes: int64(3), object(7)
memory usage: 1.5+ MB


In [7]:
# Count how often each Country value occurs in the test frame.
test.Country.value_counts()

AU    19615
Name: Country, dtype: int64

* The train and test sets have identical dtypes for the columns they share, so both sets are combined for ease of manipulation.
* All the songs are from Australia (AU), so the Country feature carries no information for predicting the target.

In [8]:
# Split the target column off the training frame.
# The guard keeps this cell re-runnable: once 'Views' has been removed, running
# the cell again would otherwise raise KeyError on train['Views'].
if 'Views' in train.columns:
    target = train['Views']
    train = train.drop(['Views'], axis=1)
print(len(train), len(test))

# Stack train on top of test so both get the same cleaning steps.
# Each frame keeps its own index labels, so labels restart at 0 for the test rows.
# 'Country' is dropped because it holds a single value ('AU') in both frames.
df = pd.concat([train, test]).drop(['Country'], axis=1)
df.shape

78458 19615


(98073, 9)

In [9]:
# List the columns of the combined frame.
df.columns

Index(['Unique_ID', 'Name', 'Genre', 'Song_Name', 'Timestamp', 'Comments',
       'Likes', 'Popularity', 'Followers'],
      dtype='object')

In [10]:
# Candidate feature columns: every column of df except the Unique_ID identifier.
features = ['Name', 'Genre', 'Song_Name', 'Timestamp', 'Comments', 'Likes', 'Popularity', 'Followers']

In [11]:
# Columns that were read as object dtype although they hold counts.
obj_col = ['Likes', 'Popularity']

# For each such column, report the distinct final characters of the values
# that contain at least one ASCII letter.
for i in obj_col:
    has_letters = df[i].str.contains(pat='[a-zA-Z]', regex=True)
    trailing_chars = df.loc[has_letters, i].str.strip().str[-1].unique()
    print(f'String values in the {i} column are:', trailing_chars)

String values in the Likes column are: ['K' 'M']
String values in the Popularity column are: ['K']


<h3>The Likes and Popularity columns are object dtype because some of their values carry a suffix such as 'K' (10**3) or 'M' (10**6).</h3>

In [12]:
def col_cleaner(x):
    """Convert an abbreviated count such as '1,234', '1.5K' or '2M' to a float.

    Parameters
    ----------
    x : str or number
        Raw cell value. Strings may contain ',' thousands separators and/or a
        'K' (10**3) or 'M' (10**6) suffix. Values that are not strings are
        assumed to be numeric already.

    Returns
    -------
    float
        The numeric value with any suffix expanded.

    Raises
    ------
    ValueError
        If the text left after removing separators and suffix is not a number.
    """
    # Already-numeric values pass straight through, so applying the cleaner a
    # second time (e.g. when the conversion cell is re-run) does not fail.
    if not isinstance(x, str):
        return float(x)

    # Remove separators first: a comma and a suffix can occur in the same
    # value ('1,234K'), so the two must not be mutually exclusive.
    text = x.strip().replace(',', '')

    if 'K' in text:
        return float(text.replace('K', '')) * 10**3
    if 'M' in text:
        return float(text.replace('M', '')) * 10**6
    return float(text)

In [13]:
# Run every value of the object-typed count columns through col_cleaner,
# replacing each column with the converted result.
for column in obj_col:
    df[column] = df[column].map(col_cleaner)

In [14]:
# Re-check dtypes of the combined frame after cleaning Likes and Popularity.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98073 entries, 0 to 19614
Data columns (total 9 columns):
Unique_ID     98073 non-null int64
Name          98073 non-null object
Genre         98073 non-null object
Song_Name     98072 non-null object
Timestamp     98073 non-null object
Comments      98073 non-null int64
Likes         98073 non-null float64
Popularity    98073 non-null float64
Followers     98073 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 7.5+ MB


In [27]:
# Parse the Timestamp strings into pandas datetime values.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [36]:
# Show the rows whose Song_Name is missing, then fill the gap with a placeholder.
missing_song = df['Song_Name'].isnull()
print(df[missing_song])
df['Song_Name'] = df['Song_Name'].fillna('xxxxx')
# Positional lookup of row 31398 - presumably the row that had no song name; verify.
df.iloc[[31398]]

Empty DataFrame
Columns: [Unique_ID, Name, Genre, Song_Name, Timestamp, Comments, Likes, Popularity, Followers]
Index: []


Unnamed: 0,Unique_ID,Name,Genre,Song_Name,Timestamp,Comments,Likes,Popularity,Followers
31398,1240617,San Holo,electronic,xxxxx,2015-01-17 23:57:50,540,705.0,178.0,440088
