In [61]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [87]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score


In [63]:
train = pd.read_csv('../input/song-popularity-prediction/train.csv')
test = pd.read_csv('../input/song-popularity-prediction/test.csv')

train.head()

In [64]:
print('Shape of dataframe')
print('===================')
print(train.shape)
print('')
print("No of missing values")
print('=====================')
print(train.isnull().sum())

In [65]:
train.info()

# Handling missing values

In [66]:
# almost 10% of data missing from features having missing values
train.isnull().sum()/len(train)

In [67]:
# filling missing values using interpolation

train = train.interpolate()
train.isnull().sum()

In [68]:
train[train["liveness"].isnull()]

In [69]:
train = train.drop(train[train["liveness"].isnull()].index, axis=0)

In [70]:
train.isnull().sum()

In [71]:
sns.distplot(train["tempo"])

In [72]:
sns.boxplot(train["tempo"])

In [73]:
train.describe()

In [74]:
# finding outliers with Guassian Distribution

def guassian_function(col):
    q1 = train[col].mean() - 3*train[col].std()
    q3 = train[col].mean() + 3*train[col].std()
    return q1, q3, train[col].mean()


guassian_function("song_duration_ms")

In [75]:
# finding outliers with IQR (If it follows Normal/Guassian Diistribution)

def iqr_function(col):
    Q1 = train[col].quantile(0.25)
    Q3 = train[col].quantile(0.75)
    IQR = Q3 - Q1 # inter quartile range


    lower_band = Q1 - 1.5*IQR
    upper_band = Q3 + 1.5*IQR
    
    return lower_band, upper_band

iqr_function("song_duration_ms")

If the Distribution is skewed we need to find extreme limits, so we will multiply IQR with 3 instead of 1.5. So Lowe and Upper band wiil be,

> lower_band = Q1 - 3*IQR
> 
> upper_band = Q3 + 3*IQR



# Handling Imbalanced Data

In [76]:
# it is clear that the data is imbalanced 

train["song_popularity"].value_counts()

In [77]:
train["song_popularity"].value_counts().plot( kind="bar")

In [78]:
train_x = train.iloc[:, :-1] # independent varibale
train_y = train.iloc[:, -1]  # dependent varibale

In [79]:
# oversampling using SMOTE

oversample = SMOTE()

X, Y = oversample.fit_resample(train_x, train_y)

In [80]:
# now the dataset is balanced

print(Y.value_counts())

# Making machine learning model


In [81]:
# split the data

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=24)  

In [99]:
%%time

log_reg = LogisticRegression()
forest = RandomForestClassifier()
xgboost = XGBClassifier()


models = {"Logistic Regression":log_reg, "Random Forest Classifier":forest, "XGBClassifier":xgboost}

for model_name, model in models.items():
    
    print(model_name)
    print("=============")
    
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    auc_score = roc_auc_score(y_test, y_pred)
    
    print("AUC score of {} is {}".format(model_name, auc_score))
    print(" ")


Logistic Regression
=============
AUC score of Logistic Regression is 0.5016310024510092
 
Random Forest Classifier
=============
AUC score of Random Forest Classifier is 0.6656604312687145
 
XGBClassifier
=============

AUC score of XGBClassifier is 0.6762481985608289


## Score of Random forest and XG Boost is high ---> gonna take this and tune the parameter.

## Random forest - (works better with data having outliers, our data having so much outliers)

In [91]:
sub.head()

# Submission

In [93]:
sub = pd.read_csv('../input/song-popularity-prediction/sample_submission.csv')
sub.shape

In [101]:
prediciton = xgboost.predict(test)
sub["song_popularity"] = prediciton
#submission = test[['id', 'song_popularity']]
sub.to_csv('submission.csv', index=False)