In [39]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import pandas as pd 
from config import db_password
import psycopg2
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [40]:
# Create connection to server 
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/songs_data"
#Create Engine
engine = create_engine(db_string)
# Import Table from Database 
songs_df = pd.read_sql_table("songs_normalize" , con=engine)

songs_df.head()


Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,The Chainsmokers,#SELFIE - Original Mix,183750,False,2014,0,0.789,0.915,0,-3.263,1,0.248,0.0135,9e-06,0.0818,0.66,127.955,"pop, Dance/Electronic"
1,will.i.am,#thatPOWER,279506,False,2013,68,0.797,0.608,6,-6.096,0,0.0584,0.00112,7.7e-05,0.0748,0.402,127.999,"hip hop, pop"
2,Eminem,'Till I Collapse,297786,True,2002,85,0.548,0.847,1,-3.237,1,0.186,0.0622,0.0,0.0816,0.1,171.447,hip hop
3,Sean Paul,(When You Gonna) Give It Up to Me (feat. Keysh...,243880,False,2006,58,0.711,0.761,8,-3.04,1,0.225,0.067,0.0,0.041,0.718,95.824,"hip hop, pop"
4,Taylor Swift,...Ready For It?,208186,False,2017,73,0.613,0.764,2,-6.509,1,0.136,0.0527,0.0,0.197,0.417,160.015,pop


In [41]:
songs_df.describe()

Unnamed: 0,duration_ms,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0
mean,228796.748415,2009.555339,59.759629,0.667644,0.719591,5.363237,-5.527312,0.552413,0.102669,0.126739,0.014763,0.181991,0.551453,120.302128
std,38935.47098,5.8904,21.328082,0.140264,0.152333,3.616115,1.927822,0.497367,0.095149,0.170538,0.086126,0.139918,0.220134,26.852023
min,113000.0,1998.0,0.0,0.129,0.0549,0.0,-20.514,0.0,0.0232,1.9e-05,0.0,0.0215,0.0381,60.019
25%,204020.0,2005.0,56.0,0.581,0.622,2.0,-6.4975,0.0,0.0397,0.0132,0.0,0.0889,0.387,99.079
50%,223293.0,2010.0,65.0,0.675,0.735,6.0,-5.29,1.0,0.0602,0.0558,0.0,0.126,0.557,120.046
75%,248566.0,2015.0,73.0,0.765,0.837,8.0,-4.1945,1.0,0.127,0.1735,7.1e-05,0.243,0.727,134.963
max,484146.0,2020.0,89.0,0.975,0.999,11.0,-0.276,1.0,0.576,0.976,0.985,0.853,0.973,210.851


In [42]:
# Checking for Null Values Again
songs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist            2051 non-null   object 
 1   song              2051 non-null   object 
 2   duration_ms       2051 non-null   int64  
 3   explicit          2051 non-null   bool   
 4   year              2051 non-null   int64  
 5   popularity        2051 non-null   int64  
 6   danceability      2051 non-null   float64
 7   energy            2051 non-null   float64
 8   key               2051 non-null   int64  
 9   loudness          2051 non-null   float64
 10  mode              2051 non-null   int64  
 11  speechiness       2051 non-null   float64
 12  acousticness      2051 non-null   float64
 13  instrumentalness  2051 non-null   float64
 14  liveness          2051 non-null   float64
 15  valence           2051 non-null   float64
 16  tempo             2051 non-null   float64


In [43]:
# Data Tyoes
songs_df.dtypes

artist               object
song                 object
duration_ms           int64
explicit               bool
year                  int64
popularity            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object

In [44]:
# Converting "explicit" column to numerical 
songs_df["explicit"] = songs_df["explicit"].replace({True: 1, False: 0})

In [45]:
songs_df.dtypes

artist               object
song                 object
duration_ms           int64
explicit              int64
year                  int64
popularity            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object

In [46]:
# Using 'qcut' to detemine 'Popular =1 and Not popular =0'  
songs_df["popularity"]=pd.qcut(songs_df["popularity"],q=2 ,labels=[0,1] )
songs_df.head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,The Chainsmokers,#SELFIE - Original Mix,183750,0,2014,0,0.789,0.915,0,-3.263,1,0.248,0.0135,9e-06,0.0818,0.66,127.955,"pop, Dance/Electronic"
1,will.i.am,#thatPOWER,279506,0,2013,1,0.797,0.608,6,-6.096,0,0.0584,0.00112,7.7e-05,0.0748,0.402,127.999,"hip hop, pop"
2,Eminem,'Till I Collapse,297786,1,2002,1,0.548,0.847,1,-3.237,1,0.186,0.0622,0.0,0.0816,0.1,171.447,hip hop
3,Sean Paul,(When You Gonna) Give It Up to Me (feat. Keysh...,243880,0,2006,0,0.711,0.761,8,-3.04,1,0.225,0.067,0.0,0.041,0.718,95.824,"hip hop, pop"
4,Taylor Swift,...Ready For It?,208186,0,2017,1,0.613,0.764,2,-6.509,1,0.136,0.0527,0.0,0.197,0.417,160.015,pop


In [47]:
# Generate our categorical variable lists
application_cat = songs_df.dtypes[songs_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
songs_df[application_cat].nunique()

artist     827
song      1859
genre       59
dtype: int64

In [48]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(songs_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(application_cat)
encode_df.head()



Unnamed: 0,artist_*NSYNC,artist_112,artist_2 Chainz,artist_21 Savage,artist_2Pac,artist_3 Doors Down,artist_3LW,artist_3OH!3,artist_5 Seconds of Summer,artist_50 Cent,...,"genre_rock, blues","genre_rock, blues, latin","genre_rock, classical","genre_rock, easy listening","genre_rock, metal","genre_rock, pop","genre_rock, pop, Dance/Electronic","genre_rock, pop, metal","genre_rock, pop, metal, Dance/Electronic",genre_set()
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Merge one-hot encoded features and drop the originals
songs_df = songs_df.merge(encode_df,left_index=True, right_index=True)
songs_df = songs_df.drop(application_cat,1)
songs_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,...,"genre_rock, blues","genre_rock, blues, latin","genre_rock, classical","genre_rock, easy listening","genre_rock, metal","genre_rock, pop","genre_rock, pop, Dance/Electronic","genre_rock, pop, metal","genre_rock, pop, metal, Dance/Electronic",genre_set()
0,183750,0,2014,0,0.789,0.915,0,-3.263,1,0.248,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,279506,0,2013,1,0.797,0.608,6,-6.096,0,0.0584,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,297786,1,2002,1,0.548,0.847,1,-3.237,1,0.186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,243880,0,2006,0,0.711,0.761,8,-3.04,1,0.225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,208186,0,2017,1,0.613,0.764,2,-6.509,1,0.136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
songs_df.dtypes

duration_ms                                    int64
explicit                                       int64
year                                           int64
popularity                                  category
danceability                                 float64
                                              ...   
genre_rock, pop                              float64
genre_rock, pop, Dance/Electronic            float64
genre_rock, pop, metal                       float64
genre_rock, pop, metal, Dance/Electronic     float64
genre_set()                                  float64
Length: 2760, dtype: object

In [51]:
# Converting Binary column to numerical 
songs_df["popularity"] = songs_df["popularity"].replace({1: 1, 0: 0})

In [52]:
# Split our preprocessed data into our features and target arrays
y = songs_df["popularity"].values
X = songs_df.drop(["popularity"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Modeling and Training¶


### 1. Nearul Network   

In [62]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#Added 3rd Layer and Additional Neurons 
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  100
hidden_nodes_layer2 = 50
hidden_nodes_layer3 = 30



nn = tf.keras.models.Sequential()

# First hidden layer  
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# # Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 100)               276000    
                                                                 
 dense_17 (Dense)            (None, 50)                5050      
                                                                 
 dense_18 (Dense)            (None, 30)                1530      
                                                                 
 dense_19 (Dense)            (None, 1)                 31        
                                                                 
Total params: 282,611
Trainable params: 282,611
Non-trainable params: 0
_________________________________________________________________


In [63]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [65]:
fit_model=nn.fit(X_train,y_train , epochs=200)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [66]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

17/17 - 1s - loss: 0.6867 - accuracy: 0.5653 - 1s/epoch - 70ms/step
Loss: 0.6866816878318787, Accuracy: 0.5653021335601807


### 2. RandomForestClassifier


In [67]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest model accuracy: 0.624


In [68]:
# Logistic Regression Instance 
log_model = LogisticRegression()
# Fit the model 
log_model.fit(X_train_scaled, y_train)

y_pred = log_model.predict(X_test)

print(f" Logistic Regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic Regressiob model accuracy: 0.505


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
