In [None]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [None]:
unpopular = pd.read_csv("Resources/unpopulardata.csv")
popular = pd.read_csv("Resources/2019data_cleaned.csv")

In [None]:
unpopular['popularity'] = 0
popular['popularity'] = 1
unpopular.head()

In [None]:
df = pd.concat([unpopular, popular], ignore_index=True,sort=False)

In [None]:
df = df[['popularity','acousticness','danceability','duration_ms','energy','instrumentalness','key','liveness','loudness','mode','speechiness','tempo','time_signature','valence']]
df = df.dropna(how = 'any')
df.tail()

In [None]:
X = df[['acousticness','danceability','duration_ms','energy','instrumentalness','key','liveness','loudness','mode','speechiness','tempo','time_signature','valence']]
y = df["popularity"].values#.reshape(-1, 1)
print("Shape: ", X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler 
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train) 
X_test_scaled = X_scaler.transform(X_test)

### Decision Tree Model

In [None]:
from sklearn import tree 
clf = tree.DecisionTreeClassifier()

In [None]:
clf = clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)

In [None]:
predictions = clf.predict(X_test_scaled)

In [None]:
test_result_df = pd.DataFrame({"Predicted": y_test, "Actual": predictions})[["Predicted", "Actual"]]
test_result_df.head()

In [None]:
feature_names = X.columns
sorted(zip(clf.feature_importances_, feature_names), reverse=True)


In [2]:
import graphviz
dot_data = tree.export_graphviz( clf, out_file=None, feature_names=feature_names, class_names=["popular","unpopular"], filled=True, rounded=True, special_characters=True)

import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)

graph = graphviz.Source(dot_data) 
graph


NameError: name 'tree' is not defined

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 
rf = RandomForestClassifier(n_estimators=200) 
rf = rf.fit(X_train_scaled, y_train) 
rf.score(X_test_scaled, y_test)
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)


In [None]:
rf.score(X_test_scaled, y_test)

### Neural Network

In [None]:
from numpy.random import seed 
seed(42)

In [None]:
from keras.utils import to_categorical
y_train_cate = to_categorical(y_train)
y_test_cate = to_categorical(y_test)

In [None]:
from tensorflow.keras.models import Sequential 
model_neural = Sequential()


In [None]:
from tensorflow.keras.layers import Dense 
number_inputs = 13 
number_hidden_nodes = 100 
model_neural.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

for i in range(11):
    model_neural.add(Dense(units=number_hidden_nodes, activation='relu')) 

number_classes = 2 
model_neural.add(Dense(units=number_classes, activation='softmax'))
model_neural.summary()


In [None]:
model_neural.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
model_neural.fit( X_train_scaled, y_train_cate, epochs=20, shuffle=True, verbose=2 )
model_loss, model_accuracy = model_neural.evaluate( X_test_scaled, y_test_cate, verbose=2) 
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
train_scores = [] 
test_scores = []
for k in range(1, 20, 2):
      knn = KNeighborsClassifier(n_neighbors=k)
      knn.fit(X_train_scaled, y_train)  
      train_score = knn.score(X_train_scaled, y_train)
      test_score = knn.score(X_test_scaled, y_test) 
      train_scores.append(train_score) 
      test_scores.append(test_score)
      print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")


In [None]:
plt.plot(range(1, 20, 2), train_scores, marker='o') 
plt.plot(range(1, 20, 2), test_scores, marker="x") 
plt.xlabel("k neighbors") 
plt.ylabel("Testing accuracy Score")
plt.show()


In [None]:
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train_scaled, y_train) 
print('k=13 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))


In [None]:
plt.savefig("K_Neighbors_model_popularity.png")

### Gaussian Naive Bayes

In [None]:
#from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)
gnb.score(X_test_scaled, y_test)

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
model_svc = SVC(kernel='linear') 
model_svc.fit(X_train_scaled, y_train)

from sklearn.metrics import classification_report
predictions = model_svc.predict(X_test_scaled)
print(classification_report(y_test, predictions, target_names=["popular", "unpopular"]))


In [None]:
svm_predictions = model_svc.predict(X_test_scaled) 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_test, svm_predictions) 
cm

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression 
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")


In [None]:
from sklearn.linear_model import LogisticRegressionCV 
classifier = LogisticRegressionCV(multi_class="multinomial")
classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
from sklearn.linear_model import RidgeClassifierCV
classifier = RidgeClassifierCV()
classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")