# Assignment 3 - Brent Samaha 

In [30]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import xgboost as xgb
from scipy.stats import randint

# Question 1. Read in data - split 70/30

In [6]:
#Reading in dataset
spotify_data = pd.read_csv("dataset-of-00s.csv")

#Dropping unwanted columns
spotify_data = spotify_data.drop(['track', 'artist', 'uri'], axis=1)

#Splitting dataset into 70/30 train/validation datasets
train_data, val_data = train_test_split(spotify_data, train_size = 0.7, random_state = 13)

#Identifying target variable as y & predictors as x for the train/validation datasets
y_train = train_data.target
x_train = train_data.iloc[:, :-1]

y_val = val_data.target
x_val = val_data.iloc[:, :-1]

# Question 2. Train a decision tree & report AUCs.

In [35]:
params = {"max_depth":range(2, 8), "min_samples_leaf": range(5, 55, 5), "min_samples_split": range(10, 110, 5)}

In [37]:
dtree = GridSearchCV(DecisionTreeClassifier(), params, n_jobs=4, scoring = "roc_auc")

In [38]:
dtree.fit(x_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=4,
             param_grid={'max_depth': range(2, 8),
                         'min_samples_leaf': range(5, 55, 5),
                         'min_samples_split': range(10, 110, 5)},
             scoring='roc_auc')

In [40]:
y_train_prob = dtree.predict_proba(x_train)

y_val_prob = dtree.predict_proba(x_val)

In [47]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
train_auc = metrics.auc(fpr, tpr)
print("The training dataset auc is " , train_auc)

The training dataset auc is  0.9081380050466781


In [48]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
val_auc = metrics.auc(fpr, tpr)
print("The validation AUC is " , val_auc)

The validation AUC is  0.8828783732244829


# Question 3. Train a random forest & report AUCs

In [51]:
params = {"max_depth":range(2, 8), "min_samples_leaf": range(5, 55, 5), "min_samples_split": range(10, 110, 5),
          "max_samples":[0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4], "max_features": [2, 3, 4, 5, 6],
          "n_estimators": [100, 150, 200, 250, 300, 350, 400]}

In [52]:
rforest = RandomizedSearchCV(RandomForestClassifier(), params, n_jobs=4, scoring = "roc_auc", n_iter = 300,
                             random_state = 13)

In [53]:
rforest.fit(x_train, y_train)

RandomizedSearchCV(estimator=RandomForestClassifier(), n_iter=300, n_jobs=4,
                   param_distributions={'max_depth': range(2, 8),
                                        'max_features': [2, 3, 4, 5, 6],
                                        'max_samples': [0.1, 0.15, 0.2, 0.25,
                                                        0.3, 0.35, 0.4],
                                        'min_samples_leaf': range(5, 55, 5),
                                        'min_samples_split': range(10, 110, 5),
                                        'n_estimators': [100, 150, 200, 250,
                                                         300, 350, 400]},
                   random_state=13, scoring='roc_auc')

In [54]:
y_train_prob = rforest.predict_proba(x_train)

y_val_prob = rforest.predict_proba(x_val)

In [55]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
train_auc = metrics.auc(fpr, tpr)
print("The training dataset auc is " , train_auc)

The training dataset auc is  0.9313755275900217


In [56]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
val_auc = metrics.auc(fpr, tpr)
print("The validation AUC is " , val_auc)

The validation AUC is  0.9122401148287876


# Question 4. Train a gradient boosting model & report AUCs

In [160]:
params = {"max_depth":range(2, 6), "min_samples_leaf": range(5, 55, 5), "min_samples_split": range(10, 110, 5),
          "subsample":[0.6, 0.7, 0.8], "max_features": [2, 3, 4, 5, 6],
          "n_estimators": [50, 100, 150, 200], "learning_rate": [0.1, 0.2, 0.3]}

In [161]:
gradboost = RandomizedSearchCV(GradientBoostingClassifier(), params, n_jobs=4, scoring = "roc_auc", n_iter = 300,
                               random_state = 13)

In [162]:
gradboost.fit(x_train, y_train)

RandomizedSearchCV(estimator=GradientBoostingClassifier(), n_iter=300, n_jobs=4,
                   param_distributions={'learning_rate': [0.1, 0.2, 0.3],
                                        'max_depth': range(2, 6),
                                        'max_features': [2, 3, 4, 5, 6],
                                        'min_samples_leaf': range(5, 55, 5),
                                        'min_samples_split': range(10, 110, 5),
                                        'n_estimators': [50, 100, 150, 200],
                                        'subsample': [0.6, 0.7, 0.8]},
                   random_state=13, scoring='roc_auc')

In [163]:
y_train_prob = gradboost.predict_proba(x_train)

y_val_prob = gradboost.predict_proba(x_val)

In [164]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
train_auc = metrics.auc(fpr, tpr)
print("The training dataset auc is " , train_auc)

The training dataset auc is  0.9860092740285524


In [165]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
val_auc = metrics.auc(fpr, tpr)
print("The validation AUC is " , val_auc)

The validation AUC is  0.9306962139485023


# Question 5. Train a model with XGBoost & report AUCs

In [166]:
params = {'max_depth': range(2, 6), 
          'n_estimators': [50, 100, 150, 200, 250, 300],
          'subsample': [0.6, 0.7, 0.8],
          'colsample_bytree': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 1], 
          'colsample_bynode': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 1],
          'gamma': [0, 5, 10, 15, 20], 
          'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
          'lambda': [0.1, 0.25, 0.5, 0.75, 1]}

In [167]:
xgbmodel = RandomizedSearchCV(xgb.XGBClassifier(use_label_encoder = False, eval_metric = "logloss"), 
                              params, n_jobs=4, scoring = "roc_auc", n_iter = 300, random_state = 13)

In [168]:
xgbmodel.fit(x_train, y_train)

RandomizedSearchCV(estimator=XGBClassifier(eval_metric='logloss',
                                           use_label_encoder=False),
                   n_iter=300, n_jobs=4,
                   param_distributions={'colsample_bynode': [0.1, 0.15, 0.2,
                                                             0.25, 0.3, 0.35,
                                                             0.4, 0.45, 0.5,
                                                             0.6, 0.7, 0.8, 1],
                                        'colsample_bytree': [0.1, 0.15, 0.2,
                                                             0.25, 0.3, 0.35,
                                                             0.4, 0.45, 0.5,
                                                             0.6, 0.7, 0.8, 1],
                                        'gamma': [0, 5, 10, 15, 20],
                                        'lambda': [0.1, 0.25, 0.5, 0.75, 1],
                                        'learning_rate

In [169]:
xgbmodel.best_params_

{'subsample': 0.6,
 'n_estimators': 250,
 'max_depth': 4,
 'learning_rate': 0.05,
 'lambda': 1,
 'gamma': 0,
 'colsample_bytree': 0.7,
 'colsample_bynode': 0.4}

In [170]:
y_train_prob = xgbmodel.predict_proba(x_train)

y_val_prob = xgbmodel.predict_proba(x_val)

In [171]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)
train_auc = metrics.auc(fpr, tpr)
print("The training dataset auc is " , train_auc)

The training dataset auc is  0.9682290403734238


In [172]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)
val_auc = metrics.auc(fpr, tpr)
print("The validation AUC is " , val_auc)

The validation AUC is  0.9308018688024408


## From the four models ran on the dataset, the best performing model was the XGBoost model. The XGBoost model had a training AUC of .968 and a validation AUC of .9308. The second best model, the Gradient Boosting Model had a training AUC of .986, but a lower validation AUC of .9306. The XGBoost & Gradient Boosting validation AUCs were close, but the XGBoost performed slightly better when making predictions off of new data.

# Question 6. Find optimal probability based off of F-1 Score. 

In [130]:
import numpy as np

In [173]:
#Using XGBoost model to find the optimal probability as it was the best performing model.

prob = np.arange(.01,1,.01)
prec = []; recall = []; acc = []; f1 = []

In [174]:
for threshold in prob:
    
    #using for loop to go through all values in array of .01-.99
    #if the predicted probability is greater than the current iterable of the array, pred = 1, if less than, pred = 0
    prediction = [1 if prob >= threshold else 0 for prob in y_train_prob[:,1]]
    
    #calculating metrics at each iterable in probability array; actual vs. predicted above
    precision = metrics.precision_score(y_train, prediction)
    rec = metrics.recall_score(y_train, prediction)
    accuracy = metrics.accuracy_score(y_train, prediction)
    f1score = metrics.f1_score(y_train, prediction)
    
    #appending metrics to their respective list
    prec.append(precision)
    recall.append(rec)
    acc.append(accuracy)
    f1.append(f1score)
    
#creating a dictionary to transform into a DF
dic={'threshold':prob, 'precision':prec, 'recall':rec, 'accuracy':acc, 'F1_Score':f1}

In [175]:
probability_metrics = pd.DataFrame(dic)
probability_metrics

Unnamed: 0,threshold,precision,recall,accuracy,F1_Score
0,0.01,0.564810,0.000488,0.616058,0.721889
1,0.02,0.601115,0.000488,0.669343,0.750871
2,0.03,0.625535,0.000488,0.701703,0.769635
3,0.04,0.645131,0.000488,0.725791,0.784141
4,0.05,0.666124,0.000488,0.750122,0.799453
...,...,...,...,...,...
94,0.95,0.994135,0.000488,0.583698,0.283801
95,0.96,0.995575,0.000488,0.556204,0.197889
96,0.97,1.000000,0.000488,0.527251,0.097538
97,0.98,1.000000,0.000488,0.509246,0.029822


In [176]:
result.iloc[result.f1_score.idxmax()]

threshold    0.590000
precision    0.928260
recall       0.935059
accuracy     0.931630
f1_score     0.931647
Name: 58, dtype: float64

In [179]:
#Optimal threshold is .59

pred = [1 if prob >= 0.59 else 0 for prob in y_train_prob[:,1]]

#Finding metrics for training data using optimal threshold of .59
tprec = metrics.precision_score(y_train, pred)
trecall = metrics.recall_score(y_train, pred)
taccuracy = metrics.accuracy_score(y_train, pred)
tf1score = metrics.f1_score(y_train, pred)

print("Training precision: ", tprec)
print("Training recall: ", trecall)
print("Training accuracy: ", taccuracy)
print("Training F1-Score: ", tf1score)

Training precision:  0.8994683421942967
Training recall:  0.90869140625
Training accuracy:  0.9038929440389294
Training F1-Score:  0.904056351712412


In [181]:
pred = [1 if prob >= 0.59 else 0 for prob in y_val_prob[:,1]]

#Finding metrics for validation data using optimal threshold of .59
vprec = metrics.precision_score(y_val, pred)
vrecall = metrics.recall_score(y_val, pred)
vaccuracy = metrics.accuracy_score(y_val, pred)
vf1score = metrics.f1_score(y_val, pred)

print("Validation precision: ", vprec)
print("Validation recall: ", vrecall)
print("Validation accuracy: ", vaccuracy)
print("Validation F1-Score: ", vf1score)

Validation precision:  0.8444444444444444
Validation recall:  0.8558558558558559
Validation accuracy:  0.8479001135073779
Validation F1-Score:  0.8501118568232663


# Question 7. Calculating Gini Impurity

### a. The dataset has 75% positive labels and 25% negative labels.

In [182]:
GI = 1 - (.75)**2 - (.25)**2
GI

0.375

### b. The dataset has 90% positive labels and 10% negative labels 

In [192]:
GI = 1 - (.90)**2 - (.10)**2
GI

0.17999999999999994