# Import Data

In [9]:
from sqlalchemy import create_engine
import pandas as pd
import psycopg2
db_password = 'Snakefarm'

#Initialize DB string
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/spotify_capstone"

#Create database engine
engine = create_engine(db_string)

# Connection parameters, yours will be different
param_dic = {
    "host"      : "localhost",
    "database"  : "spotify_capstone",
    "user"      : "postgres",
    "password"  : "snakefarm"
}
def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

def postgresql_to_dataframe(conn, select_query, column_names):
    """
    Tranform a SELECT query into a pandas dataframe
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    # Naturally we get a list of tupples
    tupples = cursor.fetchall()
    cursor.close()
    
    # We just need to turn it into a pandas dataframe
    df = pd.DataFrame(tupples, columns=column_names)
    return df


# Connect to the database
conn = connect(param_dic)
column_names = ['track_name', 'artist_name', 'song_and_artist', 'track_id', 'year', 'valence', 'acoustic', 'danceability', 'duration_ms', 'energy', 'explicit', 'instrument', 'key_value', 'liveness', 'loudness', 'mode_value', 'popularity', 'speechiness', 'tempo']
# Execute the "SELECT *" query
spotify_df = postgresql_to_dataframe(conn, "select * from  spotify_values", column_names)


#Read in new Rolling Stone DF
conn = connect(param_dic)
column_names_bb = ['track_name', 'album', 'artist_name', 'track_id', 'year', 'duration_ms',
       'popularity', 'danceability', 'acoustic', 'energy', 'instrument',
       'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature',
       'explicit', 'valence', 'key_value', 'mode_value', 'billboard_year',
       'index']
# Execute the "SELECT *" query for BB data

billboard_master_df =postgresql_to_dataframe(conn, "select * from rolling_stone_master", column_names_bb)

#Create list of track_ids from billboard
billboard_master_df_id_list = billboard_master_df['track_id'].tolist()

# Filter out billboard songs in spotify_df
inverse_boolean_series = ~spotify_df.track_id.isin(billboard_master_df_id_list)
spotify_filtered_df = spotify_df[inverse_boolean_series]
spotify_filtered_df.head()

#Join billboard and filtered Spotify DF
joined_df = pd.concat([billboard_master_df,spotify_filtered_df], axis=0, ignore_index=True)

#Add billboard top 100 column to joined df
joined_df['top_100'] = 0

for i, track_id in joined_df.track_id.iteritems():
    if track_id in billboard_master_df.track_id.values:
        joined_df['top_100'][i]= 1
        
joined_df['top_100'].value_counts()

joined_df


Connecting to the PostgreSQL database...
Connection successful
Connecting to the PostgreSQL database...
Connection successful


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined_df['top_100'][i]= 1


Unnamed: 0,track_name,album,artist_name,track_id,year,duration_ms,popularity,danceability,acoustic,energy,...,tempo,time_signature,explicit,valence,key_value,mode_value,billboard_year,index,song_and_artist,top_100
0,Shop Around,The Ultimate Collection: Smokey Robinson & The...,Smokey Robinson & The Miracles,6dHAIIaTDAY8Wmh6sQ1lX9,1998,169333,13,0.549,0.692,0.703,...,132.082,4,False,0.908,7,True,2010.0,0.0,,1
1,Buddy Holly,Weezer,Weezer,0gOyllwzM7IvfuYZ903zNv,1994,159226,0,0.556,0.00271,0.92,...,121.138,4,False,0.771,8,True,2010.0,1.0,,1
2,Miss You - 2009 Re-Mastered Digital Version,Some Girls (2009 Re-Mastered),The Rolling Stones,70i1qWaLQeAhlHKOmf3ajr,1978,288666,0,0.794,0.432,0.715,...,109.682,4,False,0.826,9,False,2010.0,2.0,,1
3,The Rising,The Rising,Bruce Springsteen,2czBvzOv3TvnyoW7Ozo7fP,2002,287040,57,0.499,0.0655,0.761,...,110.186,4,False,0.326,10,True,2010.0,3.0,,1
4,Running on Empty,Running on Empty,Jackson Browne,4MZEZz8MqVgvIMXU6AVP22,1977,298093,2,0.531,0.006,0.948,...,136.891,4,False,0.498,9,True,2010.0,4.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170944,"""Der Rosenkavalier* Op.59 / Act 3: """"Zur Stell...",,['Richard Strauss'* 'Alfred Poell'* 'Ludwig We...,0yRjRgvO8kR6E9fehn07tE,1954,308600,0,0.424,0.976,0.448,...,82.35,,False,0.578,5,True,,,"""Der Rosenkavalier* Op.59 / Act 3: """"Zur Stell...",0
170945,Jacôk - Mountain Dancde,,['Krosno Ensemble'],0yVOxC0rsuYapJh7NkMgkX,1954,80827,0,0.462,0.985,0.0949,...,72.953,,False,0.96,2,True,,,Jacôk - Mountain Dancde ['Krosno Ensemble'],0
170946,Easter Hymn,,['Girolamo Cavazzoni'* 'Flor Peeters'],0yZj9jxtCYdzkDBX6LGmrL,1954,138427,0,0.138,0.42,0.0161,...,70.063,,False,0.439,0,False,,,Easter Hymn ['Girolamo Cavazzoni'* 'Flor Peete...,0
170947,Jodi Bolo,,['Arijit Singh'],5wS1sJr2rzh9AKYFpkqqnA,2020,272562,0,0.42,0.696,0.682,...,112.009,,False,0.394,11,False,,,Jodi Bolo ['Arijit Singh'],0


# Function Creation


### Random Forest Function for different decades

In [2]:
#Random Forest Function for decades
import pandas as pd
accuracy_results_df_decade = pd.DataFrame()
def random_forest_func_pprint(joined_df, year):
    #import
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from collections import Counter
    from imblearn.combine import SMOTEENN
    from sklearn.metrics import classification_report
    import pandas as pd
    
    #Create dataframes based on year
    billboard_filtered_df_func = joined_df[(joined_df['billboard_year'] <= (year + 9)) & (joined_df['billboard_year']>= year)]
    spotify_year_filter_df_func = joined_df[(joined_df['year'] <= (year + 9)) & (joined_df['year'] >= (year -3)) & (joined_df['top_100'] == 0)]

    year_joined_df_func = pd.concat([billboard_filtered_df_func,spotify_year_filter_df_func], axis=0, ignore_index=True)
    
    #Create X and Y
    X_year_joined = year_joined_df_func[['valence',
       'acoustic', 'danceability', 'duration_ms', 'energy',
       'instrument', 'key_value', 'liveness', 'loudness',
       'speechiness', 'tempo']]

    y = year_joined_df_func['top_100']

    #SCALE DATA
    data_scaler = StandardScaler()
    X_scaled = data_scaler.fit_transform(X_year_joined)
    current_data_df= pd.DataFrame(X_scaled, columns = X_year_joined.columns)

    #SPLIT INTO TRAINING AND TESTING
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1)
    smote_enn = SMOTEENN(random_state=0)

    #APPLY SMOTEEN SAMPLING
    X_SMOTEEN, y_SMOTEEN = smote_enn.fit_resample(X_scaled, y)
    #Train the Random Forest model
    # Create a random forest classifier.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
    from imblearn.metrics import classification_report_imbalanced

    rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

    # Fitting the model
    rf_model = rf_model.fit(X_SMOTEEN, y_SMOTEEN)

    #Predict
    y_pred = rf_model.predict(X_test)
    balanced_accuracy_score(y_test, y_pred)

    # Calculating the confusion matrix.
    cm = confusion_matrix(y_test, y_pred)

    # Create a DataFrame from the confusion matrix.
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

    # Calculating the accuracy score.
    acc_score = balanced_accuracy_score(y_test, y_pred)

    # Displaying results of SMOTEEN Random Forest
    # return (f"Confusion Matrix {display(cm_df)} Accuracy Score : {acc_score} Classification Report {classification_report_imbalanced(y_test, y_pred)}" )
    print ('\033[1m' + 'Results for ' + str(year) + 's')
    #print(f"Confusion Matrix {year}")
    display (cm_df)
    print(f"Accuracy Score : {acc_score}")
    print("Classification Report")
    print(classification_report_imbalanced(y_test, y_pred))
#     print(*X_year_joined.columns, sep =', ')
#     print()
#     print(f"Data was originally {Counter(y)} and was SMOTEEN sampled to {Counter(y_SMOTEEN)}.")
#     print()
    #sort features by their importance.
   # display(sorted(zip(rf_model.feature_importances_, X_year_joined.columns), reverse=True))

    # Put results into dataframe
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    results_df = pd.DataFrame(report_dict).transpose()
    results_df['Decade'] = year
    
    global accuracy_results_df_decade
    accuracy_results_df_decade = accuracy_results_df_decade.append(results_df)

### Random Forest Function all years

In [3]:
#Random Forest Function
import pandas as pd
accuracy_results_df_all_years = pd.DataFrame()
def random_forest_func_all_years(data_to_scale):
    #import
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from collections import Counter
    from imblearn.combine import SMOTEENN
    from sklearn.metrics import classification_report
    import pandas as pd

    #SCALE DATA
    data_scaler = StandardScaler()
    X_scaled = data_scaler.fit_transform(data_to_scale)
    current_data_df= pd.DataFrame(X_scaled, columns = data_to_scale.columns)

    #SPLIT INTO TRAINING AND TESTING
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1)
    smote_enn = SMOTEENN(random_state=0)

    #APPLY SMOTEEN SAMPLING
    X_SMOTEEN, y_SMOTEEN = smote_enn.fit_resample(X_scaled, y)
    #Train the Random Forest model
    # Create a random forest classifier.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
    from imblearn.metrics import classification_report_imbalanced

    rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

    # Fitting the model
    rf_model = rf_model.fit(X_SMOTEEN, y_SMOTEEN)

    #Predict
    y_pred = rf_model.predict(X_test)
    balanced_accuracy_score(y_test, y_pred)

    # Calculating the confusion matrix.
    cm = confusion_matrix(y_test, y_pred)

    # Create a DataFrame from the confusion matrix.
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

    # Calculating the accuracy score.
    acc_score = balanced_accuracy_score(y_test, y_pred)

    # Displaying results of SMOTEEN Random Forest
    # return (f"Confusion Matrix {display(cm_df)} Accuracy Score : {acc_score} Classification Report {classification_report_imbalanced(y_test, y_pred)}" )
    print("Confusion Matrix")
    display (cm_df)
    print(f"Accuracy Score : {acc_score}")
    print("Classification Report")
    print(classification_report_imbalanced(y_test, y_pred))
    print(*data_to_scale.columns, sep =', ')
    print()
    print(f"Data was originally {Counter(y)} and was SMOTEEN sampled to {Counter(y_SMOTEEN)}.")
    print()
    #sort features by their importance.
    display(sorted(zip(rf_model.feature_importances_, data_to_scale.columns), reverse=True))
    
    ## Put results into dataframe
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    results_df = pd.DataFrame(report_dict).transpose()
    results_df['Year'] = '1950-2020'
    
    global accuracy_results_df_all_years
    
    accuracy_results_df_all_years = accuracy_results_df_all_years.append(results_df)

In [4]:
#Random Forest Function for single year
import pandas as pd
accuracy_results_df_single_year = pd.DataFrame()
def random_forest_func_single_year(joined_df, year):
    #import
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from collections import Counter
    from imblearn.combine import SMOTEENN
    from sklearn.metrics import classification_report
    import pandas as pd
    
    #Create dataframes based on year
    billboard_filtered_df_func = joined_df[(joined_df['billboard_year'] == year)]
    spotify_year_filter_df_func = joined_df[(joined_df['year'] <= year) & (joined_df['year'] >= (year -3)) & (joined_df['top_100'] == 0)]

    year_joined_df_func = pd.concat([billboard_filtered_df_func,spotify_year_filter_df_func], axis=0, ignore_index=True)
    
    #Create X and Y
    X_year_joined = year_joined_df_func[['valence',
       'acoustic', 'danceability', 'duration_ms', 'energy',
       'instrument', 'key_value', 'liveness', 'loudness',
       'speechiness', 'tempo']]

    y = year_joined_df_func['top_100']

    #SCALE DATA
    data_scaler = StandardScaler()
    X_scaled = data_scaler.fit_transform(X_year_joined)
    current_data_df= pd.DataFrame(X_scaled, columns = X_year_joined.columns)

    #SPLIT INTO TRAINING AND TESTING
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1)
    smote_enn = SMOTEENN(random_state=0)

    #APPLY SMOTEEN SAMPLING
    X_SMOTEEN, y_SMOTEEN = smote_enn.fit_resample(X_scaled, y)
    #Train the Random Forest model
    # Create a random forest classifier.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
    from imblearn.metrics import classification_report_imbalanced

    rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

    # Fitting the model
    rf_model = rf_model.fit(X_SMOTEEN, y_SMOTEEN)

    #Predict
    y_pred = rf_model.predict(X_test)
    balanced_accuracy_score(y_test, y_pred)

    # Calculating the confusion matrix.
    cm = confusion_matrix(y_test, y_pred)

    # Create a DataFrame from the confusion matrix.
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

    # Calculating the accuracy score.
    acc_score = balanced_accuracy_score(y_test, y_pred)

    # Displaying results of SMOTEEN Random Forest
    # return (f"Confusion Matrix {display(cm_df)} Accuracy Score : {acc_score} Classification Report {classification_report_imbalanced(y_test, y_pred)}" )
    print ('\033[1m' + 'Results for ' + str(year))
    #print(f"Confusion Matrix {year}")
    display (cm_df)
    print(f"Accuracy Score : {acc_score}")
    print("Classification Report")
    print(classification_report_imbalanced(y_test, y_pred))
    
#     print(*X_year_joined.columns, sep =', ')
#     print()
#     print(f"Data was originally {Counter(y)} and was SMOTEEN sampled to {Counter(y_SMOTEEN)}.")
#     print()
    #sort features by their importance.
   # display(sorted(zip(rf_model.feature_importances_, X_year_joined.columns), reverse=True))
    
    # Put results into dataframe
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    results_df = pd.DataFrame(report_dict).transpose()
    results_df['Year'] = year
    
    global accuracy_results_df_single_year
    accuracy_results_df_single_year = accuracy_results_df_single_year.append(results_df)

# ML Results Model

In [None]:
# Run function for different decades
import pandas as pd
accuracy_results_df_decade = pd.DataFrame()
i = 1950
while i <= 2020:
    random_forest_func_pprint(joined_df, i)
    i +=10

In [6]:
# Run function for all years
import pandas as pd
accuracy_results_df_all_years = pd.DataFrame()

#Remove Popularity, explicit, mode_value
X_minus_pop_mode_explicit = joined_df[['valence',
       'acoustic', 'danceability', 'duration_ms', 'energy',
       'instrument', 'key_value', 'liveness', 'loudness',
       'speechiness', 'tempo']]

y = joined_df['top_100']

#Run function
random_forest_func_all_years(X_minus_pop_mode_explicit)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,42505,101
Actual 1,2,130


Accuracy Score : 0.9912389633555667
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      0.98      1.00      0.99      0.98     42606
          1       0.56      0.98      1.00      0.72      0.99      0.98       132

avg / total       1.00      1.00      0.98      1.00      0.99      0.98     42738

valence, acoustic, danceability, duration_ms, energy, instrument, key_value, liveness, loudness, speechiness, tempo

Data was originally Counter({0: 170449, 1: 500}) and was SMOTEEN sampled to Counter({1: 170064, 0: 162864}).



[(0.1460720019343654, 'acoustic'),
 (0.11725756470671475, 'loudness'),
 (0.09938977984865932, 'instrument'),
 (0.0986672419360318, 'key_value'),
 (0.09156426556373605, 'valence'),
 (0.08795900167659426, 'energy'),
 (0.07845485811549573, 'danceability'),
 (0.07447988874638357, 'speechiness'),
 (0.07154750071941034, 'duration_ms'),
 (0.0702822175435504, 'tempo'),
 (0.06432567920905839, 'liveness')]

In [None]:
# Run function for individual years
import pandas as pd
accuracy_results_df_single_year = pd.DataFrame()
i = 2003
while i <= 2020:
    random_forest_func_single_year(joined_df, i)
    i +=1

## Put results DFs into CSVs


In [32]:
#individual years
accuracy_results_df_single_year.to_csv('classification_report_single_year_rolling_stone_top_500.csv')


In [8]:
#all years
accuracy_results_df_all_years.to_csv('classification_report_all_years_rolling_stone_top_500.csv')

In [None]:
#by decade
accuracy_results_df_decade.to_csv('classification_report_decades_rolling_stone_top_500.csv')