# 1. Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import scipy
from scipy.stats import linregress 
import IPython

# 2. Explore Data

In [None]:
# Shared link recorded by the authors (presumably where the CSV was obtained);
# the data itself is read from a local file below.
# url = 'https://ucsb.box.com/s/fo0sf1ocpzbbbhlfzxmmdyuyickan17x'

# Load the data set from a CSV file located in the working directory.
full_90s = pd.read_csv('90sOneHitWonders.csv')

# A bare expression in the middle of a cell is not displayed, so print the
# (rows, columns) shape explicitly.
print(full_90s.shape)

# First rows, column dtypes and summary statistics.
print(full_90s.head())
print(full_90s.dtypes)
# describe is a method: it has to be called, otherwise only the bound-method
# repr is printed and no statistics are computed.
print(full_90s.describe())


# 3. Clean Data
Check for repeated artists before removing duplicate songs (why would a one-hit wonder be in this data set twice?)

In [None]:
# True for every row whose artist already appeared in an earlier row
# (the first occurrence of each artist is marked False).
is_repeat_artist = full_90s['Artist'].duplicated()

# Rows holding the first occurrence of each artist.
real_wonders = full_90s[~is_repeat_artist]

# Rows whose artist is a repeat. These are assigned separately so that
# real_wonders is not overwritten with the duplicated rows.
duplicate_artists = full_90s[is_repeat_artist]

# count is a method: call it to get the number of non-null artist entries.
print(real_wonders['Artist'].count())
print(duplicate_artists)

# 4. Filter Data

In [None]:
# Keep only the first row for each value of 'Track'; later repeats are dropped.
full_90s = full_90s[~full_90s.duplicated(subset=['Track'], keep='first')]

## Year vs many factors: Exploring and Visualizing
### Danceability: A measure of how suitable the song is for dancing (0 to 1).
### Energy: The intensity and activity level of the song (0 to 1).
### Speechiness: The presence of spoken words in the song (0 to 1).
### Acousticness: The likelihood that the song is acoustic (0 to 1).
### Liveness: The presence of an audience or live recording (0 to 1).
### Valence: The musical positiveness or mood of the song (0 to 1).


# 5. Sort Data

In [None]:
# Rows ordered by ascending 'Duration'. sort_values returns a new frame,
# so full_90s itself keeps its original order.
duration_df = full_90s.sort_values(by='Duration')
duration_df.head()

# 6. Transform Data

# 7-8. Group and Aggregate Data

In [None]:
# Mean of each audio attribute per release year.
# The result is indexed by 'Year', with one column per attribute in the
# order listed here.
attribute_columns = [
    'Liveness',
    'Danceability',
    'Energy',
    'Speechiness',
    'Acousticness',
    'Valence',
]

yearly_averages = full_90s.groupby('Year')[attribute_columns].mean()

yearly_averages.head()

# 9. Visualize Data

In [None]:
# Two side-by-side panels: correlation heatmap on the left (ax[0]) and
# the yearly average lines on the right (ax[1]).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
heatmap_ax, trend_ax = ax[0], ax[1]

# One line per attribute; the mapping order fixes the drawing and legend order.
attribute_colors = {
    'Liveness': "#B3001B",
    'Danceability': "#00F5D4",
    'Energy': "#F15BB5",
    'Speechiness': "#9B5DE5",
    'Acousticness': "#37123C",
    'Valence': "#FEE440",
}
for attribute, line_color in attribute_colors.items():
    # 'Year' is the index of yearly_averages; seaborn resolves it by name.
    fig1 = sns.lineplot(
        data=yearly_averages,
        x='Year',
        y=attribute,
        label=attribute,
        ax=trend_ax,
        color=line_color,
    )

# Every call above returns the same Axes, so this labels the shared y-axis.
fig1.set(ylabel="Proportion")

# Pairwise correlation between the yearly averages of the numeric columns.
corr_matrix = yearly_averages.select_dtypes(include=np.number).corr()
fig2 = sns.heatmap(data=corr_matrix, ax=heatmap_ax, cmap="BuPu")

fig.suptitle("Exploring Song Attribute Averages with 1990's 'One Hit Wonders' Over the Decade")
fig.tight_layout()


## Valence vs Danceability

In [None]:
# Scatter plot of Danceability against Valence with a fitted regression line
# (no confidence band, since ci=None).
fig = plt.figure(figsize = (8,8))
fig3 = sns.regplot(data = full_90s, x = 'Valence', y = 'Danceability', ci = None, scatter_kws = {"color" :"#D741A7"}, line_kws = {"color" : "#5398BE"})

# Axis labels and title (the stray ")" in the y-label has been removed).
fig3.set(ylabel = "Proportion of Danceability")
fig3.set(xlabel = "Proportion of Valence")
fig3.set_title("Relationship of Danceability and Valence for 90s Songs")

# NOTE(review): disabled regression-statistics snippet kept for reference;
# `dance_plot` is not defined in the visible cells.
# slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x = dance_plot.get_lines()[0].get_ydata(),
#                                                                      y = dance_plot.get_lines()[0].get_xdata())

# print(slope, intercept, std_err)



## Song Popularity vs Year

In [None]:
# We opted out of this graph because every song in the data set is popular!

## Liveness vs Danceability

In [None]:
# Scatter plot of Danceability against Liveness with a fitted regression line
# (no confidence band, since ci=None).
fig = plt.figure(figsize=(8, 8))

point_style = {"color": "#D741A7"}
fit_line_style = {"color": "#5398BE"}
fig4 = sns.regplot(
    data=full_90s,
    x='Liveness',
    y='Danceability',
    ci=None,
    scatter_kws=point_style,
    line_kws=fit_line_style,
)

# Axis labels and title.
fig4.set(
    ylabel="Proportion of Danceability",
    xlabel="Proportion of Liveness",
)
fig4.set_title("Relationship of Danceability and Liveness for 90s Songs")

# It's interesting that danceability decreases as songs become "more live." This could be because songs performed live tend to have a more instrumental, acoustic sound.
# To investigate this hypothesis, we decided to plot liveness against acousticness.

## Liveness vs Acousticness

In [None]:
# Two views of the same pair of columns: a scatter plot on the left (ax1)
# and a hexbin density plot on the right (ax2).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16,8))

# Hexbin: Acousticness on the x-axis, Liveness on the y-axis.
fig5 = ax2.hexbin(full_90s['Acousticness'], full_90s['Liveness'], gridsize=(15,15), cmap=plt.cm.Greys_r)

# Scatter uses the same orientation as the hexbin so that the shared
# figure-level axis labels below are correct for both panels.
fig6 = ax1.scatter(data = full_90s, x = 'Acousticness', y = 'Liveness')

fig.suptitle("Exploring Correlation between Liveness and Acousticness")
# Figure-level labels shared by both panels.
fig.supxlabel("Acousticness")
fig.supylabel("Liveness")
fig.tight_layout()

# The plots look a bit odd, but that is part of the process.
# This disproves our hypothesis: there is no strong, positive relationship between liveness and acousticness. It is probable that much of this data was not recorded in front of a live audience.