# Lab Assignment One: Exploring Table Data 
**Christopher Cook, Bonita Davis, Anekah Kelley, Davis Lynn**

## 1. Business Understanding 

*TODO: describe the dataset, why it was collected, and what questions this analysis is meant to answer.*

In [None]:
import pandas as pd

## 2. Data Understanding

In [None]:
# Load the dataset (the same file the visualization section reads).
df = pd.read_csv('ClassicHit.csv')

# One tuple per dataset column, in the order the columns appear in the CSV:
# (description, measurement scale, discrete/continuous, observed range).
# Keeping each attribute's facts on one line means the four summary columns
# can never get out of step with one another (a dropped comma in parallel
# lists silently merges two entries and breaks the length check).
# NOTE(review): 'Genre' is labelled Ordinal here; genres have no natural
# order, so Nominal may be intended -- confirm with the authors.
attribute_notes = [
    ('Name of song', 'Nominal', 'Discrete', 'N/A'),
    ('Name of singer', 'Nominal', 'Discrete', 'N/A'),
    ('Year released', 'Ordinal', 'Discrete', '1899-2024'),
    ('Length of song in milliseconds', 'Ratio', 'Continuous', '23.0k-3.06m'),
    ('Musical time signature', 'Nominal', 'Discrete', '3 or 4'),
    ('Suitability of dancing (0-1)', 'Ratio', 'Continuous', '0.0-1.0'),
    ('Intensity of the song (0-1)', 'Ratio', 'Continuous', '0.0-1.0'),
    ('Key of the track (0-11)', 'Ordinal', 'Discrete', '0-11'),
    ('Loudness of track in dB', 'Ratio', 'Continuous', '-47.4-0.92'),
    ('Modality (major or minor)', 'Ordinal', 'Discrete', '0 or 1'),
    ('Measures the presence of spoken words (0-1)', 'Ratio', 'Continuous', '0.0-1.0'),
    ('Measure of how acoustic the track is (0-1)', 'Ratio', 'Continuous', '0.0-1.0'),
    ('Measure of likelihood that track is instrumental (0-1)', 'Ratio', 'Continuous', '0.0-1.0'),
    ('Measure of the presence of a live audience (0-1)', 'Ratio', 'Continuous', '0.0-1.0'),
    ('Measure of musical positiveness (0-1)', 'Ratio', 'Continuous', '0.0-1.0'),
    ('Speed of the track (BPM)', 'Ratio', 'Continuous', '0-220'),
    ('Track popularity (0-100)', 'Ratio', 'Continuous', '0-100'),
    ('Musical genre of the song', 'Ordinal', 'Discrete', 'N/A'),
]

data_des = pd.DataFrame(
    attribute_notes,
    columns=['Description', 'Scales', 'Discrete/Continuous', 'Range'],
)
# Raises ValueError if the CSV does not have exactly one column per row above.
data_des.insert(0, 'Attributes', df.columns)

data_des

### Attributes
The dataset contains 18 attributes. Below we describe only the ones used in our visualizations.

1)  **Year:** This is the year that the track was released. The years range from 1899 to 2024, with the most tracks being from 1974-1980, followed by 1980-1986.
2)  **Key:** Describes the key of the track. The numeric codes map to pitches as follows: *0 = C, 1 = C♯, 2 = D, 3 = E♭, 4 = E, 5 = F, 6 = F♯, 7 = G, 8 = A♭, 9 = A, 10 = B♭, 11 = B*. C and G were the most common keys.
3)  **Mode:** Shows if the key is major (1) or minor (0). The vast majority of songs were in a major key.
4)  **Popularity:** Measures the track's popularity on a scale of 0-100.  The popularity is essentially a bell curve, skewed slightly to the right.
5)  **Genre:** Describes the genre of the song. Pop, Metal, Country, R&B, and Rock are some of the most common.

## 3. Data Visualization

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# Record the library versions this notebook was run with.
print(f'Pandas: {pd.__version__}')
print(f'Numpy: {np.__version__}')

In [None]:
# Read the dataset from the working directory, then preview the first rows.
df = pd.read_csv(filepath_or_buffer='ClassicHit.csv')

df.head(n=5)

In [None]:
# Column dtypes; a bare last expression lets the notebook render the result.
df.dtypes

In [None]:
# Joint 2D histogram of two columns with per-axis histograms on the margins.
# The column names are defined once so the marginal plots and the central
# plot always describe the same pair of variables.
# NOTE(review): the grid was previously built on "Danceability" while the
# central histogram drew "Valence"; Valence is kept here so the annotated
# central plot is unchanged -- confirm which column was intended.
x_col, y_col = "Popularity", "Valence"
n_bins = 10

# Create the JointGrid object
g = sns.JointGrid(data=df, x=x_col, y=y_col, space=0)

# Marginal histograms with a KDE overlay
g.plot_marginals(sns.histplot, kde=True, bins=n_bins)

# Add the 2D histogram to the central plot
ax = g.ax_joint
sns.histplot(data=df, x=x_col, y=y_col, bins=n_bins, ax=ax)

# Annotate each non-empty cell with its count. The first collection on the
# joint axes is the mesh drawn by histplot; its coordinates are the cell
# corners, so a cell centre is its lower-left corner plus half a cell.
mesh = ax.collections[0]
coords = mesh.get_coordinates()
half_width = (coords[0, 1, 0] - coords[0, 0, 0]) / 2
half_height = (coords[1, 0, 1] - coords[0, 0, 1]) / 2

lower_left_corners = coords[:-1, :-1, :].reshape(-1, 2)
for v, (xv, yv) in zip(mesh.get_array().ravel(), lower_left_corners):
    # Masked entries are skipped so they get no label.
    if not np.ma.is_masked(v):
        ax.text(xv + half_width, yv + half_height, f'{v:.0f}',
                ha='center', va='center', color='white')

# Display the plot
plt.show()

In [None]:
# Silence DeprecationWarning output before importing/plotting below.
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline 

# External package: conda install missingno 
import missingno as mn

# Draw missingno's matrix plot for the full frame (df must already be loaded
# and plt imported by earlier cells); used here to inspect missing values.
mn.matrix(df)
plt.title("Not Sorted",fontsize=22)

plt.show()

In [None]:
# Bin the release year into six labelled historical eras, stored as a new
# categorical column. Intervals are right-inclusive, e.g. (0, 1945].
# NOTE: a later cell reassigns df['Era'] with a coarser two-way split.
df['Era'] = pd.cut(
    df['Year'],
    bins=[0, 1945, 1960, 1975, 1990, 2008, 2027],
    labels=['WW2_or_Before', 'EarlyColdWar', 'MiddleColdWar',
            'LateColdWar', 'BeforeObama', 'AfterObama'],
)
df['Era'].describe()

In [None]:
# Bin the release year into two eras split at 1985, overwriting any existing
# 'Era' column. Intervals are right-inclusive, so 1985 itself falls in
# 'Before1985'.
df['Era'] = pd.cut(
    df['Year'],
    bins=[0, 1985, 2027],
    labels=['Before1985', 'After1985'],
)
df['Era'].describe()

In [None]:
# Group tracks by genre and era. 'Era' is categorical, so observed=False is
# stated explicitly: every Genre/Era combination is kept, including empty ones.
df_grouped = df.groupby(by=['Genre', 'Era'], observed=False)

# Mean popularity score (0-100) of the tracks in each group.
Popularity_rate = df_grouped.Popularity.mean()

# Draw on an explicit Axes so the figure size applies to this chart.
fig, ax = plt.subplots(figsize=(18, 12))
Popularity_rate.plot(kind='barh', ax=ax)
ax.set_title('Mean Popularity by Genre and Era')
ax.set_xlabel('Mean popularity (0-100)')
plt.show()

In [None]:
# Count tracks per (Genre, Era) group, split by mode. Mode is cast to bool,
# so the two columns are False (Mode == 0) and True (Mode != 0).
# The variable name `survival` is kept so any later cell that references it
# still works; it holds mode counts.
survival = pd.crosstab([df['Genre'],
                        df['Era']],  # row categories
                       df.Mode.astype(bool))  # column split

# DataFrame.plot opens its own figure unless it is handed an Axes, which
# would leave a separately created figure blank; pass the Axes explicitly so
# the requested size is the one that gets drawn on.
fig, ax = plt.subplots(figsize=(18, 12))
survival.plot(kind='barh', stacked=True, ax=ax)
ax.set_title('Track Counts by Genre and Era, Split by Mode')
ax.set_xlabel('Number of tracks')
plt.show()

In [3]:
# Side-by-side stacked bar charts of track counts by Era, Genre and Key,
# each split by mode (Mode cast to bool).
# This cell needs `pd`, `plt`, `df` and the 'Era' column from earlier cells;
# run those cells first (running it on a fresh kernel raises NameError).
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 10))

is_major = df.Mode.astype(bool)

for column, panel in zip(['Era', 'Genre', 'Key'], (ax1, ax2, ax3)):
    # One crosstab per category column, drawn on its own panel.
    pd.crosstab(df[column], is_major).plot(kind='bar', stacked=True, ax=panel)
    panel.set_title(f'Mode by {column}')
    panel.set_ylabel('Number of tracks')

plt.show()

NameError: name 'plt' is not defined

## 4. Dimensionality Reduction