In [None]:
#Import libraries
import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind
import numpy as np
from collections import Counter
from scipy.stats import chisquare
from statsmodels.stats.proportion import proportions_ztest

# Location of the prepared dataset, named once so it is easy to repoint.
DATA_PATH = 'final_data.csv'
df = pd.read_csv(DATA_PATH)



In [None]:
# Parse the Date column once and reuse the parsed values for both columns
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

# Calendar year of each row, used by the time-based hypothesis tests
df['Year'] = parsed_dates.dt.year


In [None]:
# Hypothesis 1: Genres are linked to specific topics
def test_genre_topic_association(df):
    # Create a contingency table for genres and topics
    contingency_table = pd.crosstab(df['Final_genre'], df['Topic_label'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-square test for genre-topic association:\nChi2 Statistic: {chi2}, p-value: {p}")
    return p < 0.05


print("Hypothesis 1:", "Supported" if test_genre_topic_association(df) else "Not Supported")

In [None]:
# Hypothesis 2: There are more profanities used now than in the 90s


def test_profanity_over_time(df):
    # Label decades
    df['decade'] = np.where(df['Year'] < 2000, '90s', 'Recent')
    
    # Calculate mean of 'Has_swear' by decade (proportion of songs with profanity)
    profanity_proportion = df.groupby('decade')['has_swear'].mean()
    
    # Perform t-test to compare the 90s with Recent decades
    t_stat, p = ttest_ind(
        df[df['decade'] == '90s']['has_swear'], 
        df[df['decade'] == 'Recent']['has_swear']
    )
    
    print("Proportion of songs with profanity by decade:\n", profanity_proportion)
    print("T-statistic:", t_stat, "P-value:", p)
    
    # Return whether profanity increased over time
    return p < 0.05 and profanity_proportion['Recent'] > profanity_proportion['90s']


print("Hypothesis 2:", "Supported" if test_profanity_over_time(df) else "Not Supported")

In [None]:
# Hypothesis 3: Hip Hop/Rap songs have richer vocabulary
def test_vocabulary_richness(df):
    # Filter to Hip Hop/Rap and other genres, then compare unique words
    df_hiphop = df[df['Final_genre'] == 'Hip Hop/Rap']
    df_other = df[df['Final_genre'] != 'Hip Hop/Rap']
    t_stat, p = ttest_ind(df_hiphop['Unique_words'], df_other['Unique_words'])
    print(f"T-test for vocabulary richness in Hip Hop/Rap:\nT Statistic: {t_stat}, p-value: {p}")
    return p < 0.05

print("Hypothesis 3:", "Supported" if test_vocabulary_richness(df) else "Not Supported")

In [None]:
#Hypothesis 4: Pop songs are more often about love and loss than other genres

def test_love_and_loss_songs_pop_vs_others(df):
    """Test whether pop songs are about 'Love and Loss' more often than other genres.

    Compares the share of songs whose 'Topic_label' is 'Love and Loss' between
    pop ('Final_genre' equal to 'pop', case-insensitive) and all other rows,
    using a two-sample proportion z-test. Counts, z-statistic, p-value and an
    interpretation are printed.

    Returns:
        bool: True when the difference is significant (p < 0.05) and the pop
        share is the higher one; False otherwise, including when one of the
        two groups is empty.
    """
    # Build each mask once instead of re-filtering the frame for every count.
    is_pop = df['Final_genre'].str.lower() == 'pop'
    is_love_loss = df['Topic_label'] == 'Love and Loss'

    pop_love_loss_count = int((is_pop & is_love_loss).sum())
    other_love_loss_count = int((~is_pop & is_love_loss).sum())
    total_pop_count = int(is_pop.sum())
    total_other_count = int((~is_pop).sum())

    print(f"Pop 'Love and Loss' Count: {pop_love_loss_count}, Other Genres 'Love and Loss' Count: {other_love_loss_count}")

    # A proportion cannot be computed for a group with no songs.
    if total_pop_count == 0 or total_other_count == 0:
        print("Cannot compare proportions: pop or the other genres have no songs.")
        return False

    # Two-sided proportion z-test: pop share vs. share in all other genres
    counts = [pop_love_loss_count, other_love_loss_count]
    nobs = [total_pop_count, total_other_count]
    z_stat, p_value = proportions_ztest(counts, nobs)
    print(f"Z-statistic: {z_stat}, p-value: {p_value}")

    pop_share = pop_love_loss_count / total_pop_count
    other_share = other_love_loss_count / total_other_count
    significant = p_value < 0.05

    # The test is two-sided, so "more" may only be claimed when pop's share is
    # actually the larger one.
    if significant and pop_share > other_share:
        print("There are significantly more love songs about 'Love and Loss' in pop than in other genres.")
    elif significant:
        print("There are significantly fewer songs about 'Love and Loss' in pop than in other genres.")
    else:
        print("There is no significant difference in the number of love songs about 'Love and Loss' in pop compared to other genres.")

    return bool(significant and pop_share > other_share)

# Example usage:
# Ensure your DataFrame `df` is properly formatted
test_love_and_loss_songs_pop_vs_others(df)


In [None]:
# Hypothesis 1: Genres are linked to specific topics
def test_genre_topic_association(df):
    # Create a contingency table for genres and topics
    contingency_table = pd.crosstab(df['Final_genre'], df['Topic_label'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-square test for genre-topic association:\nChi2 Statistic: {chi2}, p-value: {p}")
    return p < 0.05

In [None]:
# Hypothesis 2: There are more profanities used now than in the 90s
def test_profanity_over_time(df, profanity_list):
    # Filter data for the 90s and recent songs
    df['decade'] = np.where(df['Year'] < 2000, '90s', 'Recent')
    # Count profanities
    df['Profanity_count'] = df['Word_frequency'].apply(lambda freq: sum(freq.get(word, 0) for word in profanity_list))
    profanity_counts = df.groupby('decade')['Profanity_count'].mean()
    t_stat, p = ttest_ind(df[df['decade'] == '90s']['Profanity_count'], df[df['decade'] == 'Recent']['Profanity_count'])
    print(f"T-test for profanity usage:\nT Statistic: {t_stat}, p-value: {p}")
    return p < 0.05



In [None]:
# Hypothesis 3: Hip Hop/Rap songs have richer vocabulary
def test_vocabulary_richness(df):
    # Filter to Hip Hop/Rap and other genres, then compare unique words
    df_hiphop = df[df['Final_genre'] == 'Hip Hop/Rap']
    df_other = df[df['Final_genre'] != 'Hip Hop/Rap']
    t_stat, p = ttest_ind(df_hiphop['Unique_words'], df_other['Unique_words'])
    print(f"T-test for vocabulary richness in Hip Hop/Rap:\nT Statistic: {t_stat}, p-value: {p}")
    return p < 0.05




In [None]:
# Hypothesis 4: Some words are very frequent across all genres and times
def test_common_words(df, threshold=0.05):
    all_word_counts = Counter()
    df['Word_frequency'].apply(lambda freq: all_word_counts.update(freq))
    total_songs = len(df)
    common_words = [word for word, count in all_word_counts.items() if count / total_songs > threshold]
    print(f"Common words across all genres and times: {common_words}")
    return common_words



In [None]:
# Run tests
profanity_list = ['fuck', 'shit', 'damn', 'bitch', 'ass', 'fucking', 'nigger', 'nigga', 'cunt', 'dick','asshole']  
# Each test prints its own statistics; only the verdict is reported here.
print("Hypothesis 1:", "Supported" if test_genre_topic_association(df) else "Not Supported")
print("Hypothesis 2:", "Supported" if test_profanity_over_time(df, profanity_list) else "Not Supported")
print("Hypothesis 3:", "Supported" if test_vocabulary_richness(df) else "Not Supported")
# test_common_words prints the list it returns, so it is not printed a second time.
common_words = test_common_words(df)