In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the survey responses used by every later cell (path is relative to the working directory)
clean = pd.read_csv('clean.csv')

## Visualize demographics of participants

In [None]:
def visualize_demographics(df: pd.DataFrame = clean) -> None:
    """
    Plot how many survey participants come from each country.

    Draws a horizontal bar chart with one bar per distinct value of the
    'Country' column, sorted ascending by count. Missing values are ignored.

    Parameters:
    - df: DataFrame
      The survey data; must contain a 'Country' column. The default is the
      `clean` frame as it exists when this cell is executed.

    Returns:
    - None (displays the plot)
    """
    country_counts = df['Country'].dropna().value_counts().sort_values(ascending=True)

    # Avoid handing an empty Series to the plotting call
    if country_counts.empty:
        print('No country information available.')
        return

    # Plotting
    plt.figure(figsize=(10, 6))
    ax = country_counts.plot(kind='barh', color='skyblue')

    ax.set_title('Distribution of Survey Participants by Country')
    ax.set_xlabel('Number of Participants')
    ax.set_ylabel('Country')

    # Add grid
    ax.grid(True, linestyle='--', alpha=0.7)

    # Participant counts are whole numbers, so tick the x-axis on integers that
    # cover the largest count. The ticks are derived from the data instead of
    # from the y-axis bar positions and a fixed range(30); the step grows with
    # the data so that at most ~30 ticks are drawn.
    max_count = int(country_counts.max())
    tick_step = max(1, -(-max_count // 30))  # ceiling division
    ax.set_xticks(range(0, max_count + tick_step, tick_step))

    plt.show()

visualize_demographics(clean)

#print(clean['Country'].dropna().value_counts())

## Analyze Gender

In [None]:
# Participants' gender description
def count_values(series):
    """
    Count how often each distinct value occurs in a Series.

    Parameters:
    - series: Series
      The values to tally.

    Returns:
    - dict mapping each distinct value to its number of occurrences;
      missing values are tallied as well (dropna=False).
    """
    return series.value_counts(dropna=False).to_dict()

# Tally every distinct value of the 'Gender' column (count_values also tallies
# missing answers) and print the resulting dict.
gender_counts = count_values(clean['Gender'])
print(gender_counts)

## Visualize Barplot with mean

In [None]:
def barplot_with_mean(df: pd.DataFrame, numeric_column: str, identifier_column: str = 'ResponseId') -> None:
    """
    Visualize a numeric column as a bar plot with a line marking its mean.

    The bars are drawn by seaborn's barplot with the identifier on the x-axis
    and the numeric value on the y-axis. The mean of the numeric column is
    printed and drawn as a dashed red horizontal line.

    Parameters:
    - df: DataFrame
      The input DataFrame containing the data.
    - numeric_column: str
      The name of the column containing numeric data for visualization.
    - identifier_column: str
      The name of the column containing identifiers (e.g., participant IDs).

    Returns:
    - None (displays the plot)
    """

    # Calculate the average over every row of the frame
    average_value = df[numeric_column].mean()
    print(average_value)

    # Plotting
    plt.figure(figsize=(10, 6))

    # Plot the same rows the average is computed from. Slicing with
    # df.iloc[1:] here would silently drop the first row from the chart
    # (e.g. the smallest value when the caller passes a sorted frame).
    ax = sns.barplot(x=identifier_column, y=numeric_column, data=df, color='lightblue')

    # Draw a horizontal line for the average value
    ax.axhline(y=average_value, color='red', linestyle='--', label=f'Average {numeric_column}')

    plt.title(f'{numeric_column} Distribution with Average {numeric_column}')
    plt.xlabel(identifier_column)
    plt.ylabel(numeric_column)
    plt.legend()
    plt.show()

## Look at participants' age

In [None]:
# Calculate and plot age demographics
# Build an independent frame with `assign` instead of writing into a column of
# a slice of `clean`: that is chained assignment, which pandas flags with
# SettingWithCopyWarning (hidden in this notebook by the global warnings filter).
age_df = clean[['ResponseId', 'Q1']].assign(Q1=lambda frame: frame['Q1'].astype(int))

print(f'Oldest: {age_df.Q1.max()}, Youngest: {age_df.Q1.min()}')
barplot_with_mean(age_df.sort_values(axis=0, by='Q1'), 'Q1', 'ResponseId')

In [None]:
# Histogram of the age column (Q1) with a KDE overlay and a marker for the mean
mean_age = clean['Q1'].mean()

plt.figure(figsize=(10, 6))
sns.histplot(data=clean, x='Q1', bins=10, kde=True)

# Dashed red vertical line at the mean age
plt.axvline(x=mean_age, color='red', linestyle='--', label='Mean Age')

# Title and axis labels are set on the axes that histplot just drew on
plt.gca().set(title='Distribution of Age', xlabel='Age', ylabel='Count')
plt.legend()
plt.show()

In [None]:
# Age is not normally distributed, so summarise it with the median and the
# interquartile range (75th minus 25th percentile) rather than mean and SD
age_quartiles = clean['Q1'].quantile([0.25, 0.75])

median_age = clean['Q1'].median()
iqr = age_quartiles.loc[0.75] - age_quartiles.loc[0.25]

print('Median age: {}, IQR: {}'.format(median_age, iqr))

The same analysis for the number of hours participants listen to music per week

In [None]:
# Plot the distribution of weekly listening hours (Q4) in this dataset
plt.figure(figsize=(10, 6))
sns.histplot(data=clean, x='Q4', bins=10, kde=True)

# Calculate the mean. It gets its own name so that `mean_age` (computed from
# Q1) is not overwritten with a number of hours.
mean_hours = clean['Q4'].mean()

# Add the mean line
plt.axvline(x=mean_hours, color='red', linestyle='--', label='Mean Hours')

plt.title('Music listening per week')
plt.xlabel('Hours')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Weekly listening hours are not normally distributed either, so report the
# median and the interquartile range (75th minus 25th percentile)
hours_quartiles = clean['Q4'].quantile([0.25, 0.75])

median_hours = clean['Q4'].median()
iqr_hours = hours_quartiles.loc[0.75] - hours_quartiles.loc[0.25]

print('Median hours: {}, IQR: {}'.format(median_hours, iqr_hours))

In [None]:
interview = pd.DataFrame({'Age': [23,23,24,26], 'Hours': [10,12,30,30]})

# Plot the age distribution of the `interview` frame
plt.figure(figsize=(10, 6))
sns.histplot(data=interview, x='Age', bins=4, kde=True)

# Calculate the mean under its own name so the `mean_age` of the survey data
# is not overwritten by the interview value
interview_mean_age = interview['Age'].mean()

# Add the mean line
plt.axvline(x=interview_mean_age, color='red', linestyle='--', label='Mean Age')

# The plotted column is 'Age', so title and x label describe age (not hours)
plt.title('Distribution of Age (interviews)')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Median and interquartile range of the 'Age' column of the `interview` frame.
# The results get their own names so that `median_hours` / `iqr_hours` of the
# survey data are not overwritten with age values, and the printed label says
# "age" because that is what is being summarised.
interview_median_age = interview['Age'].median()
interview_iqr_age = interview['Age'].quantile(0.75) - interview['Age'].quantile(0.25)

print(f'Median age: {interview_median_age}, IQR: {interview_iqr_age}')

## Music Streaming Services used

In [None]:
def count_answers(series: pd.Series, separator: str = r',(?!\s*euphoric)') -> dict[str, int]:
    """
    Count in how many rows each answer of a multiple-choice column occurs.

    Every cell is split into individual answers with `separator`; each answer
    is counted at most once per row. Missing cells are skipped instead of being
    counted as an answer called "nan", surrounding whitespace is removed and
    empty fragments are ignored.

    Parameters:
    - series: Series
      One cell per respondent, holding the selected answers as delimited text.
    - separator: str
      Regular expression the cell text is split on. The default splits on
      every comma except one that is followed by optional whitespace and the
      word "euphoric".

    Returns:
    - dict mapping each answer to the number of rows containing it, ordered by
      first appearance (so the order is the same on every run).
    """
    # Create a dict to store the results
    answers_counts: dict[str, int] = {}

    # Iterate through each row in the specified column
    for value in series:

        # An unanswered question is not an answer
        if pd.isna(value):
            continue

        # Split the cell and remove duplicates. dict.fromkeys de-duplicates
        # like a set but keeps first-seen order, which makes the order of the
        # returned dict independent of string hash randomisation.
        fragments = (fragment.strip() for fragment in re.split(separator, str(value)))
        answers = dict.fromkeys(fragment for fragment in fragments if fragment)

        # Update the result dict with the counts
        for answer in answers:
            answers_counts[answer] = answers_counts.get(answer, 0) + 1

    return answers_counts

In [None]:
# Which streaming services participants used mostly:
# `streaming` maps each answer found in 'Q3 Multiple Choice' to the number of
# rows that contain it.
streaming = count_answers(clean['Q3 Multiple Choice'])

In [None]:
def make_percentage(items: list[tuple[str, int]], total: int) -> dict:
    """
    Express each count as a percentage of `total`.

    Parameters:
    - items: list of (label, count) pairs
    - total: int
      The number that corresponds to 100%.

    Returns:
    - dict mapping every label to count / total * 100
    """
    shares = {}
    for item, count in items:
        shares[item] = count / total * 100
    return shares

In [None]:
# Calculate percentages for which music streaming services people use.
# The denominator is the number of rows in `clean`; because a row can contain
# several services, the percentages may add up to more than 100.
total_count = len(clean)
percentages = make_percentage(list(streaming.items()), total_count)
percentages

In [None]:
def visualize_service_distribution(service_percentages):
    """
    Draw a bar chart with one bar per streaming service.

    Parameters:
    - service_percentages: dict
      Maps a service name to its percentage (bar height).

    Returns:
    - None (displays the plot)
    """
    axes = plt.gca()
    axes.bar(
        list(service_percentages.keys()),
        list(service_percentages.values()),
        color=['pink', 'green', 'blue', 'gray'],
    )
    axes.set_xlabel('Streaming Service')
    axes.set_ylabel('Percentage')
    axes.set_title('Streaming Service Distribution')
    axes.set_ylim(0, 100)  # percentages live on a fixed 0-100 scale
    plt.show()

visualize_service_distribution(percentages)

In [None]:
# In how many rows each answer of Q10 occurs
reflected = count_answers(clean['Q10'])
total_count = len(clean)

# The same counts as percentages of all rows and of the Spotify users only
reflect_all = make_percentage(list(reflected.items()), total_count)
reflect_spot = make_percentage(list(reflected.items()), streaming['Spotify'])

# A notebook only renders the last expression of a cell, so the raw counts are
# printed and the two percentage dicts are the cell's final expression.
print(reflected)
reflect_all, reflect_spot

## Q10b: People surprised by the insights they gained through reflection

In [None]:
# Frequency of each Q10b answer; value_counts leaves out missing answers
counts = clean['Q10b'].value_counts()

# Share of each answer among the participants who answered Q10b (in %)
percentages = make_percentage(list(counts.items()), int(clean['Q10b'].count()))
percentages


## Q14: What are people interested in having visualized

In [None]:
# Tally the Q14 answers and convert the tallies to percentages of all rows
interests = count_answers(clean['Q14'])
percentages = make_percentage(list(interests.items()), len(clean))

# One line per answer, rounded to two decimals
for interest, percentage in percentages.items():
    print('{}: {:.2f}%'.format(interest, percentage))


In [None]:
# create a function to sort percentages
def sort_percentages(percentages: dict[str, float]) -> dict[str, float]:
    """
    Return a new dict with the entries ordered from smallest to largest value.

    Entries with equal values keep their original relative order.
    """
    ordered: dict[str, float] = {}
    for label in sorted(percentages, key=percentages.get):
        ordered[label] = percentages[label]
    return ordered

In [None]:
# create a visualization for the percentages

def visualize_interests(interest_percentages: dict[str, float], title: str,
                        wrap_width: int = 30, neutral_label: str = 'nothing') -> None:
    """
    Draw a horizontal bar chart of answer options and their percentages.

    Bars are drawn in the order of the dict. Long option texts are wrapped
    onto several lines, and every option whose text starts with
    `neutral_label` (case-insensitive) is drawn in grey. Both are decided from
    the label text rather than from the bar position, so the chart stays
    correct for any question and any number of options.

    Parameters:
    - interest_percentages: dict
      Maps an answer option to its percentage (0-100).
    - title: str
      The chart title.
    - wrap_width: int
      Maximum number of characters per label line before wrapping.
    - neutral_label: str
      Prefix identifying the "not interested" option; pass '' to disable
      the grey highlighting.

    Returns:
    - None (displays the plot)
    """
    import textwrap  # only needed here, for wrapping the tick labels

    labels = [str(label) for label in interest_percentages]
    percentages = list(interest_percentages.values())

    # Looks better: break long option texts instead of relabelling a fixed
    # position (which would mislabel charts of other questions)
    interests = [textwrap.fill(label, width=wrap_width) for label in labels]

    # make 'nothing' bar grey; 'C0' is matplotlib's default first bar colour
    prefix = neutral_label.lower()
    colors = [
        'grey' if prefix and label.strip().lower().startswith(prefix) else 'C0'
        for label in labels
    ]

    plt.barh(interests, percentages, color=colors)

    plt.xlabel('Participants interested in choice (in %)')

    # Align title so that image compacter
    plt.title(title, loc='right')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlim(0, 100)  # Set x-axis limit to 0-100%

    # Add the data value on head of the bar
    for index, value in enumerate(percentages):
        plt.text(value + 1, index, f"{value:.2f}%")

    plt.show()

visualize_interests(sort_percentages(percentages), 'What insights would participants want to acquire through their music listening history?')


## Q15: Which features of music listening are participants interested in

In [None]:

# In how many rows each Q15 answer occurs
features = count_answers(clean.Q15)

# Convert to percentages of all rows and plot them sorted from smallest to largest
feat_percentages = make_percentage(list(features.items()), len(clean))
visualize_interests(sort_percentages(feat_percentages), "Which music features are participants interested in")

## Q16: Which time periods are people the most interested in having visualized

In [None]:
# In how many rows each Q16 answer occurs, as percentages of all rows
periods = count_answers(clean.Q16)
per_percentages = make_percentage(list(periods.items()), len(clean))
# Plot the percentages sorted from smallest to largest
visualize_interests(sort_percentages(per_percentages), "Which time periods participants are interested in?")

## Q17: Why are people interested in specific time periods

In [None]:
# All Q17 answers that were actually given (missing answers dropped)
c = clean.Q17.dropna()
print(len(c))

# count how many people mentioned "change"; the match ignores case so that
# answers such as "Change over time" are counted as well
change = c.str.contains('change', case=False).sum()
print(change)

# Percentage of people explicitly mentioning change (0 when nobody answered,
# instead of dividing by zero)
change_share = change / len(c) * 100 if len(c) else 0.0
print(f"{change_share:.2f}% of participants explicitly mentioned being interested in changes in their music listening habits")


In [None]:
# select all values from Q17 that mention "how" and "change".
# The match ignores case, and na=False makes unanswered rows explicit
# non-matches instead of relying on how `&` treats missing values.
mentions_how = clean.Q17.str.contains('how', case=False, na=False)
mentions_change = clean.Q17.str.contains('change', case=False, na=False)
how_change = clean[mentions_how & mentions_change]

# show text columns in full whenever a frame is displayed
pd.set_option('display.max_colwidth', None)

# Percentage of people explicitly mentioning how changed.
# The denominator is the number of given answers; len() of the value_counts
# result would be the number of *distinct* answers, which is smaller whenever
# two participants wrote the same text.
how_change_mentions = len(how_change.Q17)
q17 = clean.Q17.dropna().value_counts()
answered = int(clean.Q17.notna().sum())
how_change_share = how_change_mentions / answered * 100 if answered else 0.0
print(f"{how_change_share:.2f}% of participants explicitly mentioned being interested in how their music listening habits had changed")
