In [None]:
!pip install pandas scikit-learn

In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the track metadata. tracks.csv carries two header rows, so its columns
# load as a (group, field) MultiIndex; the first column becomes the row index.
tracks = pd.read_csv("tracks.csv", index_col=0, header=[0, 1])
# NOTE(review): features.csv is read with a single header row. If this is the
# stock multi-header FMA export it may need header=[0, 1, 2] -- confirm.
features = pd.read_csv("features.csv", index_col=0)

# Keep only the tracks that belong to the "small" subset.
tracks_filtered = tracks[tracks[("set", "subset")] == "small"]
tracks_id = tracks_filtered.index
genre = tracks_filtered[("track", "genre_top")]
# Align the feature rows with the selected tracks (same ids, same order).
features_filtered = features.reindex(tracks_id)

genre_df = genre.reset_index(name="genre")
features_df = features_filtered.reset_index()

print(features_df.head())

# DataFrame.drop returns a new frame, so the result has to be assigned;
# otherwise the track_id identifier stays in the feature matrix and is
# passed to the classifier as if it were an audio feature.
features_df = features_df.drop(columns=['track_id'])
genre_df = genre_df[['genre']]

In [None]:
# Cast the feature table to float and flatten the labels to 1-D *before*
# splitting, so the train and test partitions share the same dtype and shape.
# The split itself is unchanged: same test_size and random_state.
X = features_df.astype(float)
y = genre_df.values.ravel()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features, then classify with an SVM.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42)),
])

In [None]:
# Fit the scaler + SVM pipeline on the training partition.
print("Training the model")
pipeline.fit(X_train, y_train)

# Predict labels for the held-out partition.
print("Making predictions")
y_pred = pipeline.predict(X_test)

# Overall fraction of correctly classified test samples.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}\n")

# Per-class precision / recall / F1. zero_division=1 makes metrics whose
# denominator is zero report as 1 instead of emitting a warning.
print("Classification Report:")
report = classification_report(y_test, y_pred, zero_division=1)
print(report)

# Dataset Visualization and Analysis

In [None]:
# Import Required Libraries for Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style for better-looking plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# fall back to the legacy "seaborn" name so older installs do not raise here.
_style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_style_name)
sns.set_palette("husl")

print("Visualization libraries imported successfully!")

In [None]:
# Load and Inspect Dataset
# tracks.csv has two header rows, so the columns are read as a MultiIndex.
# df_viz is set to None when the file is missing so later cells can skip work.
try:
    df_viz = pd.read_csv("tracks.csv", header=[0, 1])
except FileNotFoundError:
    print("Error: tracks.csv not found. Please ensure the file is in the current directory.")
    df_viz = None
else:
    first_columns = df_viz.columns.tolist()[:10]  # only preview the first 10 columns
    print(f"Dataset loaded successfully. Shape: {df_viz.shape}")
    print(f"Columns: {first_columns}...")

In [None]:
# Data Preprocessing and Cleaning
if df_viz is None:
    print("Cannot proceed with data preprocessing as dataset was not loaded.")
else:
    # The top-level genre lives under the ("track", "genre_top") column pair.
    genre_key = ("track", "genre_top")

    if genre_key not in df_viz.columns:
        # Show any genre-like columns to help diagnose the schema, then fail.
        possible_columns = [col for col in df_viz.columns if 'genre' in str(col).lower()]
        print(f"Available columns with 'genre': {possible_columns}")
        raise KeyError("Could not find 'genre_top' column in tracks.csv")

    genres = df_viz[genre_key]
    print("Successfully found 'genre_top' column")

    # Discard tracks that have no genre label.
    genres = genres.dropna()
    print(f"Total tracks with genre information: {len(genres)}")

    # Basic statistics (missing values are counted on the unfiltered column).
    print(f"Unique genres: {genres.nunique()}")
    print(f"Missing values: {df_viz[genre_key].isnull().sum()}")

In [None]:
# Genre Distribution Analysis
if 'genres' in locals() and genres is not None:
    # Count occurrences (value_counts orders genres from most to least common)
    genre_counts = genres.value_counts()
    total_tracks = len(genres)
    print(f"Number of unique genres: {len(genre_counts)}")
    print("\nGenre distribution:")
    print(genre_counts)

    # Display top 5 and bottom 5 genres.
    # The loop variables are deliberately NOT called `genre`: loops at cell
    # top level bind notebook globals, and `genre` is the label Series built
    # by the data-loading cell.
    print("\nTop 5 most common genres:")
    for genre_name, n_tracks in genre_counts.head(5).items():
        print(f"  {genre_name}: {n_tracks:,} tracks ({n_tracks/total_tracks*100:.1f}%)")

    print("\nTop 5 least common genres:")
    for genre_name, n_tracks in genre_counts.tail(5).items():
        print(f"  {genre_name}: {n_tracks:,} tracks ({n_tracks/total_tracks*100:.1f}%)")
else:
    print("Cannot proceed with analysis - genre data not available.")

In [None]:
# Create Multiple Visualization Plots
if 'genre_counts' in locals() and genre_counts is not None:
    # One figure with four panels: bar, pie, horizontal bar, text summary.
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Bar plot
    genre_counts.plot(kind="bar", ax=axes[0,0], color='skyblue')
    axes[0,0].set_title("Track Count per Genre (Bar Plot)", fontsize=14, fontweight='bold')
    axes[0,0].set_xlabel("Genre")
    axes[0,0].set_ylabel("Number of Tracks")
    axes[0,0].tick_params(axis='x', rotation=45)

    # 2. Pie chart (top 8 genres + others)
    top_genres = genre_counts.head(8)
    # Slice positionally: tail(len - 8) receives a negative argument when
    # there are fewer than 8 genres and would then return rows that are
    # already part of top_genres, double-counting them as "Others".
    others_count = genre_counts.iloc[8:].sum()
    if others_count > 0:
        pie_data = pd.concat([top_genres, pd.Series([others_count], index=['Others'])])
    else:
        pie_data = top_genres

    axes[0,1].pie(pie_data.values, labels=pie_data.index, autopct='%1.1f%%', startangle=90)
    axes[0,1].set_title("Genre Distribution (Pie Chart)", fontsize=14, fontweight='bold')

    # 3. Horizontal bar plot (same order as genre_counts)
    genre_counts.plot(kind="barh", ax=axes[1,0], color='lightcoral')
    axes[1,0].set_title("Track Count per Genre (Horizontal)", fontsize=14, fontweight='bold')
    axes[1,0].set_xlabel("Number of Tracks")
    axes[1,0].set_ylabel("Genre")

    # 4. Statistics summary rendered as text in the fourth panel
    axes[1,1].axis('off')
    # One "name: count (share%)" line per top-5 genre, built up front so the
    # summary template below stays readable.
    top5_lines = "\n".join(
        f"{name}: {n:,} ({n/len(genres)*100:.1f}%)"
        for name, n in genre_counts.head(5).items()
    )
    stats_text = f"""
Dataset Statistics:

Total Tracks: {len(genres):,}
Unique Genres: {len(genre_counts)}

Top 5 Genres:
{top5_lines}

Least Common Genre:
{genre_counts.index[-1]}: {genre_counts.iloc[-1]:,} tracks

Average tracks per genre: {genre_counts.mean():.0f}
Median tracks per genre: {genre_counts.median():.0f}
"""
    axes[1,1].text(0.1, 0.9, stats_text, transform=axes[1,1].transAxes, 
                   fontsize=12, verticalalignment='top', fontfamily='monospace',
                   bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8))

    plt.tight_layout()
    plt.show()
else:
    print("Cannot create visualizations - genre count data not available.")

In [None]:
# Generate Detailed Bar Chart with Labels
if 'genre_counts' in locals() and genre_counts is not None:
    # Larger single-panel bar chart, one bar per genre, for better readability.
    positions = range(len(genre_counts))
    detail_fig, detail_ax = plt.subplots(figsize=(12, 8))
    bars = detail_ax.bar(positions, genre_counts.values, color='steelblue', alpha=0.8)
    detail_ax.set_title("Detailed Track Count per Genre", fontsize=16, fontweight='bold', pad=20)
    detail_ax.set_xlabel("Genre", fontsize=12)
    detail_ax.set_ylabel("Number of Tracks", fontsize=12)
    detail_ax.set_xticks(positions)
    detail_ax.set_xticklabels(genre_counts.index, rotation=45, ha='right')

    # Write each bar's count just above its top edge (1% of its height higher).
    for bar in bars:
        height = bar.get_height()
        detail_ax.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                       f'{int(height):,}', ha='center', va='bottom', fontsize=10)

    detail_ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("Cannot create detailed chart - genre count data not available.")

In [None]:
# Save Visualizations
def _plot_genre_overview(counts, total_tracks):
    """Build the 2x2 overview figure: bar, pie, horizontal bar and text summary.

    Args:
        counts: Series of track counts indexed by genre name, drawn in the
            order given.
        total_tracks: Denominator for the per-genre percentages in the summary.

    Returns:
        The matplotlib Figure holding the four panels.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Panel 1: vertical bar plot.
    counts.plot(kind="bar", ax=axes[0, 0], color='skyblue')
    axes[0, 0].set_title("Track Count per Genre (Bar Plot)", fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel("Genre")
    axes[0, 0].set_ylabel("Number of Tracks")
    axes[0, 0].tick_params(axis='x', rotation=45)

    # Panel 2: pie chart of the first 8 genres plus an "Others" slice.
    top_genres = counts.head(8)
    # Positional slice: tail(len - 8) gets a negative argument when there are
    # fewer than 8 genres and would double-count rows already in top_genres.
    others_count = counts.iloc[8:].sum()
    if others_count > 0:
        pie_data = pd.concat([top_genres, pd.Series([others_count], index=['Others'])])
    else:
        pie_data = top_genres
    axes[0, 1].pie(pie_data.values, labels=pie_data.index, autopct='%1.1f%%', startangle=90)
    axes[0, 1].set_title("Genre Distribution (Pie Chart)", fontsize=14, fontweight='bold')

    # Panel 3: horizontal bar plot.
    counts.plot(kind="barh", ax=axes[1, 0], color='lightcoral')
    axes[1, 0].set_title("Track Count per Genre (Horizontal)", fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel("Number of Tracks")
    axes[1, 0].set_ylabel("Genre")

    # Panel 4: text-only statistics summary.
    axes[1, 1].axis('off')
    top5_lines = "\n".join(
        f"{name}: {n:,} ({n/total_tracks*100:.1f}%)"
        for name, n in counts.head(5).items()
    )
    stats_text = f"""Dataset Statistics:

Total Tracks: {total_tracks:,}
Unique Genres: {len(counts)}

Top 5 Genres:
{top5_lines}

Least Common Genre:
{counts.index[-1]}: {counts.iloc[-1]:,} tracks

Average tracks per genre: {counts.mean():.0f}
Median tracks per genre: {counts.median():.0f}"""
    axes[1, 1].text(0.1, 0.9, stats_text, transform=axes[1, 1].transAxes,
                    fontsize=12, verticalalignment='top', fontfamily='monospace',
                    bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8))

    fig.tight_layout()
    return fig


def _plot_genre_detail(counts):
    """Build the single-panel bar chart with a count label above every bar.

    Args:
        counts: Series of track counts indexed by genre name.

    Returns:
        The matplotlib Figure holding the chart.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    positions = range(len(counts))
    bars = ax.bar(positions, counts.values, color='steelblue', alpha=0.8)
    ax.set_title("Detailed Track Count per Genre", fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel("Genre", fontsize=12)
    ax.set_ylabel("Number of Tracks", fontsize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(counts.index, rotation=45, ha='right')

    # Place each count slightly (1% of the bar height) above its bar.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                f'{int(height):,}', ha='center', va='bottom', fontsize=10)

    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    return fig


if 'genre_counts' in locals() and genre_counts is not None:
    # Save through the Figure object so the file always contains the figure
    # that was just built, independent of pyplot's "current figure" state.
    overview_fig = _plot_genre_overview(genre_counts, len(genres))
    overview_fig.savefig("genre_distribution_complete.png", dpi=300, bbox_inches='tight')
    print("Comprehensive visualization saved as 'genre_distribution_complete.png'")
    plt.show()

    # Save detailed bar chart
    detail_fig = _plot_genre_detail(genre_counts)
    detail_fig.savefig("genre_distribution_detailed.png", dpi=300, bbox_inches='tight')
    print("Detailed bar chart saved as 'genre_distribution_detailed.png'")
    plt.show()
else:
    print("Cannot save visualizations - genre count data not available.")