# 1 - DEFINE THE PROBLEM
We aim to build a classification model that predicts whether a given airplay was the first time that song was aired (the `First?` column), using data on classic rock airplays from U.S. radio stations.

In [None]:
# 2 - IMPORT REQUIRED LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# 3 - LOAD THE DATA
# Read the entire `rock_plays` table from the SQLite database into a DataFrame.
import sqlite3

conn = sqlite3.connect('../datasets/classic_rock.db')
try:
    df = pd.read_sql_query('SELECT * FROM rock_plays', conn)
finally:
    # Always close the connection so the kernel does not keep the database
    # file open, even when the query raises.
    conn.close()

# Preview the first rows as the cell output.
df.head()

In [None]:
# 4 - EDA (Exploratory Data Analysis)
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Number of missing values in each column.
print(df.isnull().sum())
# Summary statistics for all columns (numeric and non-numeric); kept as the
# last expression so the notebook renders it as a table.
df.describe(include='all')

In [None]:
# 5 - VISUALIZE THE DATA
# Each seaborn call returns the Axes it drew on, so the title is attached to
# that Axes directly before the figure is shown.

# How many plays fall into each value of the target column.
sns.countplot(x='First?', data=df).set_title('Distribution of First-Time Airplays')
plt.show()

# Histogram of the TIME column, using 30 bins.
sns.histplot(df['TIME'], bins=30).set_title('Distribution of Play Time')
plt.show()

# Play counts per day of the week, split by the target column.
sns.countplot(x='day_of_week', hue='First?', data=df).set_title('First Airplays by Day of the Week')
plt.show()

In [None]:
# 6 - PREPROCESS THE DATA
# Only these four columns are used as model inputs; every other column of df
# is left out of the feature set. The copy keeps X independent of df.
X = df.loc[:, ['Song Clean', 'ARTIST CLEAN', 'CALLSIGN', 'day_of_week']].copy()
y = df['First?']

# All selected features are treated as categorical.
categorical_features = list(X.columns)

# One-hot encode every categorical column. handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that were not seen during fit.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# NOTE(review): the encoder is fitted on the full dataset here, before the
# train/test split performed in step 7 — confirm this is intended.
X_processed = preprocessor.fit_transform(X)
print(f'✅ Processed features shape: {X_processed.shape}')

In [None]:
# 7 - SPLIT THE DATA
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
# NOTE(review): consider stratify=y if the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
print(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')

# Pie chart of the train/test proportions.
# Row counts are taken from .shape[0] rather than len(): the one-hot encoded
# features may be a SciPy sparse matrix, and len() raises TypeError on those.
labels = ['Train', 'Test']
sizes = [X_train.shape[0], X_test.shape[0]]
plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=['skyblue', 'lightgreen'],
        wedgeprops={'edgecolor': 'black'})
plt.title('Train/Test Split')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()