In [7]:
import pandas as pd
import requests
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import accuracy_score

url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
response = requests.get(url)
data = StringIO(response.text)
spotify_songs = pd.read_csv(data)
spotify_songs = spotify_songs.dropna()
spotify_songs['track_album_release_date'] = pd.to_datetime(spotify_songs['track_album_release_date'], errors='coerce')
spotify_songs['release_year'] = spotify_songs['track_album_release_date'].dt.year.astype('Int64')
spotify_songs = spotify_songs.drop(columns=['track_album_release_date'])
undesired_columns = ['playlist_id', 'playlist_name', 'track_id', 'track_name', 'track_album_name', 'track_album_id', 'track_artist', 'playlist_genre','playlist_subgenre']
spotify_songs = spotify_songs.drop(columns=undesired_columns)

# Add a 'decade' column
spotify_songs['decade'] = (spotify_songs['release_year'] // 10) * 10

# Filter rows with years between 1960 and 2019
spotify_songs = spotify_songs[(spotify_songs['release_year'] >= 1960) & (spotify_songs['release_year'] <= 2019)]
spotify_songs = spotify_songs.drop(columns=['release_year'])

# Filter songs with popularity greater than 50
popular_songs = spotify_songs[spotify_songs['track_popularity'] > 50]

# Assuming 'decade' is your target variable
X = popular_songs.drop(['decade', 'track_popularity'], axis=1)
y = popular_songs['decade']

# Identify all categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# One-hot encode all categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_columns]))
X_encoded.columns = encoder.get_feature_names_out(categorical_columns)

# Resetting index to ensure consistency
X_encoded.reset_index(drop=True, inplace=True)
X.reset_index(drop=True, inplace=True)

# Drop the original categorical columns and concatenate the encoded ones
X = pd.concat([X.drop(categorical_columns, axis=1), X_encoded], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Display the decision tree rules
tree_rules = export_text(clf, feature_names=X.columns.tolist())
print(tree_rules)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




|--- duration_ms <= 234900.00
|   |--- valence <= 0.66
|   |   |--- duration_ms <= 210143.00
|   |   |   |--- danceability <= 0.46
|   |   |   |   |--- loudness <= -9.85
|   |   |   |   |   |--- instrumentalness <= 0.00
|   |   |   |   |   |   |--- acousticness <= 0.00
|   |   |   |   |   |   |   |--- class: 1970.0
|   |   |   |   |   |   |--- acousticness >  0.00
|   |   |   |   |   |   |   |--- speechiness <= 0.06
|   |   |   |   |   |   |   |   |--- valence <= 0.41
|   |   |   |   |   |   |   |   |   |--- valence <= 0.09
|   |   |   |   |   |   |   |   |   |   |--- class: 1990.0
|   |   |   |   |   |   |   |   |   |--- valence >  0.09
|   |   |   |   |   |   |   |   |   |   |--- valence <= 0.23
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |   |--- valence >  0.23
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4
|   |   |   |   |   |   |   |   |--- valence >  0.41
|   |   |   |   |   |   |