In [1]:
import pandas as pd
# from google.colab import drive
# drive.mount('/content/gdrive')
# data = pd.read_excel('/content/gdrive/MyDrive/SpotifyRegression/datos_merged_1986_2023.xlsx')

data = pd.read_excel('datos_merged_1986_2023.xlsx')


In [2]:
# Columns to keep
columns_to_keep = ['popularity', 'danceability', 'year', 'duration_min', 'valence', 'speechiness', 'loudness', 'energy', 'principal_artist_followers']

# Drop columns not present in the columns_to_keep list
columns_to_drop = [col for col in data.columns if col not in columns_to_keep]
data = data.drop(columns=columns_to_drop, axis=1)

In [3]:
data.dropna(inplace=True)  # Dropping rows with missing values for simplicity

# Separate features (X) and target variable (y)
X = data.drop('popularity', axis=1)  # Features
y = data['popularity']  # Target variable

columns_to_handle_outliers = ['danceability', 'year', 'duration_min', 'valence', 'speechiness', 'loudness', 'energy', 'principal_artist_followers']
# columns_to_handle_outliers = ['valence', 'energy', 'principal_artist_followers']

# Function to handle outliers using IQR method
def handle_outliers_iqr(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Filtering values outside the IQR range
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    
    return df

# Handle outliers for selected columns
data = handle_outliers_iqr(data, columns_to_handle_outliers)



In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import StandardScaler

# Assuming 'data' contains your processed DataFrame

# Separating target variable and features
X = data.drop('danceability', axis=1)  # Features
y = data['danceability']  # Target variable

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler

# Assuming 'X_train', 'X_test', 'y_train', 'y_test' are already defined from previous steps

# Outlier handling - using RobustScaler for outlier-resistant scaling
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creating a Linear Regression model
model = LinearRegression()

# Hyperparameter grid for GridSearchCV
param_grid = {
    'fit_intercept': [True, False],
    # Other hyperparameters can be added based on the available options
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')  # Change scoring as needed
grid_search.fit(X_train_scaled, y_train)

# Best parameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Evaluate the model with the best parameters on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Model Evaluation
from sklearn.metrics import r2_score, mean_squared_error
print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))


Best Parameters: {'fit_intercept': True}
R2 Score: 0.27883314941565585
Mean Squared Error: 0.016379727424861174


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'data' contains your processed DataFrame

# Separating target variable and features
X = data.drop('popularity', axis=1)  # Features
y = data['popularity']  # Target variable

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the Decision Tree Classifier model
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = dt_classifier.predict(X_test)

# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.09085290482076638
Classification Report:
              precision    recall  f1-score   support

          45       0.00      0.00      0.00         0
          46       0.00      0.00      0.00         5
          47       0.40      0.29      0.33         7
          48       0.00      0.00      0.00         4
          49       0.20      0.12      0.15        16
          50       0.14      0.12      0.13        16
          51       0.33      0.40      0.36        10
          52       0.00      0.00      0.00        16
          53       0.25      0.19      0.22        21
          54       0.31      0.29      0.30        34
          55       0.24      0.22      0.23        37
          56       0.24      0.23      0.24        39
          57       0.16      0.16      0.16        51
          58       0.15      0.17      0.16        59
          59       0.08      0.08      0.08        53
          60       0.09      0.11      0.10        57
          61       0.14     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.svm import SVC

# Creating and training the SVM model
model = SVC(kernel='linear', random_state=42)
model.fit(X_train, y_train)

# Predicting on the test set
y_pred = model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.