In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load dataset with a different encoding
file_path = 'D:/Spotify.csv'
df = pd.read_csv(file_path, encoding='latin1')

# Display the first few rows of the dataset
print(df.head())

# Check for the 'streams' column
if 'streams' not in df.columns:
    raise KeyError("'streams' column not found in the dataset.")

# Data preprocessing
# Drop columns that are not needed (e.g., 'id', 'name', 'artist' if they exist)
df = df.drop(columns=['id', 'name', 'artist'], errors='ignore')

# Handle missing values if any
df = df.dropna()

# Define features and target variable
# Assuming 'streams' is the target variable
X = df.drop(columns=['streams'])
y = df['streams']

# Convert categorical columns to dummy variables if there are any
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
cross_val_score_mean = np.mean(cross_val_score(model, X_train, y_train, cv=10))

# Custom accuracy calculation
def regression_accuracy(y_true, y_pred, tolerance=0.1):
    accurate_predictions = np.abs((y_true - y_pred) / y_true) < tolerance
    accuracy = np.mean(accurate_predictions)
    return accuracy

accuracy = regression_accuracy(y_test, y_pred)

evaluation_metrics = {
    "R2 Score": r2,
    "Mean Absolute Error": mae,
    "Mean Squared Error": mse,
    "Root Mean Squared Error": rmse,
    "Cross Validation Score": cross_val_score_mean,
    "Accuracy": accuracy
}

print(evaluation_metrics)

# Function to predict streams based on user input
def predict_streams(user_input):
    # Convert user input to dataframe
    user_df = pd.DataFrame([user_input])
    # Convert categorical columns to dummy variables and align with training data
    user_df = pd.get_dummies(user_df)
    user_df = user_df.reindex(columns=X.columns, fill_value=0)
    # Predict streams
    predicted_streams = model.predict(user_df)
    return predicted_streams[0]

# Example user input
user_input = {
    'acousticness': 0.654,
    'danceability': 0.735,
    'energy': 0.812,
    'instrumentalness': 0.000,
    'liveness': 0.102,
    'loudness': -5.934,
    'speechiness': 0.0461,
    'valence': 0.624,
    'tempo': 123.456
    # Add other features as per your dataset
}

predicted_streams = predict_streams(user_input)
print(f'Predicted Streams: {predicted_streams}')


                        Track                    Album Name          Artist  \
0         MILLION DOLLAR BABY  Million Dollar Baby - Single   Tommy Richman   
1                 Not Like Us                   Not Like Us  Kendrick Lamar   
2  i like the way you kiss me    I like the way you kiss me         Artemas   
3                     Flowers              Flowers - Single     Miley Cyrus   
4                     Houdini                       Houdini          Eminem   

  Release Date          ISRC Streams  Track Score Spotify Streams  \
0    4/26/2024  QM24S2402528       1        725.4     390,470,936   
1     5/4/2024  USUG12400910       2        545.9     323,703,884   
2    3/19/2024  QZJ842400387       3        538.4     601,309,283   
3    1/12/2023  USSM12209777       4        444.9   2,031,280,633   
4    5/31/2024  USUG12403398       5        423.3     107,034,922   

  Spotify Playlist Count Spotify Playlist Reach  ...  SiriusXM Spins  \
0                 30,716            19

KeyError: "'streams' column not found in the dataset."

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
df['track'].value_counts()