### ARIMA using Monthly Data

In [45]:
#!pip install pandas
#!pip install numpy
#!pip install statsmodels
#!pip install scikit-learn

In [44]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [67]:
artist_data = pd.read_csv('Data/Artist_Pop_App.csv')

In [4]:
# Aggregate data by Artist, Year, and Month
time_series_data = artist_data.groupby(['Year', 'Month', 'Artist (Ind.)'])['Appearances'].sum().reset_index()

# Ensure a consistent time series for each artist by filling missing months
artists = time_series_data['Artist (Ind.)'].unique()
filled_data = []

In [5]:
for artist in artists:
    artist_data_filtered = time_series_data[time_series_data['Artist (Ind.)'] == artist]
    artist_data_filtered['Date'] = pd.to_datetime(artist_data_filtered[['Year', 'Month']].assign(Day=1))
    artist_data_full = artist_data_filtered.set_index('Date').asfreq('MS', fill_value=0).reset_index()
    artist_data_full['Artist (Ind.)'] = artist
    filled_data.append(artist_data_full)

time_series_data_full = pd.concat(filled_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_data_filtered['Date'] = pd.to_datetime(artist_data_filtered[['Year', 'Month']].assign(Day=1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_data_filtered['Date'] = pd.to_datetime(artist_data_filtered[['Year', 'Month']].assign(Day=1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  art

#### Fiting in the Model

In [6]:
# Time series prediction for each artist
forecast_results = []

for artist in artists:
    # Filter data for the artist
    artist_data_filtered = time_series_data_full[time_series_data_full['Artist (Ind.)'] == artist]
    artist_data_filtered.set_index('Date', inplace=True)
    ts_data = artist_data_filtered['Appearances']
    
    # Skip artists with less than 12 data points
    if len(ts_data) < 12:
        forecast_results.append({
            'Artist': artist,
            'Total Appearances 2024': 0  # Default to 0 if insufficient data
        })
        continue

    # Fit the ARIMA model
    try:
        model = ARIMA(ts_data, order=(1, 1, 1))  # Order can be adjusted (p, d, q)
        model_fit = model.fit()
        
        # Forecast for the year 2024
        forecast = model_fit.forecast(steps=12)  # Forecasting 12 months into the future
        total_appearances_2024 = forecast.sum()

        forecast_results.append({
            'Artist': artist,
            'Total Appearances 2024': total_appearances_2024
        })

    except Exception as e:
        # Handle cases where ARIMA fails to converge
        forecast_results.append({
            'Artist': artist,
            'Total Appearances 2024': 0
        })
        print(f"ARIMA failed for {artist}: {e}")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

In [42]:
# Evaluation
# Aggregate and prepare the data
artist_data['Date'] = pd.to_datetime(artist_data[['Year', 'Month']].assign(Day=1))
time_series_data = artist_data.groupby(['Date', 'Artist (Ind.)']).mean().reset_index()

# Select the single artist for analysis
artist = 'Drake'  # Replace with the artist name you want to analyze
artist_df = time_series_data[time_series_data['Artist (Ind.)'] == artist]

# Prepare the data for ARIMA
arima_data = artist_df.set_index('Date')['Appearances']  # Use 'Appearances' as the time series

# Train-Test Split
train_data = arima_data[arima_data.index < '2023-01-01']  # Data before 2023 for training
test_data = arima_data[arima_data.index >= '2023-01-01']  # Data in 2023 onwards for testing

# Skip evaluation if not enough data
if len(test_data) == 0:
    print("Not enough test data for evaluation.")
else:
    # Train the ARIMA model
    model = ARIMA(train_data, order=(1, 1, 1))  # Adjust (p, d, q) as needed
    model_fit = model.fit()

    # Forecast for the test period
    forecast_steps = len(test_data)
    forecast = model_fit.get_forecast(steps=forecast_steps)
    forecast_values = forecast.predicted_mean
    forecast_index = test_data.index

    # Evaluation
    actuals = test_data.values
    predictions = forecast_values.values

    # Calculate Baseline (mean of training data)
    baseline_predictions = np.full_like(actuals, train_data.mean())
    baseline_mae = mean_absolute_error(actuals, baseline_predictions)
    print(f"Baseline MAE: {baseline_mae}")

    # Calculate MAE and RMSE
    mae = mean_absolute_error(actuals, predictions)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))

    # Display Metrics
    metrics_table = pd.DataFrame({
        'Metric': ['Mean Absolute Error (MAE)', 'Root Mean Squared Error (RMSE)'],
        'Value': [mae, rmse]
    })

    print(metrics_table)


Baseline MAE: 44.32777777777777
                           Metric      Value
0       Mean Absolute Error (MAE)  22.557980
1  Root Mean Squared Error (RMSE)  28.886867


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [7]:
# Combine all results into a DataFrame
forecast_summary = pd.DataFrame(forecast_results)

forecast_summary = forecast_summary.sort_values(by='Total Appearances 2024', ascending=False)


#forecast_summary.to_csv('Data/ARIMA_Appearances.csv', index=False)

In [8]:
forecast_summary.head(20)

Unnamed: 0,Artist,Total Appearances 2024
203,The Weeknd,3668.077949
328,Bad Bunny,2659.68923
944,Feid,2626.902497
197,Taylor Swift,2443.338214
633,Rich Brian,1903.152007
59,Eminem,1693.198066
56,Ed Sheeran,1546.899892
788,a-ha,1352.247143
421,KAROL G,1340.500546
53,Drake,1306.287523


In [56]:
avg = pd.read_csv('Data/ARIMA_Appearances.csv')
avg = avg.sort_values(by = 'Total Appearances 2024', ascending = False)
avg = avg.head(20)
#avg = forecast_summary.head(20)

In [57]:
top20 = list(avg['Artist'])

In [58]:
top20

['The Weeknd',
 'Bad Bunny',
 'Feid',
 'Taylor Swift',
 'Rich Brian',
 'Eminem',
 'Ed Sheeran',
 'a-ha',
 'KAROL G',
 'Drake',
 'Metro Boomin',
 'Arctic Monkeys',
 'Harry Styles',
 'Mitchell Ayres & His Orchestra',
 '21 Savage',
 'Coldplay',
 'Imagine Dragons',
 'Pop Smoke',
 'Junior H',
 'SZA']

In [59]:
# Filter for the listed artists
filtered_data = artist_data[artist_data['Artist (Ind.)'].isin(top20)]

# Calculate average 'Points (Total)' for each of these artists
average_points = filtered_data.groupby('Artist (Ind.)')['Points (Total)'].mean().reset_index()
average_points.rename(columns={'Points (Total)': 'Avg Points (Total)'}, inplace=True)
average_points = average_points.sort_values(by='Avg Points (Total)', ascending=False)
average_points = average_points.rename(columns = {'Artist (Ind.)' : 'Artist'})

In [60]:
# Merge the datasets if necessary (e.g., based on 'Artist (Ind.)')
avg = avg.merge(average_points, on='Artist', how='inner')

In [61]:
avg['Weighted Points'] = avg['Total Appearances 2024'] * avg['Avg Points (Total)']

In [62]:
avg

Unnamed: 0,Artist,Total Appearances 2024,Avg Points (Total),Weighted Points
0,The Weeknd,3668.077949,111.302271,408265.405014
1,Bad Bunny,2659.68923,115.978593,308467.013622
2,Feid,2626.902496,104.720901,275091.594937
3,Taylor Swift,2443.338213,99.284632,242585.935011
4,Rich Brian,1903.275046,14.116667,26867.899395
5,Eminem,1693.198065,61.040006,103352.820156
6,Ed Sheeran,1546.899934,91.160997,141016.940233
7,a-ha,1352.24758,27.77381,37557.066728
8,KAROL G,1340.500699,112.695287,151068.110584
9,Drake,1306.287523,96.946805,126640.401788


In [63]:
avg = avg.sort_values(by='Weighted Points',ascending = False)

In [64]:
avg

Unnamed: 0,Artist,Total Appearances 2024,Avg Points (Total),Weighted Points
0,The Weeknd,3668.077949,111.302271,408265.405014
1,Bad Bunny,2659.68923,115.978593,308467.013622
2,Feid,2626.902496,104.720901,275091.594937
3,Taylor Swift,2443.338213,99.284632,242585.935011
8,KAROL G,1340.500699,112.695287,151068.110584
6,Ed Sheeran,1546.899934,91.160997,141016.940233
9,Drake,1306.287523,96.946805,126640.401788
12,Harry Styles,996.957582,110.216453,109881.128953
13,Mitchell Ayres & His Orchestra,989.320366,108.247863,107091.815735
5,Eminem,1693.198065,61.040006,103352.820156


In [65]:
avg.to_csv('Data/ARIMA_Weighted_pts.csv')

### ARIMA using Daily Data

In [26]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

In [27]:
# Load dataset
artist_data = pd.read_csv('Dataset/Final_data.csv')  # Replace with your dataset path

In [28]:
# Aggregate data by Artist and Date
time_series_data = artist_data.groupby(['Date', 'artist'])['points'].sum().reset_index()

# Convert the 'Date' column to datetime format
time_series_data['Date'] = pd.to_datetime(time_series_data['Date'])

# Ensure a consistent time series for each artist by filling missing dates
artists = time_series_data['artist'].unique()
filled_data = []

In [29]:
for artist in artists:
    artist_data_filtered = time_series_data[time_series_data['artist'] == artist]
    artist_data_filtered.set_index('Date', inplace=True)
    artist_data_full = artist_data_filtered.asfreq('D', fill_value=0).reset_index()
    artist_data_full['artist'] = artist
    filled_data.append(artist_data_full)

time_series_data_full = pd.concat(filled_data)

#### Fiting in the Model

In [30]:
# Time series prediction for each artist
forecast_results = []

for artist in artists:
    # Filter data for the artist
    artist_data_filtered = time_series_data_full[time_series_data_full['artist'] == artist]
    artist_data_filtered.set_index('Date', inplace=True)
    ts_data = artist_data_filtered['points']
    
    # Skip artists with less than 365 data points (1 year of daily data)
    if len(ts_data) < 365:
        forecast_results.append({
            'Artist': artist,
            'Total Points 2024': 0  # Default to 0 if insufficient data
        })
        continue

    # Fit the ARIMA model
    try:
        model = ARIMA(ts_data, order=(1, 1, 1))  # Order can be adjusted (p, d, q)
        model_fit = model.fit()
        
        # Forecast for 365 days (2024)
        forecast = model_fit.forecast(steps=365)
        total_points_2024 = forecast.sum()

        forecast_results.append({
            'Artist': artist,
            'Total Points 2024': total_points_2024
        })

    except Exception as e:
        # Handle cases where ARIMA fails to converge
        forecast_results.append({
            'Artist': artist,
            'Total Points 2024': 0
        })
        print(f"ARIMA failed for {artist}: {e}")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 

In [34]:
# Evaluation
# Load dataset
artist_data = pd.read_csv('Dataset/final_data.csv')

# Aggregate data by Artist and Date
time_series_data = artist_data.groupby(['Date', 'artist'])['points'].sum().reset_index()

# Convert the 'Date' column to datetime format
time_series_data['Date'] = pd.to_datetime(time_series_data['Date'])

# Choose a single artist
artist_name = 'Drake'  
artist_data_filtered = time_series_data[time_series_data['artist'] == artist_name]

# Handle Outliers in the 'points' Column
q1 = artist_data_filtered['points'].quantile(0.25)
q3 = artist_data_filtered['points'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Replace outliers with bounds
artist_data_filtered['points'] = artist_data_filtered['points'].clip(lower=lower_bound, upper=upper_bound)

# Prepare the data for ARIMA
arima_data = artist_data_filtered.set_index('Date')['points']

# Split the data into training and testing sets
train_data = arima_data[arima_data.index < '2023-01-01']
test_data = arima_data[arima_data.index >= '2023-01-01']

# Train the ARIMA model
model = ARIMA(train_data, order=(1,1,1))
model_fit = model.fit()

# Make predictions for the testing period
forecast = model_fit.forecast(steps=len(test_data))
predictions = forecast.values
actuals = test_data.values

# Evaluate the model
mae = mean_absolute_error(actuals, predictions)
mse = mean_squared_error(actuals, predictions)
rmse = np.sqrt(mse)

# Display the metrics
metrics_table = pd.DataFrame({
    'Metric': ['Mean Absolute Error (MAE)', 'Root Mean Squared Error (RMSE)'],
    'Value': [mae, rmse]
})

print(metrics_table)

                           Metric      Value
0       Mean Absolute Error (MAE)  46.334169
1  Root Mean Squared Error (RMSE)  62.177907


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_data_filtered['points'] = artist_data_filtered['points'].clip(lower=lower_bound, upper=upper_bound)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


In [31]:
# Combine all results into a DataFrame
forecast_summary = pd.DataFrame(forecast_results)
forecast_summary = forecast_summary.sort_values(by = 'Total Points 2024',ascending = False)
# Save the forecast summary to a CSV file
#forecast_summary.to_csv('Data/ARIMA_Date.csv', index=False)

In [32]:
forecast_summary.head(20)

Unnamed: 0,Artist,Total Points 2024
145,The Weeknd,291866.930209
140,Taylor Swift,265442.699636
280,Bad Bunny,225249.303133
637,Feid,209997.596481
271,Miley Cyrus,135309.296793
290,SZA,134024.586404
131,Shakira,127026.153621
503,Arctic Monkeys,114261.010637
43,Eminem,113754.367075
255,Harry Styles,111204.516183
