In [None]:
from google.colab import drive
import pandas as pd


# Mount Google Drive at /content/drive so the dataset CSVs stored under
# "My Drive" can be read by the loading cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from fastai.collab import CollabDataLoaders, collab_learner
import matplotlib.pyplot as plt
from fastai.tabular.all import RandomSplitter
from fastai.collab import *
from fastai.tabular.all import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim


In [None]:
# Folder on the mounted Google Drive that holds the dataset. Change this one
# constant to run the notebook against a different location.
DATA_DIR = '/content/drive/My Drive/Spotify_Dataset'

# Track metadata and per-user listening history.
music_info_data = pd.read_csv(f'{DATA_DIR}/Music_Info.csv')
listening_history_data = pd.read_csv(f'{DATA_DIR}/User_Listening_History.csv')

In [None]:
# Columns of the track metadata to attach to every listening-history row.
track_columns = [
    'track_id', 'name', 'artist', 'genre', 'year', 'duration_ms',
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Left join: every listening-history row is kept, even when its track_id has
# no matching metadata row.
listening_history_merged = listening_history_data.merge(
    music_info_data[track_columns],
    on='track_id',
    how='left',
)

In [None]:
# Add a human-readable "song" label to the merged frame:
# "<track name>, <artist>, <year>".
track_name = listening_history_merged["name"]
artist_name = listening_history_merged["artist"]
release_year = listening_history_merged["year"].astype(str)
listening_history_merged["song"] = track_name + ", " + artist_name + ", " + release_year


In [None]:
# Placeholder column: every row gets the constant 1 (no normalization of the
# play counts is actually computed here).
listening_history_merged['playcount_normalized'] = 1

In [None]:
# Switch to the generic user / item / rating column names read by the later cells.
column_renames = {'user_id': 'user', 'song': 'item', 'playcount': 'rating'}
listening_history_merged.rename(columns=column_renames, inplace=True)


In [None]:
def get_model_df(listening_history_merged, num_users=10000, verbose=True):
    """Return the rows belonging to the first ``num_users`` distinct users.

    Users are taken in order of first appearance in the ``'user'`` column.

    Args:
        listening_history_merged: DataFrame with a ``'user'`` column.
        num_users: Maximum number of distinct users to keep. Values larger
            than the number of users keep everyone; values <= 0 keep no one.
        verbose: When True (the default, matching the original behaviour),
            print a preview of the user ids, the effective user count and
            the selected ids.

    Returns:
        An independent copy of the matching rows, so that later column
        assignments neither raise ``SettingWithCopyWarning`` nor depend on
        the input frame.
    """
    all_users = listening_history_merged['user'].unique()

    # Clamp to [0, len(all_users)]: a negative value would otherwise slice
    # from the end and silently drop only the last users.
    num_users = max(0, min(num_users, len(all_users)))
    first_users = all_users[:num_users]

    if verbose:
        print(all_users[:20])
        print(num_users)
        print(first_users)

    keep_rows = listening_history_merged['user'].isin(first_users)
    return listening_history_merged[keep_rows].copy()

In [None]:
# Restrict the data to the first 10,000 distinct users to keep training tractable.
listening_history_filtered = get_model_df(listening_history_merged, num_users=10000)


['b80344d063b5ccb3212f76538f3d9e43d87dca9e'
 '85c1f87fea955d09b4bec2e36aee110927aedf9a'
 'bd4c6e843f00bd476847fb75c47b4fb430a06856'
 '969cc6fb74e076a68e36a04409cb9d3765757508'
 '4bd88bfb25263a75bbdd467e74018f4ae570e5df'
 'e006b1a48f466bf59feefed32bec6494495a4436'
 '9d6f0ead607ac2a6c2460e4d14fb439a146b7dec'
 '9bb911319fbc04f01755814cb5edb21df3d1a336'
 'b64cdd1a0bd907e5e00b39e345194768e330d652'
 '17aa9f6dbdf753831da8f38c71b66b64373de613'
 'd6589314c0a9bcbca4fee0c93b14bc402363afea'
 '5a905f000fc1ff3df7ca807d57edb608863db05d'
 'c737ec8c1b16ce8e39115f4432c9a7fc21ec47a1'
 '45544491ccfcdc0b0803c34f201a6287ed4e30f8'
 'ed7d4c476013b1c3dd91982b61494bf7436083ba'
 'baf47ed8da24d607e50d8684cde78b923538640f'
 '169f9f4c68b62d1887c7c0ac99d10a79cfca5daf'
 'bd8475385f0aa78830fa6dfce9e7242164b035c8'
 '0afaa5d9d04bf85af720fe8cc566a41ca3e41c97'
 '403b3b867fc71dfdcc12652f30e88bdc7ccd9aa4']
10000
['b80344d063b5ccb3212f76538f3d9e43d87dca9e'
 '85c1f87fea955d09b4bec2e36aee110927aedf9a'
 'bd4c6e843f00bd476847fb7

In [None]:
# Number of listening-history rows kept after the user filter.
listening_history_filtered.shape[0]

103827

In [None]:
# Work on an independent copy so that the label-encoding step does not modify
# listening_history_filtered (a plain alias would share the same object) and
# does not raise SettingWithCopyWarning against a filtered slice.
listening_history_merged_lstm = listening_history_filtered.copy()

In [None]:
# Narrow frame holding only the user, track id, play count ("rating") and year.
selected_features = ['user', 'track_id', 'rating', 'year']
listening_history_selected = listening_history_merged_lstm.loc[:, selected_features]

In [None]:
# Encode the raw user ids and track ids as consecutive integers.
# .assign builds a new frame whose 'user' and 'item' columns are genuine
# integer columns. Writing through .loc[:, col] instead tries to set the
# values in place, which leaves the columns with object dtype and raises
# SettingWithCopyWarning when the frame is a filtered slice.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
listening_history_merged_lstm = listening_history_merged_lstm.assign(
    user=user_encoder.fit_transform(listening_history_merged_lstm['user']),
    item=item_encoder.fit_transform(listening_history_merged_lstm['track_id']),
)


In [None]:
def create_sequences(data, seq_length):
    """Build (input window, next item) training pairs for every user.

    For each user (grouped on the ``'user'`` column, rows kept in frame
    order) a window of ``seq_length`` consecutive ``'item'`` values is slid
    over the history, and each window is paired with the item that follows
    it. Users with ``seq_length`` or fewer interactions contribute nothing.

    Args:
        data: DataFrame with ``'user'`` and ``'item'`` columns.
        seq_length: Number of items in each input window.

    Returns:
        List of ``(list_of_items, next_item)`` tuples.
    """
    sequences = []
    for _, history in data.groupby('user'):
        tracks = history['item'].tolist()
        window_starts = range(len(tracks) - seq_length)
        sequences.extend(
            (tracks[start:start + seq_length], tracks[start + seq_length])
            for start in window_starts
        )
    return sequences


In [None]:
# Preview the last five rows to check the encoded 'user' and 'item' columns.
listening_history_merged_lstm.tail(5)

Unnamed: 0,track_id,user,rating,name,artist,genre,year,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,item,rating.1,playcount_normalized
103822,TRCQUWE128F427EF8F,7621,1,The Sporting Life,The Decemberists,Rock,2005,278493,0.651,0.807,-6.636,0.0624,0.107,0.0013,0.104,0.877,106.169,1648,1,1
103823,TRBPLGD128F429EB58,7621,1,Streets of Fire,The New Pornographers,Rock,2005,161600,0.676,0.571,-7.129,0.0341,0.00798,1e-06,0.656,0.346,122.144,1072,1,1
103824,TROMNVI128F14979B9,7621,1,Capturing Moods,Rilo Kiley,Rock,2002,215133,0.249,0.624,-6.148,0.0374,0.0125,0.00362,0.0897,0.326,82.24,8895,1,1
103825,TRMVAFP128F14563A9,7621,1,Daniel Cowman,Regina Spektor,,2004,290133,0.567,0.262,-7.88,0.0462,0.958,6.7e-05,0.0788,0.204,100.116,7985,1,1
103826,TRUDTQJ128F4286630,7621,2,Summer Hair = Forever Young,The Academy Is...,Rock,2008,218453,0.496,0.923,-3.066,0.0576,0.000226,0.0,0.128,0.563,135.907,12112,1,1


In [None]:
# Each training example is 10 consecutive items of one user plus the item
# that followed them.
seq_length = 10
sequences = create_sequences(data=listening_history_merged_lstm, seq_length=seq_length)


In [None]:
# 80/20 random split at the window level, seeded for reproducibility.
# NOTE(review): windows of the same user overlap, so a random split lets test
# targets appear inside training inputs; consider splitting by user or by
# position in the history before trusting any test metric.
train_sequences, test_sequences = train_test_split(
    sequences,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Define a custom dataset class
class SequenceDataset(Dataset):
    """Torch dataset over ``(item_window, next_item)`` pairs.

    Args:
        sequences: Indexable collection of ``(list_of_item_ids, target_id)``
            tuples; it is stored as-is, not copied.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        """Number of (window, target) pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return pair ``idx`` as two ``torch.long`` tensors: (window, target)."""
        window, next_item = self.sequences[idx]
        window_tensor = torch.tensor(window, dtype=torch.long)
        target_tensor = torch.tensor(next_item, dtype=torch.long)
        return window_tensor, target_tensor


In [None]:
# Wrap both splits as datasets and batch them; only the training batches are
# reshuffled every epoch.
batch_size = 256
train_dataset, test_dataset = SequenceDataset(train_sequences), SequenceDataset(test_sequences)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
class LSTMModel(nn.Module):
    """Next-item predictor: item embedding -> stacked LSTMs -> linear scorer.

    Args:
        num_items: Size of the item vocabulary; also the number of output scores.
        embedding_dim: Width of each item embedding.
        hidden_dim: Hidden size of every LSTM layer.
        num_layers: Number of stacked LSTM layers (at least one is always created).
    """

    def __init__(self, num_items, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_items, embedding_dim)

        # The first LSTM consumes the embeddings; every further LSTM consumes
        # the hidden states of the layer before it.
        layer_input_sizes = [embedding_dim] + [hidden_dim] * (num_layers - 1)
        self.lstm_layers = nn.ModuleList(
            [nn.LSTM(input_size, hidden_dim, batch_first=True) for input_size in layer_input_sizes]
        )

        self.fc = nn.Linear(hidden_dim, num_items)

    def forward(self, x):
        """Score every item as the successor of each input sequence.

        Args:
            x: Integer tensor of item ids, batch dimension first.

        Returns:
            Tensor with one row of ``num_items`` raw scores per sequence.
        """
        hidden_states = self.embedding(x)
        for lstm in self.lstm_layers:
            hidden_states, _ = lstm(hidden_states)
        last_step = hidden_states[:, -1, :]  # only the final timestep feeds the scorer
        return self.fc(last_step)


In [None]:
# Hyperparameters
num_items = len(item_encoder.classes_)  # one output class per distinct encoded track
embedding_dim, hidden_dim = 50, 100
num_layers = 2  # number of stacked LSTM layers; increase as desired

# Model, loss and optimizer
model = LSTMModel(
    num_items=num_items,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.015)


In [None]:
import torch.nn.functional as F

def train_model(model, train_loader, criterion, optimizer, num_epochs=60):
    """Train ``model`` in place, printing mean batch loss and accuracy per epoch.

    Args:
        model: Network called as ``model(sequences)`` to produce class scores.
        train_loader: Sized iterable of ``(sequences, targets)`` batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is left in training mode.
    """
    model.train()
    for epoch_number in range(1, num_epochs + 1):
        running_loss = 0
        num_correct = 0
        num_seen = 0

        for sequences, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Accuracy: the most probable class counts as the prediction.
            predicted = F.softmax(outputs, dim=1).max(dim=1).indices
            num_correct += (predicted == targets).sum().item()
            num_seen += targets.size(0)

        # Loss is averaged over batches, accuracy over individual samples.
        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = num_correct / num_seen
        print(f'Epoch {epoch_number}/{num_epochs}, Loss: {epoch_loss}, Accuracy: {epoch_accuracy}')


In [None]:
# Run the full training loop with the default number of epochs.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
)

Epoch 1/60, Loss: 9.003724151187473, Accuracy: 0.0050803015404785315
Epoch 2/60, Loss: 7.989282482200199, Accuracy: 0.012127171419206818
Epoch 3/60, Loss: 7.183472984366947, Accuracy: 0.02430896973669835
Epoch 4/60, Loss: 6.394787765211529, Accuracy: 0.043319130339779306
Epoch 5/60, Loss: 5.655232482486301, Accuracy: 0.07500273134491424
Epoch 6/60, Loss: 4.945734302202861, Accuracy: 0.1252594777668524
Epoch 7/60, Loss: 4.303379009167354, Accuracy: 0.18764339560799737
Epoch 8/60, Loss: 3.7407327045996985, Accuracy: 0.25983284169124876
Epoch 9/60, Loss: 3.348598720298873, Accuracy: 0.31369496339997816
Epoch 10/60, Loss: 2.947864199678103, Accuracy: 0.3757784333005572
Epoch 11/60, Loss: 2.5800452629725137, Accuracy: 0.4371244400742926
Epoch 12/60, Loss: 2.366687964234087, Accuracy: 0.4723041625696493
Epoch 13/60, Loss: 2.193494249549177, Accuracy: 0.5023489566262428
Epoch 14/60, Loss: 1.9073164090514183, Accuracy: 0.56047197640118
Epoch 15/60, Loss: 1.7640730183985498, Accuracy: 0.5864197

In [None]:
# Save only the learned weights (the state_dict, not the model object) to
# lstm_model.pth, a path relative to the current working directory.
torch.save(model.state_dict(), 'lstm_model.pth')



In [None]:
# Rebuild the architecture with the same hyperparameters, restore the saved
# weights, and switch to inference mode.
model = LSTMModel(num_items, embedding_dim, hidden_dim, num_layers)
saved_weights = torch.load('lstm_model.pth')
model.load_state_dict(saved_weights)
model.eval()


LSTMModel(
  (embedding): Embedding(15676, 50)
  (lstm_layers): ModuleList(
    (0): LSTM(50, 100, batch_first=True)
    (1): LSTM(100, 100, batch_first=True)
  )
  (fc): Linear(in_features=100, out_features=15676, bias=True)
)

In [None]:
# Number of distinct encoded tracks (the encoder was fitted on track_id).
num_items = len(item_encoder.classes_)
# NOTE(review): num_items is one past the largest encoded item id, and the
# model's embedding table is created with exactly num_items rows, so this
# value is not a valid embedding index and must not be passed to the model as
# an item id. TODO: reserve a dedicated padding row (e.g. via padding_idx) if
# padding is really needed.
padding_value= num_items

[]


In [None]:
import torch.nn.functional as F

def recommend_songs(model, user_id, top_k=10, sequence_length=10):
    """Recommend the ``top_k`` highest-scoring tracks for an encoded user.

    Args:
        model: Trained model mapping a ``(1, seq_len)`` tensor of encoded item
            ids to one score per item.
        user_id: Encoded user id, i.e. a value of the ``'user'`` column of
            ``listening_history_merged_lstm``.
        top_k: Number of tracks to return; capped at the number of items the
            model scores.
        sequence_length: Maximum number of most recent interactions fed to
            the model.

    Returns:
        Array of original track ids (decoded with ``item_encoder``), best
        first, or an empty list when the user has no interaction history or
        ``top_k`` is not positive.
    """
    user_rows = listening_history_merged_lstm[listening_history_merged_lstm['user'] == user_id]
    user_items = user_rows['item'].tolist()

    # Nothing to condition the model on.
    if not user_items:
        print("User has no interaction history.")
        return []
    if top_k <= 0:
        return []

    # Keep at most the last `sequence_length` interactions. Shorter histories
    # are fed as they are: the LSTM accepts any sequence length, whereas
    # left-padding with the id `num_items` indexes past the end of the
    # embedding table and raises IndexError.
    recent_items = user_items[-sequence_length:]
    user_sequence = torch.tensor(recent_items, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        scores = model(user_sequence)
        # topk raises when k exceeds the number of scored items.
        k = min(top_k, scores.size(1))
        # Softmax is monotonic, so ranking the raw scores gives the same order.
        # Indexing row 0 (instead of squeeze()) keeps a list even when k == 1,
        # which inverse_transform requires.
        top_items = torch.topk(scores, k, dim=1).indices[0].tolist()

    return item_encoder.inverse_transform(top_items)




In [None]:
# Example user, given as an encoded id (a value of the label-encoded 'user'
# column), not as a raw user hash.
user_id = 7621
recommended_songs = recommend_songs(model, user_id=user_id, top_k=10)
print("Recommended Songs:", recommended_songs)

Recommended Songs: ['TRJSAID128F934D596' 'TRRSBHA128F425E8B5' 'TRCKQXC128F42796BF'
 'TRIQNAM128F4259B52' 'TRQMAJC128F4285821' 'TRFEZLR128F92C5A85'
 'TRHGOPX128F4283D1F' 'TRLVRBB128E0781E8E' 'TRXOZJJ128F14784C9'
 'TRVJWQC128F92FD55F']


In [None]:
# Look up name, artist and year for each recommended track id and list them
# in recommendation order (best first). Filtering with .isin alone returns the
# rows in data-frame order, which loses the ranking.
recommended_song_info = (
    listening_history_merged_lstm.loc[
        listening_history_merged_lstm['track_id'].isin(recommended_songs),
        ['track_id', 'name', 'artist', 'year'],
    ]
    .drop_duplicates(subset='track_id')  # one row per track, so the index is unique
    .set_index('track_id')
    .reindex(recommended_songs)
)

print("Recommended Song Names with Artist and Year:")
for song in recommended_song_info.itertuples(index=False):
    print(f"{song.name} by {song.artist} ({song.year})")


Recommended Song Names with Artist and Year:
Courtship Dating by Crystal Castles (2008)
Creepin Up The Backstairs by The Fratellis (2007)
Love Dog by TV on the Radio (2008)
Karmacoma by Massive Attack (1998)
The Cosmic Game by Thievery Corporation (2005)
Beautiful Drug by Thievery Corporation (2008)
Skeleton Boy by Friendly Fires (2008)
English Civil War by The Clash (2008)
Neighborhoods by Matthew Dear (2007)
007 by Ленинград (2000)
