# scrap the website 

In [1]:
# for this lab, the chosen website is streamsquid.com

In [2]:
# for the first part of the task, we only import the necessary libraries for Py web scraping 
import os 
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [3]:
# send get request and retrieve content 
url = 'https://streamsquid.com/#/browse/newrel'
response = requests.get(url)
content = response.content

In [4]:
# parse content 
soup = BeautifulSoup(content, 'html.parser')

# find the container 
container = soup.find('div', class_="s-scroller")
container

In [5]:
# main-search-box-container

In [15]:
container = soup.find('div', class_="queue-filter-input-wrapper")
container

<div class="queue-filter-input-wrapper">
<input autocomplete="off" class="filter-form-control" id="queue-filter-box-container" onblur="this.placeholder ='filter'" onfocus="this.placeholder = ''" placeholder="filter" type="text"/>
<div id="queue-filter-loading-gif"> </div>
<div id="queue-filter-clear" title="clear text"> </div>
<div class="fa fa-filter" id="queue-filter-icon-const"> </div>
</div>

In [16]:
# input-wrapper

In [17]:
# the name for the csv file where I will store the output will be "playlist.csv"
playlist = "playlist.csv"

# retrieve the output in csv format 

In [18]:
# in the search box, we can perform searches by song title, artist and album, this is the information that we extract 
headers = ["Song Title", "Artist", "Year", "Album", "Duration", 'tempo', 'pitch', 'energy', 'danceability']
with open(playlist, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(headers)
    writer.writerows(playlist)

# start working with the dataset 

In [19]:
# read the dataset 

In [20]:
playlist = pd.read_csv('playlist.csv')
playlist.head()


Unnamed: 0,Song Title,Artist,Year,Album,Duration,tempo,pitch,energy,danceability
0,p,,,,,,,,
1,l,,,,,,,,
2,a,,,,,,,,
3,y,,,,,,,,
4,l,,,,,,,,


In [21]:
# In the dataset provided, we gather information on the song name, artist, release year, album and duration 

In [22]:
playlist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Song Title    12 non-null     object 
 1   Artist        0 non-null      float64
 2   Year          0 non-null      float64
 3   Album         0 non-null      float64
 4   Duration      0 non-null      float64
 5   tempo         0 non-null      float64
 6   pitch         0 non-null      float64
 7   energy        0 non-null      float64
 8   danceability  0 non-null      float64
dtypes: float64(8), object(1)
memory usage: 992.0+ bytes


In [23]:
# get summary statistics from the dataset
print(playlist.describe())

       Artist  Year  Album  Duration  tempo  pitch  energy  danceability
count     0.0   0.0    0.0       0.0    0.0    0.0     0.0           0.0
mean      NaN   NaN    NaN       NaN    NaN    NaN     NaN           NaN
std       NaN   NaN    NaN       NaN    NaN    NaN     NaN           NaN
min       NaN   NaN    NaN       NaN    NaN    NaN     NaN           NaN
25%       NaN   NaN    NaN       NaN    NaN    NaN     NaN           NaN
50%       NaN   NaN    NaN       NaN    NaN    NaN     NaN           NaN
75%       NaN   NaN    NaN       NaN    NaN    NaN     NaN           NaN
max       NaN   NaN    NaN       NaN    NaN    NaN     NaN           NaN


# summary of csv dataset operations 

Exploratory Data Analysis (EDA): Perform EDA to understand the structure, characteristics, and relationships within the dataset. Some common tasks include examining the dimensions of the dataset, checking for missing values, exploring data distributions, and visualizing patterns and relationships using plots or summary statistics.

Data Preprocessing: Prepare the dataset for machine learning algorithms by handling missing data, encoding categorical variables, normalizing or scaling numerical features, and handling outliers. Pandas provides functions to handle missing values (fillna()), encode categorical variables (get_dummies()), and perform various transformations.

Feature Selection/Engineering: Select or engineer relevant features that can contribute to the machine learning task. This may involve dropping irrelevant or highly correlated features, creating new features through mathematical operations or domain knowledge, or extracting information from text or images.

Splitting the Dataset: Split the dataset into training and testing sets. The training set is used to train the machine learning model, while the testing set is used to evaluate its performance. Scikit-learn provides functions like train_test_split() to split the dataset into appropriate proportions.

Model Training: Choose a suitable machine learning algorithm based on your task (classification, regression, clustering, etc.) and train the model using the training data. Scikit-learn provides various algorithms like decision trees, random forests, support vector machines (SVM), and neural networks, among others. Use the fit() function to train the model.

# exploratory data analysis 

Check the dimensions of the dataset (number of rows and columns) using the shape attribute.
Inspect the column names using the columns attribute.
Check the data types of the columns using the dtypes attribute.

In [None]:
shape = playlist.shape
print(shape)

In [None]:
playlist.columns

In [None]:
playlist.dtypes

In [None]:
# python machine learning libraries 

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# preprocessing 

handling missing data, encoding categorical variables, normalizing or scaling numerical features, and handling outliers. Pandas provides functions to handle missing values (fillna()), encode categorical variables (get_dummies()), and perform various transformations.

In [None]:
# handling missing data

In [None]:
playlist.isnull().any()

In [None]:
# encoding categorical variables

In [None]:
# Load the playlist dataset
playlist = pd.read_csv("playlist.csv")

# One-Hot Encoding
one_hot_encoded = pd.get_dummies(playlist[["Artist", "Album"]])
one_hot_encoded_playlist = pd.concat([playlist, one_hot_encoded], axis=1)
print("One-Hot Encoded Playlist:")
print(one_hot_encoded_playlist)

# Label Encoding
label_encoder = LabelEncoder()
playlist["Artist_Label"] = label_encoder.fit_transform(playlist["Artist"])
playlist["Album_Label"] = label_encoder.fit_transform(playlist["Album"])
print("\nLabel Encoded Playlist:")
print(playlist)

In [None]:
# Streamlit data visualisation 

In [None]:
# in this section, we will deploy yhe streamlit app with the user input

In [None]:
# we install the required libraries 

In [None]:
import streamlit as st
st.set_page_config(page_title="Song Recommendation", layout="wide")
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import plotly.express as px
import streamlit.components.v1 as components

In [None]:
# summary and highlights of key findings 

In [None]:
streamlit run app.py

In [None]:
# we want to build a recommender that uses as input variables: tempo, pitch, energy, danceability, music_genre 