<h1 style="text-align:center;color:red;">Wilco</h1>
<p style="text-align:center;">By Maycon Cypriano Batestin</p>


### About the Dataset

The objective of this project is to analyze the lyrics of the band WILCO (or any other band with a long history) throughout their career and, using machine learning and NLP, to predict when, what and how the group's next songs will be.

- **Original source:** Spotify
- **Released by:** Maycon Batestin
- **License:** Creative Commons Attribution-ShareAlike 4.0 International ([CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/))



<h1 style="text-align:center;color:red;">Glossary</h1>


Fields	                                                  | Type  	  |    Description                              |
----------------------------------------------------------|:---------:|:-------------------------------------------:|
artist                            						  |string     | name of artist                               |
album                                                     |string     | name of album                                |
track													  |string     | name of a song belonging to the album        |
year													  |int        | year of release of the album                 |
lyrics													  |string     | the lyrics of the song                       |





<h1 style="text-align:center;color:red;">Getting the Dataset </h1>


In [None]:
# Band whose discography is analysed. Spaces are replaced with underscores
# before the name is handed to the dataset script and used in the CSV file name.
artist = 'Wilco'.replace(" ", "_")

In [None]:
# Shell escapes: clear the terminal output, then run the dataset-building
# script, passing the notebook variable `artist` as its only argument
# (IPython expands `$artist` before the command is executed).
# NOTE(review): the script path is absolute and specific to one machine;
# it has to be adjusted before this cell can run anywhere else.
!clear
!python /Users/mayconcyprianobatestin/Documents/repositorios/DATA_SCIENCE/MUSIC/scripts/create_dataset.py $artist


<h1 style="text-align:center;color:red;">Libraries </h1>


In [None]:
### Libraries

import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud
import nltk 
from nltk import tokenize, RSLPStemmer
import matplotlib.pyplot as plt
from string import punctuation
import unidecode
import re



<h1 style="text-align:center;color:red;">Exploratory analysis </h1>


In [None]:
# Build the CSV location from the lower-cased artist name and load it into `df`.
# NOTE(review): the directory is an absolute path on one specific machine;
# adjust it (or make it configurable) before running elsewhere.
path = f'/Users/mayconcyprianobatestin/Documents/repositorios/DATA_SCIENCE/MUSIC/dataset/dataset_{artist.lower()}.csv'
df = pd.read_csv(path)

# Last expression of the cell: display up to the first 100 rows.
df.head(100)

In [None]:
#check for NA value

def checkNAN(df):
    """Drop every row of ``df`` that contains a missing value, in place.

    When at least one missing value is present, the affected rows are
    removed from ``df`` itself, the index is renumbered from zero and a
    status message is printed. When nothing is missing, ``df`` is left
    untouched and a different message is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    None
    """
    has_missing = df.isnull().values.any()
    if not has_missing:
        print("There no NaN values on your dataset")
        return

    # Mutate the caller's frame rather than returning a copy.
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Checking for NaN values and fixing!.")

# checkNAN works in place: rows with missing values are dropped from `df` itself.
checkNAN(df)




In [None]:
# checking for duplicates

def remove_duplicates_from_dataframe(df):
    """Remove fully duplicated rows from ``df`` in place and return it.

    Only the first occurrence of each duplicated row is kept. After the
    rows are removed the index is renumbered from zero so that positional
    and label-based access agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to deduplicate; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now without duplicated rows.
    """
    # The rows must actually be removed from the caller's object: the
    # notebook calls this function without rebinding `df`, so returning a
    # deduplicated copy would leave the working frame unchanged.
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

# Last expression of the cell: the DataFrame returned by the helper is displayed.
remove_duplicates_from_dataframe(df)

In [None]:
# fixing the lyrics column

def clean_lyrics(text):
    """Strip scraping artefacts from one raw lyrics string.

    The substitutions are applied in this order:

    1. square-bracketed tags (for example ``[Chorus]``) are removed;
    2. ``<digits> Contributors`` is removed;
    3. the two-character sequence backslash + ``n`` becomes a space;
    4. every run of whitespace collapses to a single space;
    5. everything from the start of the text up to and including the
       *last* occurrence of ``Lyrics`` is replaced by a space.

    Leading and trailing whitespace is stripped from the result.

    NOTE(review): step 5 is greedy, so a song whose body contains the word
    "Lyrics" loses everything before that word - confirm this is intended.

    Parameters
    ----------
    text : str
        Raw lyrics text.

    Returns
    -------
    str
        The cleaned lyrics.
    """
    # (pattern, replacement) pairs; order matters, see the docstring.
    substitutions = (
        (r'\[[^\]]+\]', ''),
        (r'\d+ Contributors', ''),
        (r'\\n', ' '),
        (r'\s+', ' '),
        (r'^.*Lyrics', ' '),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# Overwrite the lyrics column with the cleaned version of each entry.
df['lyrics'] = df['lyrics'].apply(clean_lyrics)

In [None]:
def album_years(df):
    """Show a scatter chart that places each album on its release year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``album`` and ``year``.

    Returns
    -------
    None or str
        ``None`` (the result of ``fig.show()``) when the chart is rendered;
        otherwise a message describing the error that prevented rendering.
    """
    try:
        fig = px.scatter(df, x="year", color="album", symbol="album")
        fig.update_traces(marker_size=10)
        # The x axis is set to a categorical type instead of a numeric one.
        fig.update_layout(xaxis=dict(type='category'))
        fig.update_xaxes(tickangle=45)
        fig.update_layout(title_text='Albums over the years')
        # No y column is plotted, so its ticks and title are hidden.
        fig.update_yaxes(tickangle=45, showticklabels=False, title_text='')

        return fig.show()
    except Exception as e:
        # Report the actual cause instead of a generic message, and let
        # KeyboardInterrupt / SystemExit propagate (a bare `except` would
        # swallow them too). Same convention as track_album_years.
        return f"Something went wrong: {str(e)}"

# Pass only the two columns the chart reads.
album_years(df[['album', 'year']] )

In [None]:
def track_album_years(df):
    """Show a bar chart with the number of tracks per (album, year) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``album`` and ``year``; every row counts
        as one track.

    Returns
    -------
    None or str
        ``None`` when the chart is shown; otherwise a message describing
        the error that occurred.
    """
    try:
        track_counts = (
            df.groupby(['album', 'year'])
            .size()
            .reset_index(name='num_tracks')
        )

        chart = px.bar(track_counts, y='num_tracks', color='album')
        chart.update_layout(xaxis=dict(type='category'))
        chart.update_xaxes(tickangle=45, showticklabels=False, title_text=' ')
        chart.update_layout(title_text='Track from album, over the year')
        chart.update_yaxes(tickangle=45, showticklabels=False, title_text='')

        # One annotation per group, anchored at the group's row position and
        # at the height of its count.
        for position, count in zip(track_counts.index, track_counts['num_tracks']):
            chart.add_annotation(text=count, x=position, y=count)
        chart.update_traces(textposition='outside')

        # Caption below the plot area, in paper coordinates.
        caption = dict(
            text="Number of Tracks",
            xref="paper",
            yref="paper",
            x=0.5,
            y=-0.1,
            showarrow=False,
        )
        chart.add_annotation(**caption)

        chart.show()
    except Exception as err:
        return f"Something went wrong: {str(err)}"

# Pass only the album, track and year columns to the chart helper.
track_album_years(df[['album', 'track', 'year']])



In [None]:
def word_cloud_album(df, font_path="/Users/mayconcyprianobatestin/Documents/repositorios/DATA_SCIENCE/IMDB/dataset/LucidaGrande.ttc"):
    """Show one word cloud per album, stacked vertically in a single figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``album`` and ``lyrics`` (strings).
    font_path : str, optional
        Font file handed to ``WordCloud``. When the file does not exist
        (for example on a machine other than the author's) the option is
        omitted and WordCloud uses its own default font.

    Returns
    -------
    None or str
        ``None`` (the result of ``fig.show()``) when the figure is rendered;
        otherwise a message describing the error that prevented rendering.
    """
    import os

    try:
        # Plain list of album names: make_subplots documents subplot_titles
        # as a list of str, not a NumPy array.
        albums_unique = df['album'].unique().tolist()
        common_settings = {
            "background_color": "black",
            "max_font_size": 110,
            "width": 700,
            "height": 300,
            "collocations": False,
        }
        # Only pass the font when it is actually available; a missing file
        # would otherwise make every word cloud fail.
        if font_path and os.path.exists(font_path):
            common_settings["font_path"] = font_path

        num_albums = len(albums_unique)

        fig = make_subplots(rows=num_albums, cols=1, subplot_titles=albums_unique)

        for i, album in enumerate(albums_unique):
            album_lyrics = df[df['album'] == album]['lyrics']
            list_of_words = ' '.join(album_lyrics)

            wc = WordCloud(**common_settings)
            wc.generate(list_of_words)

            wordcloud_img = wc.to_image()
            # Subplot rows are 1-based.
            fig.add_trace(go.Image(z=wordcloud_img), row=i+1, col=1)

        # Figure-wide settings do not depend on the album, so they are
        # applied once after all traces have been added.
        fig.update_layout(height=600 * num_albums, width=800, title_text="Word Clouds for Albumn")
        fig.update_layout(coloraxis_showscale=False)
        fig.update_xaxes(showticklabels=False)
        fig.update_yaxes(showticklabels=False)
        return fig.show()
    except Exception as e:
        # Report the actual cause instead of a generic message, and let
        # KeyboardInterrupt / SystemExit propagate. Same convention as
        # track_album_years.
        return f"Something went wrong: {str(e)}"

# Build the per-album word clouds from the full DataFrame.
word_cloud_album(df)


In [None]:
# Bag-of-words representation of the lyrics column (token counts per row).
vector = CountVectorizer()
bag = vector.fit_transform(df.lyrics)
# NOTE(review): the target passed here is df.lyrics - the same text the
# features were built from - so each distinct lyric is its own class. A
# held-out row can only be scored as correct when an identical lyric also
# appears in the training split, which makes the accuracy below
# uninformative. Confirm which column (e.g. album or year) was meant to be
# the label.
train, test, class_train, class_test = train_test_split(bag, df.lyrics, test_size=0.2, random_state=42)
regres_logistc = LogisticRegression(max_iter=1000)
regres_logistc.fit(train, class_train)
# LogisticRegression.score returns the mean accuracy on the given split.
acuracy = regres_logistc.score(test, class_test)
final = {"acuracy": acuracy}
graph = pd.DataFrame.from_dict(final, orient='index', columns=['Value'])
fig = go.Figure()
fig.add_trace(go.Indicator(
        mode="number+gauge+delta",
        # `graph` is indexed by the label "acuracy", so the first row is
        # taken by position with .iloc; plain `[0]` on a label-indexed
        # Series relies on a positional fallback that pandas has deprecated.
        value=graph['Value'].iloc[0],
        title={'text': "acuracy"},
        domain={'row': 0, 'column': 0}
    ))
fig.update_layout(
        title="Contagem de Acuracia",
        height=300,
    )
fig.update_traces(uirevision="top center")

fig.show() 