In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
data = pd.read_excel('/content/drive/MyDrive/country_lyrics.xlsx')

In [4]:
data.head()

Unnamed: 0,Song,Artist,Featuring,Entered Top 30 In:,Lyrics,Writers,Producer,Rough Order,Unnamed: 8,RULES!!!! Top 30 Country Airplay From March 2014-March 2017
0,Wake Up Lovin You,Craig Morgan,,2013,My friends call me up 'cause they know I'm dow...,"Josh Osborne, Matthew Ramsey, Trevor Rosen","Craig Morgan, Phil O'Donnell",1,,
1,Young Love,Kip Moore,,2013,"Your daddy thought I was wrong for you, Thinki...","Kip Moore, Dan Couch, Westin Davis",Brett James,2,,
2,Beat of the Music,Brett Eldredge,,2013,Well I just met you a couple hours ago My last...,"Brett Eldredge, Ross Copperman, Heather Morgan","Brett Eldredge, Ross Copperman",3,,
3,The Heart of Dixie,Danielle Bradbery,,2013,She had a dead-end job at the National Bank An...,"Brett James, Caitlyn Smith, Troy Verges",Brett James,4,,
4,Everything I Shouldn't Be Thinking About,Thompson Square,,2013,My motorcycle needs an oil change Fence needs ...,"Keifer Thompson, Shawna Thompson, David Lee Mu...","RIch Redmond, Kurt Allison, Tully Kennedy, Dav...",5,,


In [5]:
data.columns

Index(['Song', 'Artist', 'Featuring', 'Entered Top 30 In:', 'Lyrics',
       'Writers', 'Producer', 'Rough Order', 'Unnamed: 8',
       'RULES!!!! Top 30 Country Airplay From March 2014-March 2017'],
      dtype='object')

In [6]:
dataset = data.iloc[:,:8]

In [7]:
dataset[['Artist','Song']].groupby(['Artist']).count().sort_values(by='Song',ascending=False).reset_index().head(10)

Unnamed: 0,Artist,Song
0,Luke Bryan,16
1,Blake Shelton,15
2,Thomas Rhett,14
3,Jason Aldean,14
4,Florida Georgia Line,13
5,Keith Urban,12
6,Tim McGraw,11
7,Kenny Chesney,11
8,Eric Church,11
9,Cole Swindell,10


In [8]:
dataset[['Producer','Song']].groupby(['Producer']).count().sort_values(by='Song',ascending=False).reset_index().head(10)

Unnamed: 0,Producer,Song
0,Dann Huff,31
1,Jay Joyce,27
2,Joey Moi,26
3,Scott Hendricks,23
4,Michael Knox,14
5,Ross Copperman,11
6,"Jeff Stevens, Jody Stevens",11
7,Michael Carter,10
8,Shane McAnally,9
9,busbee,8


In [9]:
print(len(dataset.Producer.unique()))
print(len(dataset.Artist.unique()))

180
121


In [10]:
dataset['temp'] = dataset.Producer.str.split(',')
dataset_prod = (dataset.set_index(['Song', 'Artist', 'Featuring', 'Entered Top 30 In:', 'Lyrics','Writers', 'Producer', 'Rough Order'])['temp'].apply(pd.Series).stack().reset_index().drop('level_8', axis=1).rename(columns={0:'all_producers'}))

In [11]:
dataset_prod[['all_producers','Song']].groupby(['all_producers']).count().sort_values(by='Song',ascending=False).reset_index().head(10)

Unnamed: 0,all_producers,Song
0,Dann Huff,51
1,Joey Moi,28
2,Jay Joyce,27
3,Scott Hendricks,24
4,Ross Copperman,22
5,Jeff Stevens,15
6,Michael Knox,14
7,Zach Crowell,13
8,Byron Gallimore,12
9,busbee,12


In [12]:
data_lyrics = data.iloc[:,4:5]
data_lyrics.head()

Unnamed: 0,Lyrics
0,My friends call me up 'cause they know I'm dow...
1,"Your daddy thought I was wrong for you, Thinki..."
2,Well I just met you a couple hours ago My last...
3,She had a dead-end job at the National Bank An...
4,My motorcycle needs an oil change Fence needs ...


In [13]:
import re
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning function
def clean_lyrics(lyrics, bigrams=False, stemm=False):
    lyrics = lyrics.lower() # lower case
    lyrics = re.sub('['+my_punctuation + ']+', ' ', lyrics) # strip punctuation
    lyrics = re.sub('\s+', ' ', lyrics) #remove double spacing
    lyrics = re.sub('([0-9]+)', '', lyrics) # remove numbers
    lyrics_token_list = [word for word in lyrics.split(' ') if word not in my_stopwords] # remove stopwords

    if stemm:
      lyrics_token_list = [word_rooter(word) if '#' not in word else word for word in lyrics_token_list] # apply word rooter
    
    if bigrams:
        lyrics_token_list = lyrics_token_list+[lyrics_token_list[i]+'_'+lyrics_token_list[i+1]
                                            for i in range(len(lyrics_token_list)-1)]
    lyrics = ' '.join(lyrics_token_list)
    return lyrics

In [15]:
data_lyrics['Lyrics'] = data_lyrics.Lyrics.apply(clean_lyrics)

In [16]:
data_lyrics.head()

Unnamed: 0,Lyrics
0,friends call cause know take paint town help g...
1,daddy thought wrong thinking back mom two wron...
2,well met couple hours ago last night town hey ...
3,dead end job national bank deadbeat husband al...
4,motorcycle needs oil change fence needs mendin...


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=0.1, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(data_lyrics['Lyrics']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

In [18]:
tf.shape

(484, 133)

In [19]:
len(tf_feature_names)

133

In [21]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 3
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=10, batch_size=32)

In [22]:
model.fit(tf)

LatentDirichletAllocation(batch_size=32, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=10, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [23]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [24]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,like,636.6,got,627.1,love,580.9
1,oh,535.0,baby,442.3,wanna,521.5
2,every,418.1,girl,428.2,yeah,513.6
3,know,325.3,back,425.0,like,486.6
4,time,319.6,good,383.6,little,449.1
5,let,274.4,take,356.9,make,433.7
6,little,234.8,one,352.4,know,341.0
7,get,200.7,yeah,330.0,get,340.2
8,still,199.7,right,297.0,gonna,339.4
9,cause,181.2,gonna,262.8,tonight,303.2


In [94]:
model.

0.3333333333333333