# Imports

In [1]:
import os
import pandas
import nltk

In [2]:
# Names given to the two index levels of every token table built by tokenize().
OHCO = ["sent_num","token_num"]
# Colormap name handed to Styler.background_gradient in the TF-IDF summary cells.
gradient_cmap = 'GnBu'
# Grouping level ("bag") used for the bag-of-words counts: the first OHCO level only.
bag = OHCO[:1]

# Functions

In [3]:
def tokenize(doc_df, OHCO=OHCO, remove_pos_tuple=False, ws=False):
    """Split every line of ``doc_df`` into part-of-speech tagged tokens.

    Parameters
    ----------
    doc_df : pandas.DataFrame
        Must have a ``line_str`` column holding one text line per row.
    OHCO : list of str
        Names for the two index levels of the result: the first level carries
        the index values of ``doc_df``, the second the token position inside
        the line.
    remove_pos_tuple : bool
        When true, the raw ``(token, tag)`` column is dropped from the result.
    ws : bool
        When true, lines are split with ``nltk.WhitespaceTokenizer``;
        otherwise with ``nltk.word_tokenize``.

    Returns
    -------
    pandas.DataFrame
        One row per token, indexed by ``OHCO``, with columns ``pos_tuple``
        (unless removed), ``pos`` and ``token_str``.
    """
    # Pick the splitter once instead of re-creating a tokenizer for every line.
    split_line = nltk.WhitespaceTokenizer().tokenize if ws else nltk.word_tokenize

    def tag_line(line):
        # One Series per line; its positions become the token-level index.
        return pandas.Series(nltk.pos_tag(split_line(line)))

    # Lines -> wide frame (one column per token position) -> long frame.
    df = (
        doc_df.line_str
        .apply(tag_line)
        .stack()
        .to_frame()
        .rename(columns={0: 'pos_tuple'})
    )

    # Unpack the (token, tag) pairs into their own columns.
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        # Keyword form: the positional axis argument was removed in pandas 2.0.
        df = df.drop(columns='pos_tuple')

    # NOTE(review): the first index level is copied from doc_df's index; if
    # that index repeats (e.g. several documents concatenated without
    # re-indexing) rows of different documents share a value — confirm this
    # is the intended grouping unit.
    df.index.names = OHCO

    return df
def extract_vocabulary(TOKEN):
    """Build the vocabulary (one row per distinct term) of a token table.

    Side effect: a ``term_str`` column is written onto ``TOKEN`` itself, so
    the caller's frame gains that column.

    Parameters
    ----------
    TOKEN : pandas.DataFrame
        Token table with a ``token_str`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``term_id`` (terms in sorted order) with columns
        ``term_str``, ``n`` (occurrence count), ``num`` (1 when the term
        starts with a digit) and ``stop`` (1 when the term is in nltk's
        English stop-word list).
    """
    # Normalise tokens to terms: lower-case, then delete non-word characters
    # and underscores. regex=True is spelled out because pandas >= 2.0 treats
    # the pattern as a literal string by default.
    TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace(r'[\W_]', '', regex=True)

    # Name the columns explicitly so the result does not depend on how
    # value_counts() labels its output (that changed in pandas 2.0).
    term_counts = TOKEN['term_str'].value_counts().sort_index()
    VOCAB = term_counts.rename_axis('term_str').reset_index(name='n')
    VOCAB.index.name = 'term_id'

    # str.match anchors at the start only, so "24karat" also counts as numeric.
    VOCAB['num'] = VOCAB.term_str.str.match(r"\d+").astype('int')

    stop_words = set(nltk.corpus.stopwords.words('english'))
    VOCAB['stop'] = VOCAB.term_str.isin(stop_words).astype('int')
    return VOCAB

def add_stems(VOCAB):
    """Add a ``p_stem`` column with the Porter stem of every ``term_str``.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    from nltk.stem.porter import PorterStemmer

    porter = PorterStemmer()
    VOCAB['p_stem'] = VOCAB.term_str.map(porter.stem)
    return VOCAB

def pre_processing(VOCAB, TOKEN):
    """Rank the vocabulary and link tokens to it.

    Steps:
      1. Unless ``VOCAB`` already has a ``term_rank`` column, add one:
         1 for the highest ``n``, 2 for the next, and so on.
      2. Drop rows containing missing values from both tables.
      3. Add ``term_id`` to the tokens by looking up ``term_str`` in ``VOCAB``.
      4. Add ``pos_max`` to ``VOCAB``: the POS tag seen most often per term.

    Returns the pair ``(VOCAB, TOKEN)``; both are new frames.
    """
    needs_rank = 'term_rank' not in VOCAB.columns
    if needs_rank:
        by_count = VOCAB.sort_values('n', ascending=False).reset_index()
        # The fresh 0..k-1 index is the rank position; surface it as a column.
        by_count.index.name = 'term_rank'
        by_count = by_count.reset_index().set_index('term_id')
        by_count['term_rank'] = by_count['term_rank'] + 1
        VOCAB = by_count

    TOKEN = TOKEN.dropna()
    VOCAB = VOCAB.dropna()

    # term_str -> term_id lookup built from the vocabulary.
    id_by_term = VOCAB.reset_index().set_index('term_str').term_id
    TOKEN['term_id'] = TOKEN.term_str.map(id_by_term)

    # Rows per (term, tag) pair; after dropna() this equals the count of any
    # single column. The widest tag in each row wins.
    tag_table = TOKEN.groupby(['term_id', 'pos']).size().unstack()
    VOCAB['pos_max'] = tag_table.idxmax(axis=1)
    return VOCAB, TOKEN

def create_library(list_of_songs):
    """Turn file-registry rows into a library table indexed by ``song_id``.

    Each row is read positionally as ``(file name, file path, year, song id)``.
    The file name is expected to look like ``<Title>---<artist>.txt``: the
    title loses its underscores and commas and is lower-cased, the artist is
    only lower-cased.

    Returns a DataFrame with columns ``title``, ``artist``, ``year`` and
    ``song_file``.
    """
    def to_record(row):
        # Everything before the first ".txt", then split title from artist.
        stem = row[0].split(".txt")[0]
        title = stem.split("---")[0].replace("_","").replace(",","").lower()
        artist = stem.split("---")[1].lower()
        return [title, artist, row[2], row[1], row[3]]

    records = [to_record(row) for row in list_of_songs]
    column_names = ['title', 'artist', 'year','song_file', 'song_id']
    return pandas.DataFrame(records, columns=column_names).set_index('song_id')

def generate_TFIDF(tokens_df, ocho_level, type_count, tf_method, idf_method, tf_norm_k=0.5):
    """Compute a bag-by-term TF-IDF matrix from a token table.

    Parameters
    ----------
    tokens_df : pandas.DataFrame
        Token table with a ``term_id`` column; the names in ``ocho_level``
        must be resolvable by ``groupby`` (index levels or columns).
    ocho_level : list of str (or a single str)
        Grouping keys that define one "bag" (document unit).
    type_count : {'n', 'c'}
        'n' counts tokens per bag and term; 'c' only records presence (0/1).
    tf_method : {'sum', 'max', 'log', 'double_norm', 'raw', 'binary'}
        sum: count / total count of the bag; max: count / largest count of
        the bag; log: log10(1 + count); raw: the count itself;
        double_norm: tf_norm_k + (1 - tf_norm_k) * (count / largest count);
        binary: 1 when the term occurs in the bag.
    idf_method : {'standard', 'max', 'smooth'}
        standard: log10(N / DF); max: log10(max(DF) / DF);
        smooth: log10((1 + N) / (1 + DF)) + 1.
    tf_norm_k : float
        Only used by ``tf_method='double_norm'``.

    Returns
    -------
    pandas.DataFrame
        Rows are bags, columns are term ids, values are TF * IDF.

    Raises
    ------
    ValueError
        If ``tf_method``, ``idf_method`` or ``type_count`` is not one of the
        accepted values.
    """
    import numpy as np

    print('TF method: {}'.format(tf_method))
    print('IDF method: {}'.format(idf_method))

    tf_methods = ['sum', 'max', 'log', 'double_norm', 'raw', 'binary']
    if tf_method not in tf_methods:
        raise ValueError("TF Method must be : {} ".format(', '.join(tf_methods)))

    idf_methods = ['standard', 'max', 'smooth']
    if idf_method not in idf_methods:
        # List the accepted names (not the characters of the bad argument).
        raise ValueError("IDF Method must be : {} ".format(', '.join(idf_methods)))

    if type_count not in ['c', 'n']:
        raise ValueError("Count Method must be 'n'(tokens) or 'c' (distinct token (term) count) ")

    # Accept any sequence of names, or one bare name.
    bag = [ocho_level] if isinstance(ocho_level, str) else list(ocho_level)

    # Bag of words: token count per (bag, term) plus a 0/1 presence flag.
    BOW = tokens_df.groupby(bag + ['term_id']).term_id.count().to_frame().rename(columns={'term_id': 'n'})
    BOW['c'] = BOW.n.astype('bool').astype('int')

    # Document-term count matrix: bags as rows, terms as columns.
    DTCM = BOW[type_count].unstack().fillna(0).astype('int')

    # TF is computed on the transpose so per-bag totals broadcast by column.
    term_by_bag = DTCM.T
    if tf_method == 'sum':
        TF = term_by_bag / term_by_bag.sum()
    elif tf_method == 'max':
        TF = term_by_bag / term_by_bag.max()
    elif tf_method == 'log':
        TF = np.log10(1 + term_by_bag)
    elif tf_method == 'raw':
        TF = term_by_bag
    elif tf_method == 'double_norm':
        TF = term_by_bag / term_by_bag.max()
        # NOTE(review): the mask leaves NaN (not 0) for terms absent from a
        # bag, unlike the other methods — confirm that is wanted downstream.
        TF = tf_norm_k + (1 - tf_norm_k) * TF[TF > 0]
    else:  # 'binary' (validated above)
        TF = term_by_bag.astype('bool').astype('int')
    TF = TF.T

    # Document frequency: number of bags in which each term occurs.
    DF = (DTCM > 0).sum()
    N = DTCM.shape[0]

    if idf_method == 'standard':
        IDF = np.log10(N / DF)
    elif idf_method == 'max':
        IDF = np.log10(DF.max() / DF)
    else:  # 'smooth' (validated above)
        IDF = np.log10((1 + N) / (1 + DF)) + 1

    return TF * IDF

In [4]:
# Year folders to scan; each one is looked up as data/<year> when files are collected.
years = ["2016","2017","2018","2019","2020"]
# NOTE(review): this module-level name is not read by any visible cell — possibly a leftover.
genre = "pop"

In [5]:
# Register one entry per distinct file name found under the year folders.
# A name that was already registered (in any year or folder) is skipped, so
# only its first occurrence is kept. Each entry is
# (file name, full path, year, song id as a string); ids start at 1001.
rawfiles = {}
song_id = 1001
for year in years:
    datapath = "data/{}".format(year)
    for root, dirs, files in os.walk(datapath, topdown=True):
        for name in files:
            if name in rawfiles:
                continue
            rawfiles[name] = (name, os.path.join(root, name), year, str(song_id))
            song_id += 1

In [6]:
# Sanity check: number of registered files, then number of distinct names.
print(len(rawfiles))
print(len(set(rawfiles)))

308
308


In [7]:
def get_files(rawfiles, genre):
    """Return the registry entries stored in a folder named ``genre``.

    Parameters
    ----------
    rawfiles : dict
        Maps a file name to a tuple whose second element is the file path.
    genre : str
        Folder name to look for among the path's directory components.

    Returns
    -------
    list
        The matching entries, in the registry's iteration order.

    The genre is compared with whole directory names rather than searched as
    a substring of the path, so a file such as ``pop/Trap_Queen---x.txt`` is
    not picked up for ``"rap"``.
    """
    def folders_of(path):
        # Accept both separators so paths built on Windows split the same way.
        return os.path.dirname(path).replace("\\", "/").split("/")

    return [entry for entry in rawfiles.values() if genre in folders_of(entry[1])]

# Split the file registry into one list of entries per genre.
pop_files = get_files(rawfiles, "pop")
country_files = get_files(rawfiles, "country")
rnbhiphop_files = get_files(rawfiles, "rnbhiphop")
rap_files = get_files(rawfiles, "rap")

# Create Library

In [8]:
# One library table per genre, each indexed by song_id.
LIB_POP = create_library(pop_files)
LIB_COUNTRY = create_library(country_files)
LIB_RBHH = create_library(rnbhiphop_files)
LIB_RAP = create_library(rap_files)

In [9]:
# Report how many songs each genre library holds (one row per song).
# NOTE(review): the third label reads "Hip-Hip"; presumably "Hip-Hop" was meant.
print("Number of Pop Songs: {}".format(LIB_POP.shape[0]))
print("Number of Country Songs: {}".format(LIB_COUNTRY.shape[0]))
print("Number of R&B / Hip-Hip Songs: {}".format(LIB_RBHH.shape[0]))
print("Number of Rap Songs: {}".format(LIB_RAP.shape[0]))

Number of Pop Songs: 102
Number of Country Songs: 46
Number of R&B / Hip-Hip Songs: 102
Number of Rap Songs: 57


In [10]:
LIB_POP.head()

Unnamed: 0_level_0,title,artist,year,song_file
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1002,loveyourself,justin_bieber,2016,data/2016/pop/Love_Yourself---justin_bieber.txt
1003,treatyoubetter,shawn_mendes,2016,data/2016/pop/Treat_You_Better---shawn_mendes.txt
1004,dangerouswoman,ariana_grande,2016,data/2016/pop/Dangerous_Woman---ariana_grande.txt
1005,roses,the_chainsmokers_,2016,data/2016/pop/Roses---the_chainsmokers_.txt
1006,wedonttalkanymore,charlie_puth_,2016,data/2016/pop/We_Dont_Talk_Anymore---charlie_p...


In [11]:
LIB_COUNTRY.head()

Unnamed: 0_level_0,title,artist,year,song_file
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1057,humbleandkind,tim_mcgraw,2016,data/2016/country/Humble_And_Kind---tim_mcgraw...
1058,h.o.l.y.,florida_georgia_line,2016,data/2016/country/H.O.L.Y.---florida_georgia_l...
1059,dieahappyman,thomas_rhett,2016,data/2016/country/Die_A_Happy_Man---thomas_rhe...
1121,hurricane,luke_combs,2017,data/2017/country/Hurricane---luke_combs.txt
1122,smalltownboy,dustin_lynch,2017,data/2017/country/Small_Town_Boy---dustin_lync...


In [12]:
LIB_RBHH.head()

Unnamed: 0_level_0,title,artist,year,song_file
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1034,nolimit,usher_,2016,data/2016/rnbhiphop/No_Limit---usher_.txt
1035,sorry,beyonce,2016,data/2016/rnbhiphop/Sorry---beyonce.txt
1036,dontmind,kent_jones,2016,data/2016/rnbhiphop/Dont_Mind---kent_jones.txt
1037,seeyouagain,wiz_khalifa_,2016,data/2016/rnbhiphop/See_You_Again---wiz_khalif...
1038,oui,jeremih,2016,data/2016/rnbhiphop/Oui---jeremih.txt


In [13]:
LIB_RAP.head()

Unnamed: 0_level_0,title,artist,year,song_file
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1060,hotlinebling,drake,2016,data/2016/rap/Hotline_Bling---drake.txt
1061,jumpman,drake_,2016,data/2016/rap/Jumpman---drake_.txt
1062,679,fetty_wap_,2016,data/2016/rap/679---fetty_wap_.txt
1063,reallyreally,kevin_gates,2016,data/2016/rap/Really_Really---kevin_gates.txt
1064,controlla,drake,2016,data/2016/rap/Controlla---drake.txt


# Import file into a dataframe

In [14]:
def import_data(dataframe,genre):
    """Load the lyric files listed in a library table into one line-level frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a ``song_file`` column (paths to text files) and a ``year``
        column. File names are expected to look like
        ``<Title>---<artist>.txt``.
    genre : str
        Value written to the ``genre`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, with columns ``line_str`` (stripped,
        ASCII punctuation removed, lower-cased), ``title``, ``artist``,
        ``year`` and ``genre``. The index is ``line_num``, the position of
        the line inside its own file; it restarts for every file, so index
        values repeat across songs. An empty library yields an empty frame
        with the same columns.
    """
    import string

    # Built once: a table that deletes every ASCII punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)
    column_names = ['line_str', 'title', 'artist', 'year', 'genre']

    frames = []
    # Walk path and year together instead of re-searching the table per file.
    for filename, year in zip(dataframe['song_file'].tolist(), dataframe['year'].tolist()):
        with open(filename, 'r') as handle:
            lines = handle.readlines()

        df = pandas.DataFrame(lines, columns=['line_str'])
        df['line_str'] = df['line_str'].str.strip()
        # .copy() so the assignments below write to an independent frame.
        df = df[~df['line_str'].str.match(r'^\s*$')].copy()

        df['line_str'] = df['line_str'].map(lambda s: s.translate(strip_punct).lower())
        df.index.name = 'line_num'

        # "<Title>---<artist>.txt" -> title without "_" and ",", artist as is.
        stem = os.path.basename(filename).split(".txt")[0]
        name_parts = stem.split("---")
        df['title'] = name_parts[0].replace("_","").replace(",","").lower()
        df['artist'] = name_parts[1].lower()
        df['year'] = year
        df['genre'] = genre
        frames.append(df)

    if not frames:
        # pandas.concat refuses an empty list; hand back an empty table instead.
        empty = pandas.DataFrame(columns=column_names)
        empty.index.name = 'line_num'
        return empty
    return pandas.concat(frames)


In [15]:
# Read every lyric file of each genre library into one line-level frame per genre.
df_pop = import_data(LIB_POP,'pop')
df_country = import_data(LIB_COUNTRY,'country')
df_rbhh = import_data(LIB_RBHH,'rnbhiphop')
df_rap = import_data(LIB_RAP,'rap')

In [16]:
df_pop.sample(10)

Unnamed: 0_level_0,line_str,title,artist,year,genre
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,so comfortable were living in a bubble bubble,chainedtotherhythm,katy_perry_,2017,pop
50,and the memories bring back memories bring bac...,memories,maroon_5,2020,pop
40,shouldve known your love was a game,wedonttalkanymore,charlie_puth_,2016,pop
37,ive been here all night,sidetoside,ariana_grande_,2016,pop
39,who gon pray for me,prayforme,the_weeknd_,2018,pop
47,it looks like you might be one of us,heathens,twenty_one_pilots,2016,pop
55,stay in the kitchen cookin up got your own bre...,intentions,justin_bieber_,2020,pop
31,so dont let me dont let me dont let me down,dontletmedown,the_chainsmokers_,2016,pop
23,toast to the ones here today,memories,maroon_5,2020,pop
7,are you falling,breakmyheart,dua_lipa,2020,pop


In [17]:
df_country.sample(10)

Unnamed: 0_level_0,line_str,title,artist,year,genre
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
39,if youre one of them girls,oneofthemgirls,lee_brice,2020,country
5,stealin kisses under cover babe,youmakeiteasy,jason_aldean,2018,country
22,when i taste tequila,tequila,dan_+_shay,2018,country
40,get along while we can,getalong,kenny_chesney,2018,country
24,let our bodies do the talking,alltomyself,dan_+_shay,2019,country
44,mama the nerve of this guy,diefromabrokenheart,maddie_,2020,country
4,i turn pages all the time,bluebird,miranda_lambert,2020,country
2,he said all youre really given is the sunshine...,getalong,kenny_chesney,2018,country
4,you hang your shirt on that maple limb,chasinyou,morgan_wallen,2020,country
15,what if i fall,thefighter,keith_urban_,2017,country


In [18]:
# (rows, columns) of each line-level frame; rows are lyric lines.
print(df_pop.shape)
print(df_country.shape)
print(df_rbhh.shape)
print(df_rap.shape)

(5931, 5)
(2011, 5)
(7092, 5)
(4188, 5)


In [19]:
# POS-tag every lyric line, splitting on whitespace only (ws=True).
# NOTE(review): df_rap is loaded earlier but no TOKEN_RAP is built in the
# visible cells — confirm whether rap is left out on purpose.
TOKEN_POP = tokenize(df_pop, ws=True)
TOKEN_COUNTRY = tokenize(df_country, ws=True)
TOKEN_RBHH = tokenize(df_rbhh, ws=True)

In [20]:
# Drop tokens tagged as proper nouns (NNP / NNPS).
# .copy() makes each result an independent frame: extract_vocabulary() later
# adds a column to these tables in place, which pandas otherwise flags as an
# assignment on a slice of another frame.
TOKEN_POP = TOKEN_POP[~TOKEN_POP.pos.isin(['NNP','NNPS'])].copy()
TOKEN_COUNTRY = TOKEN_COUNTRY[~TOKEN_COUNTRY.pos.isin(['NNP','NNPS'])].copy()
TOKEN_RBHH = TOKEN_RBHH[~TOKEN_RBHH.pos.isin(['NNP','NNPS'])].copy()

In [21]:
TOKEN_POP.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_tuple,pos,token_str
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,"(for, IN)",IN,for
0,1,"(all, PDT)",PDT,all
0,2,"(the, DT)",DT,the
0,3,"(times, NNS)",NNS,times
0,4,"(that, IN)",IN,that


In [22]:
TOKEN_COUNTRY.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_tuple,pos,token_str
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,"(you, PRP)",PRP,you
0,1,"(know, VBP)",VBP,know
0,2,"(theres, VBZ)",VBZ,theres
0,3,"(a, DT)",DT,a
0,4,"(light, NN)",NN,light


In [23]:
TOKEN_RBHH.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_tuple,pos,token_str
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,"(make, VB)",VB,make
0,1,"(you, PRP)",PRP,you
0,2,"(say, VBP)",VBP,say
0,3,"(uh, JJ)",JJ,uh
0,4,"(no, DT)",DT,no


In [24]:
# Pop: build the vocabulary (this also adds term_str to TOKEN_POP in place),
# add Porter stems, then rank terms and attach term_id / pos_max.
VOCAB_POP = extract_vocabulary(TOKEN_POP)
VOCAB_POP = add_stems(VOCAB_POP)
VOCAB_POP,TOKEN_POP = pre_processing(VOCAB_POP, TOKEN_POP)

In [25]:
# Country: build the vocabulary (this also adds term_str to TOKEN_COUNTRY in
# place), add Porter stems, then rank terms and attach term_id / pos_max.
VOCAB_COUNTRY = extract_vocabulary(TOKEN_COUNTRY)
VOCAB_COUNTRY = add_stems(VOCAB_COUNTRY)
VOCAB_COUNTRY,TOKEN_COUNTRY = pre_processing(VOCAB_COUNTRY, TOKEN_COUNTRY)

In [26]:
# R&B / hip-hop: build the vocabulary (this also adds term_str to TOKEN_RBHH
# in place), add Porter stems, then rank terms and attach term_id / pos_max.
VOCAB_RBHH = extract_vocabulary(TOKEN_RBHH)
VOCAB_RBHH = add_stems(VOCAB_RBHH)
VOCAB_RBHH,TOKEN_RBHH = pre_processing(VOCAB_RBHH, TOKEN_RBHH)

In [27]:
VOCAB_POP.head()

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2485,1,you,1998,0,1,you,PRP
1030,2,i,1840,0,1,i,NN
2153,3,the,1060,0,1,the,DT
1316,4,me,924,0,1,me,PRP
2209,5,to,866,0,1,to,TO


In [28]:
VOCAB_COUNTRY.head()

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
604,1,i,607,0,1,i,NN
1435,2,you,554,0,1,you,PRP
1227,3,the,488,0,1,the,DT
9,4,a,327,0,1,a,DT
32,5,and,272,0,1,and,CC


In [29]:
VOCAB_RBHH.head()

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1894,1,i,2356,0,1,i,NN
4303,2,you,1809,0,1,you,PRP
3778,3,the,1688,0,1,the,DT
60,4,a,1115,0,1,a,DT
1948,5,it,1101,0,1,it,PRP


In [30]:
TOKEN_POP.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_tuple,pos,token_str,term_str,term_id
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,"(for, IN)",IN,for,for,760
0,1,"(all, PDT)",PDT,all,all,46
0,2,"(the, DT)",DT,the,the,2153
0,3,"(times, NNS)",NNS,times,times,2205
0,4,"(that, IN)",IN,that,that,2151
0,5,"(you, PRP)",PRP,you,you,2485
0,6,"(rained, VBD)",VBD,rained,rained,1685
0,7,"(on, IN)",IN,on,on,1493
0,8,"(my, PRP$)",PRP$,my,my,1407
0,9,"(parade, NN)",NN,parade,parade,1545


In [31]:
TOKEN_COUNTRY.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_tuple,pos,token_str,term_str,term_id
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,"(you, PRP)",PRP,you,you,1435
0,1,"(know, VBP)",VBP,know,know,658
0,2,"(theres, VBZ)",VBZ,theres,theres,1231
0,3,"(a, DT)",DT,a,a,9
0,4,"(light, NN)",NN,light,light,691


In [32]:
# Look up each token's term_id from its term_str in the matching vocabulary.
# NOTE(review): pre_processing() already performs this same mapping, so this
# cell looks redundant — confirm before removing.
TOKEN_POP['term_id'] = TOKEN_POP.term_str.map(VOCAB_POP.reset_index().set_index('term_str').term_id)
TOKEN_COUNTRY['term_id'] = TOKEN_COUNTRY.term_str.map(VOCAB_COUNTRY.reset_index().set_index('term_str').term_id)
TOKEN_RBHH['term_id'] = TOKEN_RBHH.term_str.map(VOCAB_RBHH.reset_index().set_index('term_str').term_id)

In [33]:
# Most frequent POS tag per term: count rows per (term_id, pos), pivot tags
# into columns, and take the column label with the largest count.
# NOTE(review): pre_processing() already computes pos_max the same way, so
# this cell looks redundant — confirm before removing.
VOCAB_POP['pos_max'] = TOKEN_POP.groupby(['term_id', 'pos']).count().iloc[:,0].unstack().idxmax(1)
VOCAB_COUNTRY['pos_max'] = TOKEN_COUNTRY.groupby(['term_id', 'pos']).count().iloc[:,0].unstack().idxmax(1)
VOCAB_RBHH['pos_max'] = TOKEN_RBHH.groupby(['term_id', 'pos']).count().iloc[:,0].unstack().idxmax(1)

In [34]:
#POS_POP = TOKEN_POP.pos.value_counts().to_frame().rename(columns={'pos':'n'})
#POS_POP.index.name = 'pos_id'
#POS_POP.sort_values('n').plot.bar(y='n', figsize=(15,5), rot=45)

In [35]:

#BOW_POP = TOKEN_POP.groupby(bag+['term_id']).term_id.count().to_frame().rename(columns={'term_id':'n'})
#BOW_POP['c'] = BOW_POP.n.astype('bool').astype('int')

In [36]:
#BOW_POP

In [37]:
# TF-IDF per genre: bags are the first OHCO level, token counts ('n'),
# TF = count / bag total ('sum'), IDF = log10(N / DF) ('standard').
TFIDF_POP = generate_TFIDF(TOKEN_POP, bag, 'n',  'sum','standard' )
TFIDF_COUNTRY = generate_TFIDF(TOKEN_COUNTRY, bag, 'n',  'sum','standard' )
TFIDF_RBHH = generate_TFIDF(TOKEN_RBHH, bag, 'n',  'sum','standard' )

TF method: sum
IDF method: standard
TF method: sum
IDF method: standard
TF method: sum
IDF method: standard


In [38]:
TFIDF_POP

term_id,0,1,2,3,4,5,6,7,8,9,...,2493,2494,2495,2496,2497,2498,2499,2500,2501,2502
sent_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.001532,0.0,0.0,0.000000,0.0,0.0,0.002496,0.0,0.0,0.000747,...,0.0,0.0,0.001493,0.0,0.001331,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.000000,0.0,0.0,0.002908,0.0,0.0,0.001414,...,0.0,0.0,0.000000,0.0,0.001260,0.001718,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.002529,0.0,0.0,0.002095,0.0,0.0,0.001358,...,0.0,0.0,0.001358,0.0,0.001211,0.000000,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.000000,0.0,0.0,0.001893,0.0,0.0,0.001339,...,0.0,0.0,0.000000,0.0,0.001193,0.001626,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.000000,0.0,0.0,0.002701,0.0,0.0,0.000750,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
105,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
106,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
107,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0


In [39]:
TFIDF_COUNTRY

term_id,0,1,2,3,4,5,6,7,8,9,...,1434,1435,1436,1437,1438,1439,1440,1441,1442,1443
sent_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005938,...,0.000000,0.004298,0.0,0.00378,0.001393,0.000955,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005203,...,0.000000,0.002581,0.0,0.00000,0.003231,0.000886,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003243,...,0.003716,0.003231,0.0,0.00000,0.003424,0.001408,0.0,0.0,0.002863,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003886,...,0.000000,0.001625,0.0,0.00000,0.003730,0.001534,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003130,...,0.000000,0.001714,0.0,0.00000,0.002833,0.001294,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.0
81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010407,...,0.000000,0.007976,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.0


In [40]:
# Pop: attach per-term TF-IDF summaries to a copy of the vocabulary.
# Mean and median look at positive scores only (terms with none get 0);
# sum and max use the full matrix.
VOCAB_POP_q1 = VOCAB_POP.copy(deep=True)
VOCAB_POP_q1['TFIDF_mean'] = TFIDF_POP.where(TFIDF_POP > 0).mean().fillna(0)
VOCAB_POP_q1['TFIDF_sum'] = TFIDF_POP.sum()
VOCAB_POP_q1['TFIDF_median'] = TFIDF_POP.where(TFIDF_POP > 0).median().fillna(0)
VOCAB_POP_q1['TFIDF_max'] = TFIDF_POP.max()

# Show the 20 terms with the largest summed TF-IDF, shaded by value.
(
    VOCAB_POP_q1[['term_rank','term_str','pos_max','TFIDF_sum']]
    .sort_values('TFIDF_sum', ascending=False)
    .head(20)
    .style.background_gradient(cmap=gradient_cmap)
)


Unnamed: 0_level_0,term_rank,term_str,pos_max,TFIDF_sum
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
37,261,ahha,NN,0.554869
971,432,heyhey,NN,0.376364
1139,14,know,VBP,0.320475
2147,179,thank,NN,0.243607
2485,1,you,PRP,0.237554
1075,131,isnt,VBZ,0.224667
1437,119,next,JJ,0.220956
1110,411,karat,NN,0.215556
1281,475,magic,NN,0.215311
0,362,24,CD,0.202075


In [41]:
# Country: attach per-term TF-IDF summaries to a copy of the vocabulary.
# Mean and median look at positive scores only (terms with none get 0);
# sum and max use the full matrix.
VOCAB_COUNTRY_q1 = VOCAB_COUNTRY.copy(deep=True)
VOCAB_COUNTRY_q1['TFIDF_mean'] = TFIDF_COUNTRY.where(TFIDF_COUNTRY > 0).mean().fillna(0)
VOCAB_COUNTRY_q1['TFIDF_sum'] = TFIDF_COUNTRY.sum()
VOCAB_COUNTRY_q1['TFIDF_median'] = TFIDF_COUNTRY.where(TFIDF_COUNTRY > 0).median().fillna(0)
VOCAB_COUNTRY_q1['TFIDF_max'] = TFIDF_COUNTRY.max()

# Show the 20 terms with the largest summed TF-IDF, shaded by value.
(
    VOCAB_COUNTRY_q1[['term_rank','term_str','pos_max','TFIDF_sum']]
    .sort_values('TFIDF_sum', ascending=False)
    .head(20)
    .style.background_gradient(cmap=gradient_cmap)
)

Unnamed: 0_level_0,term_rank,term_str,pos_max,TFIDF_sum
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1432,29,yeah,NN,0.664596
468,191,glasses,NNS,0.470431
1251,122,through,IN,0.38732
1381,86,whiskey,NN,0.318837
357,64,em,NN,0.318822
1416,98,world,NN,0.315795
604,1,i,NN,0.277801
1033,66,see,VB,0.275568
612,163,ima,NNS,0.268084
1080,425,sip,VB,0.265163


In [42]:
# R&B / hip-hop: attach per-term TF-IDF summaries to a copy of the vocabulary.
# Mean and median look at positive scores only (terms with none get 0);
# sum and max use the full matrix.
VOCAB_RBHH_q1 = VOCAB_RBHH.copy(deep=True)
VOCAB_RBHH_q1['TFIDF_mean'] = TFIDF_RBHH.where(TFIDF_RBHH > 0).mean().fillna(0)
VOCAB_RBHH_q1['TFIDF_sum'] = TFIDF_RBHH.sum()
VOCAB_RBHH_q1['TFIDF_median'] = TFIDF_RBHH.where(TFIDF_RBHH > 0).median().fillna(0)
VOCAB_RBHH_q1['TFIDF_max'] = TFIDF_RBHH.max()

# Show the 20 terms with the largest summed TF-IDF, shaded by value.
(
    VOCAB_RBHH_q1[['term_rank','term_str','pos_max','TFIDF_sum']]
    .sort_values('TFIDF_sum', ascending=False)
    .head(20)
    .style.background_gradient(cmap=gradient_cmap)
)

Unnamed: 0_level_0,term_rank,term_str,pos_max,TFIDF_sum
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3838,188,tiimmy,NN,1.755362
4098,214,walkin,NN,0.63135
3960,442,turner,NN,0.602546
2052,236,kill,VB,0.554099
1263,205,everybody,NN,0.542356
4100,567,wallet,NN,0.518531
489,127,break,VB,0.510928
4204,498,wildin,NN,0.472904
342,282,bet,NN,0.454525
1541,340,furnace,NN,0.405749
