In [1]:
import sqlite3
import numpy as np
import pandas as pd

In [2]:
# First generate stemmed English dictionary

# First load top 5000 most common English words
english = pd.read_csv('top5000.txt', sep='\t', header=None, names=["word","pos","count","dispersion"], skipinitialspace=True)

In [3]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [4]:
english.head()

Unnamed: 0,word,pos,count,dispersion
1,the,a,22038615,0.98
2,be,v,12545825,0.97
3,and,c,10741073,0.99
4,of,i,10343885,0.97
5,a,a,10144200,0.98


In [5]:
# Stem words
english['stem'] = english['word'].map(lambda x: stemmer.stem(x.lower()))

In [6]:
# Aggregate stems that correspond to multiple words
english = english[['count','stem']].groupby("stem").sum().reset_index()

In [7]:
# We see that 1284 words mapped to duplicate stems
english.shape

(3716, 2)

In [9]:
total_words = sum(english['count'])

In [10]:
total_words

329794508

In [11]:
english['p_english'] = english['count'].map(lambda x: x*1.0/total_words)

In [12]:
english.head()

Unnamed: 0,stem,count,p_english
0,a,10144200,0.030759
1,abandon,15323,4.6e-05
2,abil,51476,0.000156
3,abl,103171,0.000313
4,abort,18925,5.7e-05


In [13]:
# Now deal with lyric data

lyrics_path = 'mxm_dataset.db'

In [14]:
conn = sqlite3.connect(lyrics_path)

res = conn.execute("SELECT word, count FROM lyrics")
n = res.fetchall()

In [15]:
n

[(u'i', 6),
 (u'the', 4),
 (u'you', 2),
 (u'to', 2),
 (u'and', 5),
 (u'a', 3),
 (u'me', 1),
 (u'it', 1),
 (u'my', 1),
 (u'is', 2),
 (u'of', 3),
 (u'your', 1),
 (u'that', 1),
 (u'are', 2),
 (u'we', 2),
 (u'am', 2),
 (u'will', 2),
 (u'for', 4),
 (u'be', 1),
 (u'have', 2),
 (u'so', 1),
 (u'this', 1),
 (u'like', 2),
 (u'de', 1),
 (u'up', 1),
 (u'was', 2),
 (u'if', 1),
 (u'got', 1),
 (u'would', 1),
 (u'been', 1),
 (u'these', 2),
 (u'seem', 1),
 (u'someon', 1),
 (u'understand', 1),
 (u'pass', 1),
 (u'river', 1),
 (u'met', 1),
 (u'piec', 1),
 (u'damn', 1),
 (u'worth', 1),
 (u'flesh', 1),
 (u'grace', 1),
 (u'poor', 2),
 (u'somehow', 1),
 (u'ignor', 1),
 (u'passion', 1),
 (u'tide', 1),
 (u'season', 1),
 (u'seed', 1),
 (u'resist', 1),
 (u'order', 2),
 (u'piti', 1),
 (u'fashion', 1),
 (u'grant', 1),
 (u'captur', 2),
 (u'ici', 1),
 (u'soil', 1),
 (u'patienc', 1),
 (u'social', 2),
 (u'highest', 2),
 (u'slice', 1),
 (u'leaf', 1),
 (u'lifeless', 1),
 (u'arrang', 1),
 (u'wilder', 1),
 (u'shark', 1),
 

In [16]:
# MASTERCOUNT! Accumulates counts of words across songs

mastercount = {}

for w in n:
    if w[0] not in mastercount:
        mastercount[w[0]] = w[1]
    else:
        mastercount[w[0]] = mastercount[w[0]] + w[1]

In [17]:
english['lyric_count'] = english['stem'].map(lambda x: mastercount[x] if x in mastercount else 1)

In [18]:
total_lyrics = sum(english['lyric_count'])

In [19]:
english['p_lyrics'] = english['lyric_count'].map(lambda x: x*1.0/total_lyrics)

In [20]:
english.head()

Unnamed: 0,stem,count,p_english,lyric_count,p_lyrics
0,a,10144200,0.030759,974499,0.02385306
1,abandon,15323,4.6e-05,1455,3.56144e-05
2,abil,51476,0.000156,1,2.447725e-08
3,abl,103171,0.000313,1741,4.261489e-05
4,abort,18925,5.7e-05,1,2.447725e-08


In [21]:
english['cliche'] = np.log(english['p_lyrics']/english['p_english'])

In [28]:
max_cliche = english.cliche.idxmax()

print english.stem[max_cliche]
print english.cliche[max_cliche]

am
4.55976364894


In [29]:
np.mean(english.cliche)

-3.141967272801419

In [30]:
english.to_csv('cliche_words.tsv', sep='\t')