In [2]:
#######################################################
# Created-Date: 04/14/2016                            #
# Last Modified: 04/27/2016                           #
# Created by: Himanshu Bharara , Asish Mahapatra      #
#                                                     #
# Objective: Compute TF-IDF vectors from the lyrics   #
# of the 100 songs of each year, processing the years #
# sequentially                                        #
#                                                     #
# Instructions before running:                        #
# 1. Please change the file path of directory         # 
# 2. change ranges based on the number of lyrics      # 
#    you have in the directory                        #  
# 3. Please follow similar nomenclature as used in the# 
#    code                                             #
#######################################################



import os.path
import string #allows for format()
import math
import numpy as np
from nltk.corpus import wordnet
import csv
import re

# Target emotions; each output column corresponds to one of these, in order.
_EMOTIONS = ('anger', 'surprise', 'joy', 'sadness', 'love', 'fear')
# Anything that is not a lowercase ASCII letter is stripped from a token.
_NON_LETTERS = re.compile(r'[^a-z]')


def _read_lyrics(lyrics_dir, num_docs):
    """Return the contents of ``0.txt`` .. ``<num_docs - 1>.txt`` in *lyrics_dir*."""
    lyrics = []
    for i in range(num_docs):
        path = os.path.join(lyrics_dir, "{}.txt".format(i))
        try:
            # Read-only access is enough, and the context manager guarantees
            # the handle is closed even if read() fails.
            with open(path, 'r') as handle:
                lyrics.append(handle.read())
        except ValueError:
            # Covers e.g. UnicodeDecodeError (a ValueError subclass): report
            # the offending index and carry on with the remaining files.
            print(i)
    return lyrics


def _tokenize(text):
    """Split *text* on whitespace, lowercase, keep letters only, drop empties."""
    tokens = []
    for raw in text.split():
        word = _NON_LETTERS.sub('', raw.lower())
        if word:  # pure punctuation/digits collapse to '' and carry no meaning
            tokens.append(word)
    return tokens


def _term_counts(documents):
    """Return ``(vocabulary, counts)`` with counts shaped (n_docs, n_words).

    The vocabulary is ordered by first appearance across the documents.
    """
    column_of = {}
    per_doc = []
    for text in documents:
        counts = {}
        for word in _tokenize(text):
            column_of.setdefault(word, len(column_of))
            counts[word] = counts.get(word, 0) + 1
        per_doc.append(counts)

    matrix = np.zeros((len(documents), len(column_of)))
    for row, counts in enumerate(per_doc):
        for word, occurrences in counts.items():
            matrix[row, column_of[word]] = occurrences
    return list(column_of), matrix


def _smooth_idf(counts):
    """Return one smoothed inverse-document-frequency weight per column.

    Uses ``log((1 + n) / (1 + df)) + 1``: the weight decreases as a word
    appears in more documents but never drops to zero or below, so a word
    present in every document (always true for a single-document run) still
    contributes to the vector.
    """
    n_docs = counts.shape[0]
    doc_freq = (counts > 0).sum(axis=0)
    return np.log((1.0 + n_docs) / (1.0 + doc_freq)) + 1.0


def _l2_normalize_rows(matrix):
    """Scale every row to unit Euclidean length; all-zero rows stay zero."""
    norms = np.sqrt((matrix ** 2).sum(axis=1, keepdims=True))
    norms[norms == 0] = 1.0  # avoid 0/0 for empty documents
    return matrix / norms


def _best_wup(word_synsets, target):
    """Return the highest ``wup_similarity`` of any synset to *target* (0 if none)."""
    best = 0.0
    for syn in word_synsets:
        score = syn.wup_similarity(target)  # may be None when no path exists
        if score is not None and score > best:
            best = score
    return best


def _emotion_similarity(vocabulary):
    """Return a (n_words, 6) array of word-to-emotion WordNet similarities.

    For each emotion the first noun synset and the first verb synset (when
    they exist) are used as targets; a cell holds the best score any synset
    of the vocabulary word reaches against either target, or 0 when the word
    or the emotion has no synsets.
    """
    # The targets do not depend on the vocabulary word, so look them up once.
    targets = []
    for emotion in _EMOTIONS:
        nouns = wordnet.synsets(emotion, pos='n')
        verbs = wordnet.synsets(emotion, pos='v')
        targets.append([senses[0] for senses in (nouns, verbs) if senses])

    rows = []
    for word in vocabulary:
        senses = wordnet.synsets(word)
        rows.append([
            max((_best_wup(senses, target) for target in group), default=0.0)
            for group in targets
        ])
    # reshape keeps the array two-dimensional even for an empty vocabulary
    return np.array(rows, dtype=float).reshape(len(vocabulary), len(_EMOTIONS))


def main(year, num_docs=1, base_dir=None):
    """Score the lyrics of *year* against six emotions via TF-IDF and WordNet.

    Reads ``<base_dir>/Lyrics/<year>/<i>.txt`` for ``i`` in ``range(num_docs)``,
    builds an L2-normalised TF-IDF vector per document, and projects it onto
    the word-to-emotion similarity matrix.

    Args:
        year: Value formatted into the ``Lyrics/<year>/`` directory name.
        num_docs: Number of consecutively numbered lyrics files to read.
        base_dir: Root directory; when ``None`` the module-level ``curdir``
            set by the script entry point is used.

    Returns:
        numpy array of shape (documents read, 6); columns follow the order
        anger, surprise, joy, sadness, love, fear.

    Raises:
        OSError: if a lyrics file cannot be opened (e.g. it does not exist).
        NameError: if *base_dir* is ``None`` and ``curdir`` is not defined.
    """
    if base_dir is None:
        base_dir = curdir
    print(base_dir)  # Print Home Directory

    dHome = os.path.join(base_dir, "Lyrics/{}/".format(year))
    print(dHome)

    lyrics_array = _read_lyrics(dHome, num_docs)
    vocabulary, counts = _term_counts(lyrics_array)
    print(len(vocabulary))

    # Scaling each column by its IDF weight is equivalent to multiplying by a
    # diagonal IDF matrix, without allocating a vocabulary-squared array.
    tfidf = _l2_normalize_rows(counts * _smooth_idf(counts))
    print("tfidf done, shape is: {}".format(tfidf.shape))

    similarity = _emotion_similarity(vocabulary)
    zero_rows = int((~similarity.any(axis=1)).sum())
    print(similarity.shape, "count of zeros:", zero_rows)

    return np.dot(tfidf, similarity)

if __name__ == "__main__":
    # Root folder that holds the "Lyrics/<year>/" inputs. main() reads this
    # module-level name, and the per-year CSV results are written into it.
    curdir = r"/Users/HimanshuBharara/Documents/CU-Sem2/IEORE4571/Projects/"
    years = range(2040,2041)
    for year in years:
        result = main(year)
        csv_path = os.path.join(curdir, "{}_clean1.csv".format(year))
        # newline='' lets the csv module control line endings itself.
        with open(csv_path, "w", newline = '') as out_file:
            csv.writer(out_file).writerows(result)
        print("{} done".format(year))

/Users/HimanshuBharara/Documents/CU-Sem2/IEORE4571/Projects/
/Users/HimanshuBharara/Documents/CU-Sem2/IEORE4571/Projects/Lyrics/2040/
[]
All combined, here is our master document term matrix:
A regular old document term matrix: 

A document term matrix with row-wise L2 norms of 1:
117
tfidf done, shape is: (1, 117)
(117, 6) count of zeros: 44
2040 done
