In [89]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy import spatial

### Toy example

In [40]:
# Three tiny tokenised "documents". They stay bound to a, b and c because
# later cells call the vectorizers on `c` directly.
a = ['Amol', 'Prashant']
b = ['Mrinal', 'Prashant']
c = ['Saurabh', 'Amol', 'Diana']
corpus = [a, b, c]

# Vocabulary derived from the corpus instead of a second hand-typed list, so
# the two cannot drift apart. Sorting fixes the column order of every vector.
unique_words = sorted({word for document in corpus for word in document})

In [57]:
# Let's play around with an example
# Key each document by its position in the corpus; every value is a
# one-element row whose single entry is the document's token list.
dict_dataframe = {position: [document] for position, document in enumerate(corpus)}
dict_dataframe

{0: [['Amol', 'Prashant']],
 1: [['Mrinal', 'Prashant']],
 2: [['Saurabh', 'Amol', 'Diana']]}

In [60]:
# One row per document: each row is a one-element list, so the whole token
# list lands in a single 'Documents' cell.
document_rows = list(dict_dataframe.values())
df = pd.DataFrame(document_rows, columns=['Documents'], index=range(len(corpus)))
df

Unnamed: 0,Documents
0,"[Amol, Prashant]"
1,"[Mrinal, Prashant]"
2,"[Saurabh, Amol, Diana]"


In [61]:
# Vectorise every document twice: presence flags and TF-IDF weights.
documents = df['Documents']
df['count_vect'] = documents.apply(count_vectorizer)
df['tfidf_vect'] = documents.apply(tfidf_vectorizer)


In [63]:
# One star rating per document, in row order.
star_ratings = [1, 2, 3]
df['stars'] = star_ratings

In [80]:
# Wrap each rating in a one-element list so it can be concatenated with the
# vector lists later on.
df['lst_stars'] = pd.Series([[rating] for rating in df['stars']], index=df.index)
df

Unnamed: 0,Documents,count_vect,tfidf_vect,stars,users,Products,lst_stars
0,"[Amol, Prashant]","[1, 0, 0, 1, 0]","[0.14384103622589042, 0, 0, 0.1438410362258904...",1,U1,P1,[1]
1,"[Mrinal, Prashant]","[0, 0, 1, 1, 0]","[0, 0, 0.34657359027997264, 0.1438410362258904...",2,U1,P2,[2]
2,"[Saurabh, Amol, Diana]","[1, 1, 0, 0, 1]","[0.09589402415059362, 0.23104906018664842, 0, ...",3,U2,P2,[3]


In [82]:
# Display the frame again.
# NOTE(review): the saved output already shows 'users', 'Products',
# 'values_cnt' and 'values_tfidf', which are only assigned in the cell that
# follows in this file — the cells appear to have been run out of order.
df

Unnamed: 0,Documents,count_vect,tfidf_vect,stars,users,Products,lst_stars,values_cnt,values_tfidf
0,"[Amol, Prashant]","[1, 0, 0, 1, 0]","[0.14384103622589042, 0, 0, 0.1438410362258904...",1,U1,P1,[1],"[1, 0, 0, 1, 0, 1]","[0.14384103622589042, 0, 0, 0.1438410362258904..."
1,"[Mrinal, Prashant]","[0, 0, 1, 1, 0]","[0, 0, 0.34657359027997264, 0.1438410362258904...",2,U1,P2,[2],"[0, 0, 1, 1, 0, 2]","[0, 0, 0.34657359027997264, 0.1438410362258904..."
2,"[Saurabh, Amol, Diana]","[1, 1, 0, 0, 1]","[0.09589402415059362, 0.23104906018664842, 0, ...",3,U2,P2,[3],"[1, 1, 0, 0, 1, 3]","[0.09589402415059362, 0.23104906018664842, 0, ..."


In [83]:
# Record which user wrote each review and which product it is about.
df['users'], df['Products'] = ['U1','U1','U2'], ['P1','P2','P2']

# Put the star rating in front of each vector, so one cell holds
# [stars, *vector] for the pivots below.
for target_col, vector_col in (('values_cnt', 'count_vect'), ('values_tfidf', 'tfidf_vect')):
    df[target_col] = df['lst_stars'] + df[vector_col]

In [84]:
# User x product matrices; both pivots share the same row and column keys and
# differ only in which value column fills the cells.
pivot_axes = dict(index='users', columns='Products')
df_cnt = df.pivot(values='values_cnt', **pivot_axes)
df_tfidf = df.pivot(values='values_tfidf', **pivot_axes)

In [85]:
# Count-based pivot: rows are users, columns are products. Each filled cell is
# the list [stars] + count_vect for that (user, product) row of df; a pair
# with no row in df is shown as missing.
df_cnt

Products,P1,P2
users,Unnamed: 1_level_1,Unnamed: 2_level_1
U1,"[1, 1, 0, 0, 1, 0]","[2, 0, 0, 1, 1, 0]"
U2,,"[3, 1, 1, 0, 0, 1]"


In [86]:
# TF-IDF pivot: same layout as df_cnt, but each filled cell is the list
# [stars] + tfidf_vect for that (user, product) row of df.
df_tfidf

Products,P1,P2
users,Unnamed: 1_level_1,Unnamed: 2_level_1
U1,"[1, 0.14384103622589042, 0, 0, 0.1438410362258...","[2, 0, 0, 0.34657359027997264, 0.1438410362258..."
U2,,"[3, 0.09589402415059362, 0.23104906018664842, ..."


In [91]:
def cosine_similarity(v1,v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    SciPy exposes the cosine *distance*; similarity is one minus that value.
    """
    distance = spatial.distance.cosine(v1, v2)
    return 1 - distance

In [93]:
# Compare the P2 reviews in the first and second pivot rows. The [1:] slice
# drops the leading star rating so only the TF-IDF weights are compared.
first_row_p2 = df_tfidf.iloc[0]['P2'][1:]
second_row_p2 = df_tfidf.iloc[1]['P2'][1:]
cosine_similarity(first_row_p2, second_row_p2)

0.0

In [94]:
# Print the two P2 TF-IDF vectors (star rating stripped) that were compared.
for row_position in (0, 1):
    print(df_tfidf.iloc[row_position]['P2'][1:])

[0, 0, 0.34657359027997264, 0.14384103622589042, 0]
[0.09589402415059362, 0.23104906018664842, 0, 0, 0.23104906018664842]


In [48]:
# m --> number of unique words (the vector length / columns), n --> number of documents in the corpus

def count_vectorizer(col):    # custom countvectorizer
    """Return a presence vector for one tokenised document.

    The result has one entry per word of the module-level ``unique_words``
    list, in that order: 1 when the word occurs in ``col`` and 0 otherwise.
    Repeated occurrences still give 1, so this is a binary indicator rather
    than an occurrence count.

    Parameters
    ----------
    col : list of str
        Tokens of a single document.

    Returns
    -------
    list of int
        ``len(unique_words)`` flags, each 0 or 1.
    """
    # The vocabulary is `unique_words`. The previous body indexed `full_list`,
    # which is not defined anywhere in the visible notebook.
    return [1 if word in col else 0 for word in unique_words]



# Function to check how many documents in the corpus contain the word
def idf(word):
    """Return how many documents of the module-level ``corpus`` contain ``word``.

    Despite the name, this is the raw document frequency; the logarithmic
    inverse is applied by the caller.
    """
    return sum(1 for document in corpus if word in document)


def tfidf_vectorizer(col):    # custom tfidfvectorizer
    """Return the TF-IDF vector of one tokenised document.

    The result has one entry per word of the module-level ``unique_words``
    list, in that order. A word that is absent from ``col`` gets 0. A word
    that is present gets::

        (occurrences in col / len(col)) * log((n + 1) / (df(word) + 1))

    where ``n`` is the number of documents in the module-level ``corpus`` and
    ``df(word)`` is the document frequency returned by ``idf``.

    Parameters
    ----------
    col : list of str
        Tokens of a single document.

    Returns
    -------
    list
        ``len(unique_words)`` weights; literal 0 for absent words.
    """
    # Corpus size. The previous body read a name `n` that was never assigned
    # in the function or anywhere in the visible notebook.
    n = len(corpus)
    counter = Counter(col)
    doc_length = len(col)

    lst = []
    for word in unique_words:
        if word in counter:
            tf = counter[word] / doc_length
            inv_df = np.log((n + 1) / (idf(word) + 1))
            lst.append(tf * inv_df)
        else:
            lst.append(0)
    return lst

In [42]:
# Spot-check on document `c`; the saved output below is [1, 1, 0, 0, 1].
count_vectorizer(c)

[1, 1, 0, 0, 1]

In [45]:
# Spot-check on document `c`; words absent from `c` get weight 0.
tfidf_vectorizer(c)

[0.09589402415059362, 0.23104906018664842, 0, 0, 0.23104906018664842]

In [47]:
# Show both vectorisations for every document in the corpus.
for document in corpus:
    count_vector = count_vectorizer(document)
    print(f'The CountVector is {count_vector}')
    tfidf_vector = tfidf_vectorizer(document)
    print(f'The TFIDF vector is {tfidf_vector}')
    print('\n')

The CountVector is [1, 0, 0, 1, 0]
The TFIDF vector is [0.14384103622589042, 0, 0, 0.14384103622589042, 0]


The CountVector is [0, 0, 1, 1, 0]
The TFIDF vector is [0, 0, 0.34657359027997264, 0.14384103622589042, 0]


The CountVector is [1, 1, 0, 0, 1]
The TFIDF vector is [0.09589402415059362, 0.23104906018664842, 0, 0, 0.23104906018664842]


