## Mini Project - Text Similarity

This project illustrates multiplication of vectors and the use of cosine similarity method for estimation the correlation between vectors.


#### Store some text in three files:

- two of the files, let's call them A and B, contain texts, created by the same author X

- the third text - C - is created by another author - Y.

#### Create a program to convert the texts into vectors, to compare the vectors in pairs::

- A and B

- A and C

- B and C

and to suggest the authors of each text based on the calculated similarity coefficients.. You can use the provided example for directions.

In [2]:
import pandas as pd
import math

In [3]:
# Written by one author - Kim Larsen
koeb_bananer = open('koeb_bananer.txt','r', encoding='utf-8').read().lower()
midt_om_natten = open('midt_om_natten.txt','r', encoding='utf-8').read().lower()

# Written by another author - Sanne Salomonsen
i_en_taxa = open('i_en_taxa.txt','r', encoding='utf-8').read().lower()

In [4]:
koeb_bananer = koeb_bananer.split(' ')
midt_om_natten = midt_om_natten.split(' ')
i_en_taxa = i_en_taxa.split(' ')

In [5]:
all = set(koeb_bananer).union(set(midt_om_natten)).union(set(i_en_taxa))

In [6]:
dict_koeb_bananer = dict.fromkeys(all, 0) 
dict_midt_om_natten = dict.fromkeys(all, 0) 
dict_i_en_taxa = dict.fromkeys(all, 0) 

for word in koeb_bananer:
    dict_koeb_bananer[word]+=1

for word in midt_om_natten:
    dict_midt_om_natten[word]+=1
    
for word in i_en_taxa:
    dict_i_en_taxa[word]+=1

In [7]:
df = pd.DataFrame([dict_koeb_bananer, dict_midt_om_natten, dict_i_en_taxa])

In [9]:
def cosine(vector1, vector2):
     # calculate nominator as a dot product
     intersect = set(vector1.keys()) & set(vector2.keys())
     numerator = sum([vector1[x] * vector2[x] for x in intersect])
    
     # calculate the denominator 
     sum1 = sum([vector1[x] ** 2 for x in list(vector1.keys())])
     sum2 = sum([vector2[x] ** 2 for x in list(vector2.keys())])
    
     denominator = math.sqrt(sum1) * math.sqrt(sum2)
     if not denominator:
         return 0.0
     else:
         return float(numerator)/denominator


In [10]:
cosine(df.loc[0], df.loc[1])

0.22277459534602004

In [11]:
cosine(df.loc[0], df.loc[2])

0.2041844726856066

In [12]:
cosine(df.loc[1], df.loc[2])

0.1918368833932776

In [13]:
def computeTF(dicto, doc):
    tfDict = {}
    corpus = len(doc) # number of all words
    for word, wcount in dicto.items():
        tfDict[word] = wcount/float(corpus) # calculete the proportion
    return(tfDict)

In [14]:
tf1 = computeTF(dict_koeb_bananer, koeb_bananer)
tf2 = computeTF(dict_midt_om_natten, midt_om_natten)
tf3 = computeTF(dict_i_en_taxa, i_en_taxa)

In [15]:
tf = pd.DataFrame([tf1, tf2, tf3])
tf

Unnamed: 0,"bananer,",tog,drengen,pæn,sidde,stuerne.,hos,bananer,håber,du,...,onkel,"mig,",lidt,vand.,havde,mig.,pojken,åh,bli'r,det
0,0.003268,0.0,0.006536,0.009804,0.003268,0.0,0.01634,0.03268,0.0,0.009804,...,0.009804,0.0,0.003268,0.0,0.003268,0.029412,0.003268,0.009804,0.013072,0.029412
1,0.0,0.004695,0.0,0.0,0.0,0.004695,0.0,0.0,0.032864,0.0,...,0.0,0.004695,0.0,0.004695,0.0,0.0,0.0,0.037559,0.004695,0.00939
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
def computeIDF(docList):
    idf = {}
    N = len(docList)
    
    idf = dict.fromkeys(docList[0].keys(), 0)
    for word, wcount in idf.items():
        idf[word] = math.log10(N/(float(wcount) + 1))
        
    return(idf)

In [18]:
idfs = computeIDF([dict_koeb_bananer, dict_midt_om_natten, dict_i_en_taxa])
idfs

{'bananer,': 0.47712125471966244,
 'tog': 0.47712125471966244,
 'drengen': 0.47712125471966244,
 'pæn': 0.47712125471966244,
 'sidde': 0.47712125471966244,
 'stuerne.': 0.47712125471966244,
 'hos': 0.47712125471966244,
 'bananer': 0.47712125471966244,
 'håber': 0.47712125471966244,
 'du': 0.47712125471966244,
 'sin': 0.47712125471966244,
 'kviksølv': 0.47712125471966244,
 'for': 0.47712125471966244,
 'tikker.': 0.47712125471966244,
 "det'": 0.47712125471966244,
 'tættere': 0.47712125471966244,
 'vinduerne.': 0.47712125471966244,
 "nærmer'": 0.47712125471966244,
 'bindegal.': 0.47712125471966244,
 'kaldte': 0.47712125471966244,
 'med.': 0.47712125471966244,
 'noget!': 0.47712125471966244,
 'der': 0.47712125471966244,
 'af': 0.47712125471966244,
 'får': 0.47712125471966244,
 'sgu': 0.47712125471966244,
 'kan': 0.47712125471966244,
 'lys': 0.47712125471966244,
 'næppe': 0.47712125471966244,
 'mig': 0.47712125471966244,
 'men': 0.47712125471966244,
 'natten.': 0.47712125471966244,
 'et': 0

In [19]:
def computeTFIDF(tf, idfs):
    tfidf = {}
    for word, wcount in tf.items():
        tfidf[word] = wcount*idfs[word]
    return(tfidf)

In [20]:
idf1 = computeTFIDF(tf1, idfs)
idf2 = computeTFIDF(tf2, idfs)
idf3 = computeTFIDF(tf3, idfs)

In [21]:
idf= pd.DataFrame([idf1, idf2, idf3])
idf

Unnamed: 0,"bananer,",tog,drengen,pæn,sidde,stuerne.,hos,bananer,håber,du,...,onkel,"mig,",lidt,vand.,havde,mig.,pojken,åh,bli'r,det
0,0.001559,0.0,0.003118,0.004678,0.001559,0.0,0.007796,0.015592,0.0,0.004678,...,0.004678,0.0,0.001559,0.0,0.001559,0.014033,0.001559,0.004678,0.006237,0.014033
1,0.0,0.00224,0.0,0.0,0.0,0.00224,0.0,0.0,0.01568,0.0,...,0.0,0.00224,0.0,0.00224,0.0,0.0,0.0,0.01792,0.00224,0.00448
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
