# Assignment 4.2

Using the Corpus based on the Wikipedia featured articles dataset from the lesson on text processing, select five articles. You will use these articles to calculate the document similarity. Report the similarity between the documents for each distance measure using a table in the following format.

In [None]:
           |Article 1  |  Article 2  |  Article 3  |  Article 4  |  Article 5|
Article 1  |x.yy       |x.yy         |x.yy         |x.yy         |x.yy       |
Article 2  |x.yy       |x.yy         |x.yy         |x.yy         |x.yy       |
Article 3  |x.yy       |x.yy         |x.yy         |x.yy         |x.yy       |
Article 4  |x.yy       |x.yy         |x.yy         |x.yy         |x.yy       |
Article 5  |x.yy       |x.yy         |x.yy         |x.yy         |x.yy       |

# Declaration, reading and data preparation

In [2]:
# Import libraries
import pandas as pd
import json
from nltk.corpus import stopwords
import re
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# English stop word list from the NLTK corpus; used further down to drop
# stop words from the article texts.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed
# locally (e.g. via nltk.download('stopwords')) -- confirm for a fresh environment.
stop_words = stopwords.words('english')

In [3]:
# Read a JSON Lines file as a data frame
def readfiles(dirdata, infile):
    """Load one JSON Lines file into a pandas DataFrame.

    Parameters
    ----------
    dirdata : str
        Directory that contains the file.
    infile : str
        Name of the file inside ``dirdata``; one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    full_filename = os.path.join(dirdata, infile)

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(full_filename, 'r', encoding='utf-8') as fi:
        # Blank lines (for example a trailing empty line) carry no record
        # and would make json.loads fail, so they are skipped.
        data = [json.loads(line) for line in fi if line.strip()]

    # Create data frame from the json data
    return pd.DataFrame(data)

In [4]:
# Lower-case every cell, strip punctuation and escaped newlines
def cleantext(df):
    """Return a lower-cased, punctuation-free string version of ``df``.

    Every column is cast to ``str`` first, so non-string cells are replaced
    by their string representation. Angle brackets are kept. The
    two-character sequence backslash + "n" and any remaining backslash are
    each replaced by a single space.
    """
    # ASCII punctuation without the angle brackets
    punc = string.punctuation.replace('<', '').replace('>', '')
    # Inside this character class the backslash of string.punctuation acts as
    # an escape for the following ']', so the backslash itself is NOT matched
    # here; the two replacements at the end of the chain deal with it.
    pat = re.compile(f'[{punc}]')

    lowered = df.apply(lambda x: x.astype(str).str.lower())

    return (
        lowered
        .replace(pat, '')                     # drop punctuation
        .replace(r'\\n', ' ', regex=True)     # backslash + "n" -> space
        .replace(r'\\', ' ', regex=True)      # leftover backslashes -> space
    )

In [5]:
# Create set of words from each data frame text column
def createwordset(df, col):
    """Collect the distinct whitespace-separated tokens of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the column whose strings are tokenised.

    Returns
    -------
    set
        Every distinct token found in the column. Cells that are missing
        (NaN/None) contribute nothing instead of raising ``TypeError``.
    """
    results = set()
    # .str.split() yields a list per string cell and a missing value for
    # cells it cannot split; dropna() removes those before the update.
    for tokens in df[col].str.split().dropna():
        results.update(tokens)
    return results

In [9]:
# Directory that holds the JSON Lines files
dirdata = 'data/wikipedia/featured-articles'

# The five files that are compared in this notebook
files = ['featured-articles_011.jsonl',
         'featured-articles_012.jsonl',
         'featured-articles_013.jsonl',
         'featured-articles_014.jsonl',
         'featured-articles_015.jsonl']

# Read and clean each file in list order; one cleaned data frame per file
article0, article1, article2, article3, article4 = (
    cleantext(readfiles(dirdata, name)) for name in files
)

In [10]:
# Remove stop words
# Membership is tested once per token, so look tokens up in a set instead of
# scanning the stop word list each time. The list itself is left untouched.
# NOTE(review): cleantext() has already stripped apostrophes, so any stop word
# spelled with one will never match a token here -- confirm whether that matters.
stop_word_set = set(stop_words)


def _without_stop_words(article):
    """Return a one-column ('texts') frame: ``section_texts`` minus stop words."""
    kept = article['section_texts'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
    )
    return pd.DataFrame({'texts': kept})


article0_txt, article1_txt, article2_txt, article3_txt, article4_txt = (
    _without_stop_words(article)
    for article in (article0, article1, article2, article3, article4)
)

# One long string per file: all rows joined with a single space
txt0, txt1, txt2, txt3, txt4 = (
    frame['texts'].str.cat(sep=' ')
    for frame in (article0_txt, article1_txt, article2_txt, article3_txt, article4_txt)
)

In [11]:
# Create sets of words to compare
# (the former `dN_words = {}` placeholders were dead stores -- and dicts, not
# sets -- that were overwritten immediately, so they are gone)
col = 'texts'

d0_words, d1_words, d2_words, d3_words, d4_words = (
    createwordset(frame, col)
    for frame in (article0_txt, article1_txt, article2_txt, article3_txt, article4_txt)
)

# Jaccard Similarity (Jaccard Index)

In [None]:
# Calculate Jaccard similarity
def jaccard_distance(d1_words, d2_words):
    """Return the Jaccard similarity of two word collections.

    Despite its name, this function returns the similarity
    ``|A & B| / |A | B|`` (1.0 for identical sets, 0.0 for disjoint sets),
    not the distance ``1 - similarity``. The name is kept for existing callers.

    Parameters
    ----------
    d1_words, d2_words : iterable of hashable
        Words of the two documents; duplicates are ignored.

    Returns
    -------
    float
        Similarity in ``[0, 1]``. Two empty collections are treated as
        identical and give 1.0 instead of raising ``ZeroDivisionError``.
    """
    d1_unique = set(d1_words)
    d2_unique = set(d2_words)
    num_total = len(d1_unique | d2_unique)
    if num_total == 0:
        # Both inputs are empty: the ratio is undefined, report "identical".
        return 1.0
    num_both = len(d1_unique & d2_unique)
    return num_both / num_total

In [17]:
# Create final output
words = ['Article 11','Article 12','Article 13','Article 14','Article 15']

# Word sets in the same order as the labels above
word_sets = [d0_words, d1_words, d2_words, d3_words, d4_words]

# One column per article; each entry compares that article with every article
# (itself included), replacing 25 hand-written calls.
lst_jd = {
    label: [jaccard_distance(own, other) for other in word_sets]
    for label, own in zip(words, word_sets)
}

# Create DataFrame with the article labels on both axes
df_jd = pd.DataFrame(lst_jd, index=words)

print(df_jd)

            Article 11  Article 12  Article 13  Article 14  Article 15
Article 11    1.000000    0.318816    0.321713    0.320637    0.312409
Article 12    0.318816    1.000000    0.317617    0.316146    0.303839
Article 13    0.321713    0.317617    1.000000    0.322966    0.313422
Article 14    0.320637    0.316146    0.322966    1.000000    0.322123
Article 15    0.312409    0.303839    0.313422    0.322123    1.000000


# Cosine Similarity

In [None]:
# Vectorize using tf/idf
def get_vectors(*strs):
    """Return the TF-IDF vectors of the given documents.

    Parameters
    ----------
    *strs : str
        The documents, one string per positional argument.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document, in argument order, and one
        column per vocabulary term learned from these documents.
    """
    text = list(strs)
    # The documents must not be passed to the constructor: its first
    # parameter is `input` (how to read the data), not the corpus itself.
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(text).toarray()

# Create cosine similarity
def get_cosine_sim(*strs):
    """Return the pairwise cosine similarity of the given documents.

    The documents are vectorised with ``get_vectors``; the result of
    ``cosine_similarity`` on those vectors is returned unchanged.
    """
    tfidf_rows = list(get_vectors(*strs))
    return cosine_similarity(tfidf_rows)

In [22]:
# Apply cosine similarity to the five concatenated texts
documents = (txt0, txt1, txt2, txt3, txt4)
cosim = get_cosine_sim(*documents)
cosim

array([[1.        , 0.91942789, 0.91489015, 0.91551904, 0.89558325],
       [0.91942789, 1.        , 0.91315338, 0.91265633, 0.88628739],
       [0.91489015, 0.91315338, 1.        , 0.91370187, 0.89143987],
       [0.91551904, 0.91265633, 0.91370187, 1.        , 0.90381425],
       [0.89558325, 0.88628739, 0.89143987, 0.90381425, 1.        ]])

In [39]:
# Create dataframe from ndarray so that we can output the result in the required format.
# The labels come from `words` (defined in the Jaccard output cell) instead of
# being typed out a second time, so both tables always share the same labels.
df_cosim = pd.DataFrame(cosim, index=words, columns=words, copy=True)

print(df_cosim)

            Article 11  Article 12  Article 13  Article 14  Article 15
Article 11    1.000000    0.919428    0.914890    0.915519    0.895583
Article 12    0.919428    1.000000    0.913153    0.912656    0.886287
Article 13    0.914890    0.913153    1.000000    0.913702    0.891440
Article 14    0.915519    0.912656    0.913702    1.000000    0.903814
Article 15    0.895583    0.886287    0.891440    0.903814    1.000000


**End of code**