In [1]:
# imports
import pandas as pd
import mysql.connector
import spacy
from nltk.stem import PorterStemmer

In [2]:
# function declarations

# get papers
def getPapers():
    connection = mysql.connector.connect(
        host="localhost",
        user="root",
        password="password",
        database="mysql"
    )

    # return papers into a pandas dataframe
    return pd.read_sql("SELECT id, title, author, abstract FROM papers", con=connection)

# to lowercase
def toLower(text):
    return text.astype(str).str.lower()

# function to tokenise text
def tokenise(text):
    sp = spacy.load("en_core_web_sm")
    return text.apply(lambda x: sp(x))

# function to remove punctuation instances
def removePunctuation(text):
    return text.str.replace('[^\w\s]','')

# function to remove stop words
def removeStops(text):
    sp = spacy.load("en_core_web_sm")
    stops = sp.Defaults.stop_words
    return [word for word in text if not word in stops]

# calculates and prints the similarity of documents
def getSimilarity(id):
    length = len(dfPapers)
    doc1 = dfPapers['abstract'].loc[0]
    print("Similarities to Document " + str(id+1))
    i = 0
    while i < length:
        if i != id:
            doc2 = dfPapers['abstract'].loc[i]
            print("Document " + str(i+1) + ", Title: " + str(dfPapers['title'].loc[0]) + ", Similarity: " + str(doc1.similarity(doc2)))
        i = i + 1

In [3]:
# database connection to get papers
dfPapers = getPapers()

print(dfPapers)

    id                                              title  \
0    1  A randomized, home-based, childhood obesity in...   
1    5  A three-year multifaceted intervention to prev...   
2    6  A behavioural change intervention study for th...   
3    7  Family-based childhood obesity prevention inte...   
4    8  A systematic review and meta-analysis of the o...   
5    9  Growing Healthy Together: protocol for a rando...   
6   10  OB CITY—Definition of a Family-Based Intervent...   
7   11  Family-based, healthy living\r\nintervention f...   
8   12  Short- and Long-Term Beneficial Effects of a C...   
9   13  Effectiveness of a KindergartenBased Intervent...   
10  14  Riding into Health: A Case Study on an Equine-...   
11  15          Preventing childhood obesity: what works?   
12  17  Evaluation and Treatment of Severe Obesity in ...   
13  18   Prevention Models of Childhood Obesity in Sweden   
14  19  Understanding a successful obesity prevention ...   
15  20  The effect of co

In [4]:
# the data needs to be preprocessed before any analysis can be performed

# forcing relevant fields to lowercase
dfPapers["title"] = toLower(dfPapers["title"])
dfPapers["abstract"] = toLower(dfPapers["abstract"])

# remove punctuation
dfPapers['title'] = removePunctuation(dfPapers['title'])
dfPapers['abstract'] = removePunctuation(dfPapers['abstract'])

# tokenise relevant data
dfPapers['title'] = tokenise(dfPapers['title'])
dfPapers['abstract'] = tokenise(dfPapers['abstract'])

# remove stop words
dfPapers['title'] = removeStops(dfPapers['title'])
dfPapers['abstract'] = removeStops(dfPapers['abstract'])


# errors are encountered here. stemming will improve the accuracy of the model, but we can still operate without it
# stem words
#ps = PorterStemmer()

# new dataframe for final processed data
#dfClean = dfPapers

#dfClean['title'] = dfPapers['title'].apply(lambda x: [ps.stem(y) for y in x])
#dfClean['abstract'] = dfPapers['abstract'].apply(lambda x: [ps.stem(y) for y in x])

# print processed data
print(dfPapers)



    id                                              title  \
0    1  (a, randomized, homebased, childhood, obesity,...   
1    5  (a, threeyear, multifaceted, intervention, to,...   
2    6  (a, behavioural, change, intervention, study, ...   
3    7  (familybased, childhood, obesity, prevention, ...   
4    8  (a, systematic, review, and, metaanalysis, of,...   
5    9  (growing, healthy, together, protocol, for, a,...   
6   10  (ob, citydefinition, of, a, familybased, inter...   
7   11  (familybased, healthy, living, \r\n, intervent...   
8   12  (short, and, longterm, beneficial, effects, of...   
9   13  (effectiveness, of, a, kindergartenbased, inte...   
10  14  (riding, into, health, a, case, study, on, an,...   
11  15      (preventing, childhood, obesity, what, works)   
12  17  (evaluation, and, treatment, of, severe, obesi...   
13  18  (prevention, models, of, childhood, obesity, i...   
14  19  (understanding, a, successful, obesity, preven...   
15  20  (the, effect, of

In [5]:
# calculate cosine similarities and print out for document 1

getSimilarity(0)

Similarities to Document 1
Document 2, Title: a randomized homebased childhood obesity intervention delivered by patient navigators, Similarity: 0.962713185078254
Document 3, Title: a randomized homebased childhood obesity intervention delivered by patient navigators, Similarity: 0.9638105260023572
Document 4, Title: a randomized homebased childhood obesity intervention delivered by patient navigators, Similarity: 0.9449519892119711
Document 5, Title: a randomized homebased childhood obesity intervention delivered by patient navigators, Similarity: 0.8583864570911615
Document 6, Title: a randomized homebased childhood obesity intervention delivered by patient navigators, Similarity: 0.9659924798276139
Document 7, Title: a randomized homebased childhood obesity intervention delivered by patient navigators, Similarity: 0.9404103219131492
Document 8, Title: a randomized homebased childhood obesity intervention delivered by patient navigators, Similarity: 0.9167758717848711
Document 9, Tit

  print("Document " + str(i+1) + ", Title: " + str(dfPapers['title'].loc[0]) + ", Similarity: " + str(doc1.similarity(doc2)))
