### Import library

In [102]:
import pandas as pd
pd.set_option("display.max_colwidth", 200)
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

### Read and describe data from People wiki with pandas

In [82]:
df = pd.read_csv('./people_wiki.csv', sep=',',
                   encoding='latin-1', low_memory=False)


In [103]:
df.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his ...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from university of chicago in 1973 after studying psychiatry pharmacology and ophthalmology he is a full professor and vicechair of the department of psychia...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player who has been active in canadas blues scene since 1982 hailing from vancouver he crossed tens of thousands of miles playing club dates and festivals i...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower austria austria on 18 january 1942 is an austrian publisher and critic in the fields of science fiction and the fantasticrottensteiner studied journ...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn better known by his stagename genka is an estonian rapper and record producergenka started rapping in 1996 along with revo and dj paul oja who was gen...


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59071 entries, 0 to 59070
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URI     59071 non-null  object
 1   name    59071 non-null  object
 2   text    59071 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [104]:
df.describe().T

Unnamed: 0,count,unique,top,freq
URI,59071,59071,<http://dbpedia.org/resource/Digby_Morrell>,1
name,59071,59070,author),2
text,59071,59071,digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his ...,1


### Save value from text column to sentences array

In [86]:
sentences = df["text"]
sentences[0]

'digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his early senior football for west perth his 44game senior career for the falcons spanned 19982000 and he was the clubs leading goalkicker in 2000 at the age of 21 morrell was recruited to the australian football league by the kangaroos football club with its third round selection in the 2001 afl rookie draft as a forward he twice kicked five goals during his time with the kangaroos the first was in a losing cause against sydney in 2002 and the other the following season in a drawn game against brisbaneafter the 2003 season morrell was traded along with david teague to the carlton football club in exchange for corey mckernan he played 32 games for the blues before being delisted at the end of 2005 he continued to play victorian football league vfl football with the northern bullants carltons vfla

### Build inverted index

In [87]:
# Declare dictionary of terms in sentences
dictionary = {}
# Declare stop words and stemmer
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english", ignore_stopwords=True)


In [88]:
for senIdx, sentence in enumerate(sentences):
    # Tokenize words in sentence
    words = word_tokenize(sentence)
    for word in words:
        # Stemming words(remove morphological in word)
        word = stemmer.stem(word)
        # Ignore stop words in dictionary
        if word not in stop_words:
            # Ignore duplicate words in dictionary
            if word not in dictionary.keys():
                dictionary[word] = []
            dictionary[word] += [senIdx+2]


In [89]:
# Remove duplicate document index in dictionary
for key, value in dictionary.items():
    dictionary[key] = list(set(value))
# Convert dictionary to list
data_items = dictionary.items()
data_list = list(data_items)


In [105]:
# Convert list to dataframe for visualization
inverted_index = pd.DataFrame(data_list,columns=['term','document'])
inverted_index

Unnamed: 0,term,document
0,digbi,"[51745, 1698, 4802, 2, 39620, 21658, 52744, 51951, 58288, 46291, 42965, 4726, 16890, 22655]"
1,morrel,"[2, 44327, 51403, 41077, 45368, 25817]"
2,born,"[2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 60, 61, 6..."
3,10,"[32769, 2, 32772, 32775, 8, 32777, 16, 21, 23, 24, 32796, 30, 31, 38, 32811, 53, 54, 32821, 64, 65, 32855, 32859, 103, 32877, 125, 126, 32918, 154, 32922, 32923, 32924, 159, 32927, 161, 165, 32935..."
4,octob,"[2, 7, 32781, 23, 32794, 28, 30, 31, 32800, 37, 32805, 41, 32812, 32823, 76, 77, 32858, 32859, 93, 97, 98, 101, 102, 103, 32870, 32874, 107, 32880, 117, 32888, 121, 124, 32893, 32894, 127, 128, 12..."
5,1979,"[2, 8195, 49154, 40966, 16392, 57352, 16398, 32782, 57361, 16405, 32790, 57366, 40991, 57375, 24609, 8227, 40995, 40996, 49191, 57386, 16427, 32814, 49198, 57394, 49203, 16440, 8250, 41026, 8260, ..."
6,former,"[32768, 32769, 2, 32776, 32777, 32780, 32781, 32783, 16, 32785, 24, 32792, 27, 32796, 32797, 30, 32, 34, 35, 32802, 32805, 39, 32807, 41, 32809, 43, 32810, 32813, 32814, 32815, 48, 32817, 50, 51, ..."
7,australian,"[24576, 8193, 2, 24577, 40962, 8197, 16390, 57347, 57352, 57354, 57356, 16398, 32785, 32786, 40981, 24601, 28, 8228, 40996, 49189, 32809, 42, 8236, 24620, 57390, 24627, 32820, 8249, 24635, 49211, ..."
8,rule,"[2, 40962, 8197, 32781, 16398, 40974, 8208, 57357, 40979, 8216, 49177, 16410, 16420, 40998, 24615, 8236, 32813, 57390, 24627, 8245, 54, 24630, 8258, 16454, 57415, 49225, 8268, 57430, 24663, 24664,..."
9,footbal,"[2, 32775, 32780, 19, 23, 24, 32796, 30, 43, 50, 32819, 65, 32839, 77, 32853, 32857, 32859, 94, 97, 32868, 101, 103, 120, 32888, 124, 125, 138, 32908, 32917, 150, 153, 154, 155, 32926, 32927, 3293..."


### Find document contain keyword

In [91]:
def queryStringContainKeyword(keyword):
   return inverted_index[inverted_index.term.str.contains(keyword)]

In [92]:
def queryStringKeyword(keyword):
   return inverted_index[inverted_index.term == keyword]

In [106]:
queryStringContainKeyword("career")


Unnamed: 0,term,document
22,career,"[2, 32771, 32775, 32776, 32777, 32780, 15, 32787, 32790, 23, 27, 30, 32799, 32, 33, 32802, 35, 42, 32811, 32814, 49, 50, 32817, 32818, 53, 54, 55, 32819, 32821, 32823, 32825, 32827, 32828, 32831, ..."
1832,careerh,"[41344, 51840, 55683, 46980, 22278, 49030, 52742, 23433, 40458, 47500, 11917, 21261, 50835, 22, 10007, 44055, 35737, 26526, 46878, 46880, 2724, 23159, 18728, 28199, 48424, 54569, 50092, 5422, 7346..."
2054,careeraft,"[19473, 53528, 25, 154, 30362, 3486, 934, 26407, 14250, 41386, 10925, 39611, 6847, 12353, 4418, 6469, 30149, 2122, 6986, 23503, 47696, 40658, 3413, 42073, 12381, 7390, 5853, 24159, 38749, 5218, 43..."
2653,careerhigh,"[11264, 52736, 10758, 11273, 14857, 12811, 4620, 38924, 32271, 3600, 4627, 11284, 12819, 31256, 30745, 36376, 49184, 35, 57892, 47144, 2601, 57384, 6699, 33326, 12335, 6708, 46646, 53818, 28731, 5..."
3523,careery,[53]
8032,careermellor,[171]
11400,mlbcareer,[278]
12243,careerend,"[35075, 17148, 51846, 38924, 48014, 56980, 8606, 29215, 13728, 58924, 301, 2478, 10933, 26933, 33206, 42305, 38357, 55894, 58459, 25437, 51557, 13544, 19817, 56937, 6508, 25069, 14199, 54776, 3110..."
15679,careerdefin,"[26600, 29262, 27311, 28023, 441]"
15717,careermcbrid,[441]


In [107]:
queryStringKeyword("career")


Unnamed: 0,term,document
22,career,"[2, 32771, 32775, 32776, 32777, 32780, 15, 32787, 32790, 23, 27, 30, 32799, 32, 33, 32802, 35, 42, 32811, 32814, 49, 50, 32817, 32818, 53, 54, 55, 32819, 32821, 32823, 32825, 32827, 32828, 32831, ..."
