In [1]:
import numpy as np 
import pandas as pd 
import os
import re
import string
import numpy as np 
import random
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tqdm import tqdm
import os
import nltk
import spacy
import random

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [4]:
df = pd.read_csv("spam.csv", encoding="latin-1")

df = df.dropna(how="any", axis=1)
df.columns = ['target', 'message']

df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['message_len'] = df['message'].apply(lambda x: len(x.split(' ')))
df.head()

Unnamed: 0,target,message,message_len
0,ham,"Go until jurong point, crazy.. Available only ...",20
1,ham,Ok lar... Joking wif u oni...,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,ham,U dun say so early hor... U c already then say...,11
4,ham,"Nah I don't think he goes to usf, he lives aro...",13


In [6]:
df['message_clean'] = df['message'].apply(clean_text)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,11,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah i dont think he goes to usf he lives aroun...


In [7]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text
    
df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry wkly comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goes usf lives around though


In [8]:
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [9]:
df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


In [10]:
def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    
    return text

In [11]:
df['message_clean'] = df['message_clean'].apply(preprocess_data)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say ear hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


In [12]:
df.head(-5)

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say ear hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though
...,...,...,...,...
5562,ham,Ok lor... Sony ericsson salesman... I ask shuh...,18,ok lor soni ericsson salesman ask shuhui say q...
5563,ham,Ard 6 like dat lor.,5,ard like dat lor
5564,ham,Why don't you wait 'til at least wednesday to ...,15,dont wait til least wednesday see get
5565,ham,Huh y lei...,3,huh lei


In [13]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Document Expansion

In [14]:
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
def wordnet_doc_expansion(doc,k=3):
    ps = PorterStemmer()
    upd_doc = doc.split()
    synonyms =[]
    res=[w for w in upd_doc]
    for q in upd_doc:
        q_stem=ps.stem(q)
        for syn in wordnet.synsets(q_stem):
            for l in syn.lemmas():
                synonyms.append(l.name())
        synonyms=list(set(synonyms))
        synonyms=synonyms[:k]
        for w in synonyms:
            w_stem=ps.stem(w)
            if  w_stem!=q_stem:
                  res.append(w)
            synonyms=[]
    return ' '.join(res)

In [15]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [16]:
i = 'mean fat head'
upd_doca = ['mean', 'fat', 'head']
resa =[w for w in upd_doca]
resa

['mean', 'fat', 'head']

In [17]:
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('meaning')

'mean'

In [18]:
documents = df['message_clean']
docs_expansion = []
for i in documents:
  docs_expansion.append(wordnet_doc_expansion(i,1))
# docs_expansion = wordnet_doc_expansion(documents,2)

In [19]:
docs_expansion

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat crack direct help northward with_child earthly_concern Pelican_State east sideboard grow Cupid',
 'ok lar joke wif oni all_right laugh Office_of_Naval_Intelligence',
 'free entri wkli comp win fa cup final tkts may text fa receiv entri questionstd txt ratetc appli free_people advance transfuse terminal',
 'dun say ear hor alreadi say grayish_brown pronounce auricle pronounce',
 'nah dont think goe usf live around though imagine endure close_to',
 'freemsg hey darl week word back id like fun still tb ok xxx std chgs send å£ rcv workweek binding same merriment inactive T.B. all_right 30 social_disease direct',
 'even brother like speak treat like aid patent fifty-fifty comrade same verbalize goody same help manifest',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun bespeak go_under telephoner weigh supporter',
 'winner valu network custom select receiv

In [20]:
#Word Embeddings: GloVe

In [21]:
texts = docs_expansion

# GLOVE

In [22]:
!pip install Tokenizer  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Tokenizer
  Downloading tokenizer-3.4.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.3 MB/s 
[?25hInstalling collected packages: Tokenizer
Successfully installed Tokenizer-3.4.1


In [23]:
# Calculate the length of our vocabulary
from keras.preprocessing.text import Tokenizer
word_tokenizer = Tokenizer()

In [24]:
word_tokenizer.fit_on_texts(texts)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

7998

In [25]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
import tensorflow as tf
def embed(corpus): 
    return word_tokenizer.texts_to_sequences(corpus)

In [27]:
longest_train = max(texts, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))

train_padded_sentences = tf.keras.preprocessing.sequence.pad_sequences(
    embed(texts), 
    length_long_sentence, 
    padding='post'
)

X = train_padded_sentences

In [28]:
print(X)

[[   4 4202  496 ...    0    0    0]
 [  23  444  869 ...    0    0    0]
 [   8  624  944 ...    0    0    0]
 ...
 [7996 1531 7997 ...    0    0    0]
 [ 283 1867  119 ...    0    0    0]
 [2802  636  335 ...    0    0    0]]


# TFIDF
Menghitung TF IDF untuk setiap dokumen dan hasilnya digunakan untuk clustering dokumen

In [29]:
import pandas as pd
import numpy as np

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X)
print(tfidf.shape )                        

(5572, 114)


# Document Clustering with KMeans

In [31]:
corpus = texts

In [32]:
from sklearn.cluster import KMeans

num_clusters = 5 #Change it according to your data.
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf)
clusters = km.labels_.tolist()

smstext = {'SMS Text' : corpus, 'Cluster Number' : clusters} #Creating dict having doc with the corresponding cluster number.
frame=pd.DataFrame(smstext,index=[clusters], columns=['SMS Text','Cluster Number']) # Converting it into a dataframe.

print("\n")
print(frame) #Print the doc with the labeled cluster number.
print("\n")
print(frame['Cluster Number'].value_counts()) #Print the counts of doc belonging to each cluster.



                                             SMS Text  Cluster Number
2   go jurong point crazi avail bugi n great world...               2
2   ok lar joke wif oni all_right laugh Office_of_...               2
2   free entri wkli comp win fa cup final tkts may...               2
4   dun say ear hor alreadi say grayish_brown pron...               4
4   nah dont think goe usf live around though imag...               4
..                                                ...             ...
2   time tri contact å£ pound prize claim easi cal...               2
4   ì b go esplanad fr home group_B crack atomic_n...               4
3               piti mood soani suggest mode indicate               3
2   guy bitch act like id interest buy someth el n...               2
1                    rofl true name truthful nominate               1

[5572 rows x 2 columns]


2    2307
4    1017
3     805
0     748
1     695
Name: Cluster Number, dtype: int64


In [33]:
frame.head()

Unnamed: 0,SMS Text,Cluster Number
2,go jurong point crazi avail bugi n great world...,2
2,ok lar joke wif oni all_right laugh Office_of_...,2
2,free entri wkli comp win fa cup final tkts may...,2
4,dun say ear hor alreadi say grayish_brown pron...,4
4,nah dont think goe usf live around though imag...,4


In [34]:
doc_id = []
for i in range(1, (len(frame['SMS Text']))+1):
  doc_id.append('D' + str(i))

In [35]:
frame['docID'] = doc_id
frame.head(5)

Unnamed: 0,SMS Text,Cluster Number,docID
2,go jurong point crazi avail bugi n great world...,2,D1
2,ok lar joke wif oni all_right laugh Office_of_...,2,D2
2,free entri wkli comp win fa cup final tkts may...,2,D3
4,dun say ear hor alreadi say grayish_brown pron...,4,D4
4,nah dont think goe usf live around though imag...,4,D5


In [36]:
group_doc = []
for i in frame['Cluster Number']:
  group_doc.append('Group' + str(i))

In [37]:
frame['GroupDoc'] = group_doc
frame.head(5)

Unnamed: 0,SMS Text,Cluster Number,docID,GroupDoc
2,go jurong point crazi avail bugi n great world...,2,D1,Group2
2,ok lar joke wif oni all_right laugh Office_of_...,2,D2,Group2
2,free entri wkli comp win fa cup final tkts may...,2,D3,Group2
4,dun say ear hor alreadi say grayish_brown pron...,4,D4,Group4
4,nah dont think goe usf live around though imag...,4,D5,Group4


In [38]:
df_cluster_0 = frame['SMS Text'][0].reset_index(drop=True)

In [39]:
cluster_0 = []
cluster_1 = []
cluster_2 = []
cluster_3 = []
cluster_4 = []

In [40]:
for clusnum in range (0,5):
  data_in_cluster = frame['SMS Text'][clusnum].reset_index(drop=True)
  
  for doc in data_in_cluster:
    if clusnum == 0:
      cluster_0.append(doc)
    if clusnum == 1:
      cluster_1.append(doc)
    if clusnum == 2:
      cluster_2.append(doc)
    if clusnum == 3:
      cluster_3.append(doc)
    if clusnum == 4:
      cluster_4.append(doc)

In [41]:
clusnum_0_join = ''.join(cluster_0)
clusnum_1_join = ''.join(cluster_1)
clusnum_2_join = ''.join(cluster_2)
clusnum_3_join = ''.join(cluster_3)
clusnum_4_join = ''.join(cluster_4)

In [42]:
d = {'groupID': ['Group0','Group1','Group2','Group3','Group4'], 'text': [clusnum_0_join,clusnum_1_join, clusnum_2_join, clusnum_3_join, clusnum_4_join]}
grupindf = pd.DataFrame(data=d)
grupindf

Unnamed: 0,groupID,text
0,Group0,oh kim watch Ohio vigilserious spell name go n...
1,Group1,iû÷m go tri month ha ha joke crack laughfffff...
2,Group2,go jurong point crazi avail bugi n great world...
3,Group3,lol alway convinck tell anyth one_thousand sep...
4,Group4,dun say ear hor alreadi say grayish_brown pron...


In [43]:
# menghitung jumlah kata di dalam dataset

new_all_text = " ".join(frame['SMS Text'].apply(clean_text).values)
vocab = []
vocab = np.unique(word_tokenize(new_all_text))
vocab = [word for word in vocab if word not in stopwords.words('english')]

In [44]:
vocab

['aa',
 'aah',
 'aaniy',
 'aaooooright',
 'aathilov',
 'aathiwh',
 'ab',
 'abatransitnumber',
 'abbey',
 'abbreviate',
 'abdomen',
 'abeg',
 'abel',
 'aberdeen',
 'abi',
 'abil',
 'abiola',
 'abj',
 'abl',
 'abnorm',
 'abode',
 'abouta',
 'abroad',
 'absenc',
 'absolut',
 'abstract',
 'abt',
 'abta',
 'aburo',
 'abus',
 'ac',
 'academ',
 'academyaward',
 'acc',
 'accent',
 'accentur',
 'accept',
 'access',
 'accid',
 'accolade',
 'accommod',
 'accommodationvouch',
 'accomod',
 'accompany',
 'accomplishment',
 'accord',
 'accordin',
 'accordinglyor',
 'account',
 'accumul',
 'accuracy',
 'accurate',
 'ach',
 'achanammarakheshqatar',
 'achiev',
 'achiever',
 'acid',
 'acknowledg',
 'acknowledgment',
 'acnt',
 'across',
 'acsmsreward',
 'act',
 'actin',
 'action',
 'activ',
 'activate',
 'activity',
 'actor',
 'actual',
 'ad',
 'adam',
 'add',
 'addamsfa',
 'addi',
 'addict',
 'address',
 'addressul',
 'adequate',
 'adew',
 'adhd',
 'adi',
 'adieu',
 'adjoin',
 'adjust',
 'admin',
 'admin

In [45]:
data = frame

In [59]:
def term_document_matrix(data, vocab= None, document_index= 'ID', text= 'text'):
    """Calculate frequency of term in the document.
    
    parameter: 
        data: DataFrame. 
        Frequency of word calculated against the data.
        
        vocab: list of strings.
        Vocabulary of the documents    
        
        document_index: str.
        Column name for document index in DataFrame passed.
        
        text: str
        Column name containing text for all documents in DataFrame,
        
    returns:
        vocab_index: DataFrame.
        DataFrame containing term document matrix.
        """
    
    vocab_index = pd.DataFrame(columns=data[document_index], index= vocab).fillna(0)
    
    for word in vocab_index.index:
        
        for doc in data[document_index]:
            
            freq = data[data[document_index] == doc][text].values[0].count(word)
            vocab_index.loc[word,doc] = freq
    
    return vocab_index

In [60]:
similarity_index = ''
similarity_index = term_document_matrix(grupindf,vocab,'groupID','text')
similarity_index

groupID,Group0,Group1,Group2,Group3,Group4
aa,7,14,11,7,5
aah,0,3,2,0,2
aaniy,0,0,1,0,0
aaooooright,0,1,0,0,0
aathilov,3,0,0,0,0
...,...,...,...,...,...
ì,17,11,82,22,47
ìll,1,0,0,0,1
ì©,0,0,0,0,3
ìï,8,2,18,9,16


In [61]:
# bergantung pada term_document_matrix
def tf_idf_score(vocab_index, document_index, inv_df= 'inverse_document_frequency'):
    """
    Calculate tf-idf score for vocabulary in documents
    
    parameter:
        vocab_index: DataFrame.
        Term document matrix.
        
        document_index: list or tuple.
        Series containing document ids.
        
        inv_df: str.
        Name of the column with calculated inverse document frequencies.
        
    returns:
        vocab_index: DataFrame.
        DataFrame containing term document matrix and document frequencies, inverse document frequencies and tf-idf scores
    """
    total_docx = len(document_index)
    vocab_index['document_frequency'] = vocab_index.sum(axis= 1)
    vocab_index['inverse_document_frequency'] = np.log2( total_docx / vocab_index['document_frequency'])
    
    for word in vocab_index.index:
        
        for doc in document_index:
            
                tf_idf = np.log2(1 + vocab_index.loc[word,doc]) * np.log2(vocab_index.loc[word][inv_df])
                vocab_index.loc[word,'tf_idf_'+doc] = tf_idf
    
    return vocab_index

In [62]:
similarity_index = tf_idf_score(similarity_index, grupindf.groupID.values)
similarity_index



groupID,Group0,Group1,Group2,Group3,Group4,document_frequency,inverse_document_frequency,tf_idf_Group0,tf_idf_Group1,tf_idf_Group2,tf_idf_Group3,tf_idf_Group4
aa,7,14,11,7,5,44,-3.137504,,,,,
aah,0,3,2,0,2,7,-0.485427,,,,,
aaniy,0,0,1,0,0,1,2.321928,0.000000,0.000000,1.215323,0.0,0.000000
aaooooright,0,1,0,0,0,1,2.321928,0.000000,1.215323,0.000000,0.0,0.000000
aathilov,3,0,0,0,0,3,0.736966,-0.880662,-0.000000,-0.000000,-0.0,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
ì,17,11,82,22,47,179,-5.161888,,,,,
ìll,1,0,0,0,1,2,1.321928,0.402644,0.000000,0.000000,0.0,0.402644
ì©,0,0,0,0,3,3,0.736966,-0.000000,-0.000000,-0.000000,-0.0,-0.880662
ìï,8,2,18,9,16,53,-3.405992,,,,,


# Ranking

In [69]:
def query_processing(query):
    """
    Pre-processing query to accomodate calculations for tf-idf score
    
    parameter:
        query: str.
        Textual query input to the system.
        
    returns:
        query: str.
        Cleaned string.
        """
    query= re.sub('\W',' ',query)
    query= query.strip().lower()
    query= " ".join([word for word in query.split() if word not in stopwords.words('english')])
    
    return query

In [70]:
def query_score(vocab_index, query):
    """
    Calculate tf-idf score for query terms
    
    parameter:
        vocab_index: DataFrame.
        Term document matrix with inverse document frequency and term frequencies calculated.
        
        query: str.
        Query submitted to the system
        
    returns:
        vocab_index: DataFrame.
        Term document matrix with tf-idf scores for terms per document and query terms.
    """
    for word in np.unique(query.split()):
        
        freq = query.count(word)
        
        if word in vocab_index.index:
            
            tf_idf = np.log2(1+freq) * np.log2(vocab_index.loc[word].inverse_document_frequency)
            vocab_index.loc[word,"query_tf_idf"] = tf_idf
            vocab_index['query_tf_idf'].fillna(0, inplace=True)
        
        if word not in vocab_index.index:
          vocab_index.loc[word,"query_tf_idf"] = 0
          vocab_index.fillna(0, inplace=True)

    
    return vocab_index

In [71]:
similarity_index.to_csv('term_doc_matrix.csv')

In [72]:
test= pd.read_csv('term_doc_matrix.csv')
test = test.set_index('Unnamed: 0')

In [73]:
# query= "25 batman alone woman"
# similarity_index = query_score(test,query)
# similarity_index

# Rank for Q1

In [74]:
test= pd.read_csv('term_doc_matrix.csv')
test = test.set_index('Unnamed: 0')
test

Unnamed: 0_level_0,Group0,Group1,Group2,Group3,Group4,document_frequency,inverse_document_frequency,tf_idf_Group0,tf_idf_Group1,tf_idf_Group2,tf_idf_Group3,tf_idf_Group4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
aa,7,14,11,7,5,44,-3.137504,,,,,
aah,0,3,2,0,2,7,-0.485427,,,,,
aaniy,0,0,1,0,0,1,2.321928,0.000000,0.000000,1.215323,0.0,0.000000
aaooooright,0,1,0,0,0,1,2.321928,0.000000,1.215323,0.000000,0.0,0.000000
aathilov,3,0,0,0,0,3,0.736966,-0.880662,-0.000000,-0.000000,-0.0,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
ì,17,11,82,22,47,179,-5.161888,,,,,
ìll,1,0,0,0,1,2,1.321928,0.402644,0.000000,0.000000,0.0,0.402644
ì©,0,0,0,0,3,3,0.736966,-0.000000,-0.000000,-0.000000,-0.0,-0.880662
ìï,8,2,18,9,16,53,-3.405992,,,,,


In [75]:
query= "aa aaooooright canteen woman"
similarity_index = ''
similarity_index = query_score(test,query)
similarity_index



Unnamed: 0_level_0,Group0,Group1,Group2,Group3,Group4,document_frequency,inverse_document_frequency,tf_idf_Group0,tf_idf_Group1,tf_idf_Group2,tf_idf_Group3,tf_idf_Group4,query_tf_idf
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
aa,7,14,11,7,5,44,-3.137504,,,,,,0.000000
aah,0,3,2,0,2,7,-0.485427,,,,,,0.000000
aaniy,0,0,1,0,0,1,2.321928,0.000000,0.000000,1.215323,0.0,0.000000,0.000000
aaooooright,0,1,0,0,0,1,2.321928,0.000000,1.215323,0.000000,0.0,0.000000,1.215323
aathilov,3,0,0,0,0,3,0.736966,-0.880662,-0.000000,-0.000000,-0.0,-0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ì,17,11,82,22,47,179,-5.161888,,,,,,0.000000
ìll,1,0,0,0,1,2,1.321928,0.402644,0.000000,0.000000,0.0,0.402644,0.000000
ì©,0,0,0,0,3,3,0.736966,-0.000000,-0.000000,-0.000000,-0.0,-0.880662,0.000000
ìï,8,2,18,9,16,53,-3.405992,,,,,,0.000000


In [76]:
similarity_index = similarity_index.dropna(0)
similarity_index

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Group0,Group1,Group2,Group3,Group4,document_frequency,inverse_document_frequency,tf_idf_Group0,tf_idf_Group1,tf_idf_Group2,tf_idf_Group3,tf_idf_Group4,query_tf_idf
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
aaniy,0,0,1,0,0,1,2.321928,0.000000,0.000000,1.215323,0.0,0.000000,0.000000
aaooooright,0,1,0,0,0,1,2.321928,0.000000,1.215323,0.000000,0.0,0.000000,1.215323
aathilov,3,0,0,0,0,3,0.736966,-0.880662,-0.000000,-0.000000,-0.0,-0.000000,0.000000
aathiwh,0,3,0,0,0,3,0.736966,-0.000000,-0.880662,-0.000000,-0.0,-0.000000,0.000000
abbey,1,0,0,0,0,1,2.321928,1.215323,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
åômorrow,0,0,1,0,0,1,2.321928,0.000000,0.000000,1.215323,0.0,0.000000,0.000000
åôrent,0,0,1,0,0,1,2.321928,0.000000,0.000000,1.215323,0.0,0.000000,0.000000
ìll,1,0,0,0,1,2,1.321928,0.402644,0.000000,0.000000,0.0,0.402644,0.000000
ì©,0,0,0,0,3,3,0.736966,-0.000000,-0.000000,-0.000000,-0.0,-0.880662,0.000000


In [77]:
def cosine_similarity(vocab_index, document_index, query_scores):
    """
    Calculates cosine similarity between the documents and query
    
    parameter:
        
        vocab_index: DataFrame.
        DataFrame containing tf-idf score per term for every document and for the query terms.
        
        document_index: list.
        List of document ids.
        
        query_scores: str.
        Column name in DataFrame containing query term tf-idf scores.
        
    returns:
        cosine_scores: Series.
        Cosine similarity scores of every document.
    """
    cosine_scores = {}
    
    query_scalar = np.sqrt(sum(vocab_index[query_scores] ** 2))
    
    for doc in document_index:
        
        doc_scalar = np.sqrt(sum(vocab_index[doc] ** 2))
        dot_prod = sum(vocab_index[doc] * vocab_index[query_scores])
        cosine = (dot_prod / (query_scalar * doc_scalar))
        
        cosine_scores[doc] = cosine
        
    return pd.Series(cosine_scores)

In [78]:
cosines = cosine_similarity(similarity_index, grupindf.groupID.values, 'query_tf_idf')
cosines

Group0    0.000000
Group1    0.027929
Group2    0.000000
Group3    0.000000
Group4    0.019389
dtype: float64

In [79]:
def retrieve_index(data,cosine_scores, document_index):
    """
    Retrieves indices for the corresponding document cosine scores
    
    parameters:
        data: DataFrame.
        DataFrame containing document ids and text.
        
        cosine_scores: Series.
        Series containing document cosine scores.
        
        document_index: str.
        Column name containing document ids in data.
        
    returns:
        data: DataFrame.
        Original DataFrame with cosine scores added as column.
    """
    
    data = data.set_index(document_index)
    data['scores'] = cosine_scores

    import numpy as np

    data = data.loc[~(data==0).all(axis=1)]
    
    np.count_nonzero(df, axis=1)
    return data.reset_index().sort_values('scores',ascending=False).head()

In [80]:
indices = retrieve_index(grupindf, cosines, 'groupID')
indices

Unnamed: 0,groupID,text,scores
1,Group1,iû÷m go tri month ha ha joke crack laughfffff...,0.027929
4,Group4,dun say ear hor alreadi say grayish_brown pron...,0.019389
0,Group0,oh kim watch Ohio vigilserious spell name go n...,0.0
2,Group2,go jurong point crazi avail bugi n great world...,0.0
3,Group3,lol alway convinck tell anyth one_thousand sep...,0.0
