In [34]:
from imp import reload
import gensim
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib as mpl
import nltk,re,pprint
import sys,glob,os
import operator, string, argparse, math

# class to read and preprocess data
class dataProcessor:
    def __init__(self, fname, keepFactors):
        #keep_factors = ['Job Description', 'Company Name', 'Industry']
        self.dataInitial = pd.read_csv(fname, encoding="latin")
        if keepFactors:
            self.dataInitialSmall = self.dataInitial[['Job Description', 'Company Name', 'Industry']]
        else:
            self.dataInitialSmall = None

    # pipeline for purifying the text, write-pipeline, so just output filename can be provided
    def rem_stop_punct(self,originalText, ofilename):
        splittedText = originalText.split()
        lenl = len(splittedText)
        #print("Length is: ",lenl, splittedText[:5])
        ofile = open(ofilename,'a')
        
        for r in range(lenl):
            linex = splittedText[r]
            linex2 = "".join(c for c in linex if c not in ('!','.',':',',','?',';','``','&','-','"','(',')','[',']','0','1','2','3','4','5','6','7','8','9'))
            linex3 = linex2.split()
            #prog=(r+1)/len(rawlines)
            for s in range(len(linex3)):
                noword = linex3[s].lower()
                if noword not in self.swords:
                    ofile.write(noword)
                    ofile.write(" ")

# primary tf-idf class
class flingTFIDF:
    def __init__(self,data,cname):
        self.idfMatrix = {}
        self.distanceMatrix = {}
        self.termsforIDF = []
        self.cname = cname
        self.data = data
        self.lenv = len(self.data)
        self.swords = set(stopwords.words('english'))

    def drawProgressBar(self,percent, barLen = 50):			#just a progress bar so that you dont lose patience
        sys.stdout.write("\r")
        progress = ""
        for i in range(barLen):
            if i<int(barLen * percent):
                progress += "="
            else:
                progress += " "
        sys.stdout.write("[ %s ] %.2f%%" % (progress, percent * 100))
        sys.stdout.flush()

    def rem_stop_punct(self,originalText):
        splittedText = originalText.split()
        lenl = len(splittedText)
        wordFiltered = []
        tSent = []
        for r in range(lenl):
            wordx_1 = splittedText[r]
            wordx_2 = "".join(c for c in wordx_1 if c not in ('!','.',':',',','?',';','``','&','-','"','(',')','[',']','0','1','2','3','4','5','6','7','8','9')) 
            sWord = wordx_2.lower()
            if sWord not in self.swords:
                tSent.append(sWord)
        return " ".join(tSent)
        
    def smartTokenizeColumn(self):
        self.stopsRemoved = []
        for index, row in self.data.iterrows():
            prog=(index+1)/self.lenv
            originText = row[self.cname]
            sentx = self.rem_stop_punct(originText)
            self.drawProgressBar(prog)
            self.data.loc[index,'stopsRemoved'] = sentx
        self.cname = 'stopsRemoved'
        
    def getTF(self):
        print("\nAdding term frequency column based on",self.cname)
        tfMatrixList = []
        for index, row in self.data.iterrows():
            words_in_column = row[self.cname].split()
            if len(words_in_column)!=0:
                counts_all = Counter(words_in_column)
                words, count_values = zip(*counts_all.items())
                values_sorted, words_sorted = zip(*sorted(zip(count_values, words), key=operator.itemgetter(0), reverse=True))
                tfMatrixList.append(pd.DataFrame({'word': words_sorted, 'tf': values_sorted}))
                #self.data.loc[index,'tfMatrix'] = countdf
            else:
                #self.data.loc[index,'tfMatrix'] = pd.DataFrame(columns = ['word','tf'])
                tfMatrixList.append(pd.DataFrame(columns = ['word','tf']))
            prog=(index+1)/self.lenv
            self.drawProgressBar(prog)
        self.data['tfMatrix'] = tfMatrixList
        
    def getTFIDF(self):
        print("\nComputing and adding TF-IDF column based on",self.cname)
        for index, row in self.data.iterrows():
            tfmatrixThisrow = row['tfMatrix']
            tempTFIDF = []
            for indx, rwx in tfmatrixThisrow.iterrows():
                trmx = rwx['word']
                tfx = rwx['tf']
                idfx = self.idfMatrix[trmx]
                tfidfx = tfx*idfx
                tempTFIDF.append(tfidfx)
                #tfmatrixThisrow.loc[index,'tf-idf'] = tfidfx
            tfmatrixThisrow['tf-idf'] = tempTFIDF
            #sumtfidf = tfmatrixThisrow['tf-idf'].sum() 
            prog=(index+1)/self.lenv
            self.drawProgressBar(prog)
                
    def computeIDFlistofterms(self):
        totalwords = 0
        print("\nComputing list of words for IDF...\n")
        for index, row in self.data.iterrows():
            words_in_column = set(row[self.cname].split())  
            for word in words_in_column:
                if word not in self.termsforIDF:
                    self.termsforIDF.append(word)
                    totalwords+=1
        print("Created list of terms for IDF matrix with", totalwords," terms.")     
        
    def getIdf(self,term):
        countPresentDocs = 0
        lenidf = len(self.termsforIDF)
        for i in range(lenidf):
            tfx = self.getTermFreq(i,term)
            if tfx>0:
                countPresentDocs+=1
            prog=(i+1)/lenidf
            self.drawProgressBar(prog)
        return countPresentDocs
        
    def computeIDFmatrix(self):
        self.computeIDFlistofterms()
        print("\nComputing global IDF matrix...\n")
        for term in self.termsforIDF:
            self.idfMatrix[term]=0
        for index, row in self.data.iterrows():
            listofterms = list(self.data['tfMatrix'][index]['word'])
            for term in listofterms:
                self.idfMatrix[term]=self.idfMatrix[term]+1
            prog=(index+1)/self.lenv
            self.drawProgressBar(prog)
        for term in self.termsforIDF:
            idfx = self.idfMatrix[term]          
            idfy = self.lenv/float(1+idfx)
            idfz = math.log(idfy,10)
            self.idfMatrix[term] = idfz
            
    def showData(self):
        print(self.data['tfMatrix'])
        
    def createDistanceMetadata(self):
        #sumList = []
        for index, row in self.data.iterrows():
            tfmatrixThisrow = row['tfMatrix']
            sumTFIDF = tfmatrixThisrow['tf-idf'].sum()
            #sumList.append({'sumTFIDF':sumTFIDF})
            self.data.loc[index,'sumTFIDF'] = sumTFIDF
            prog=(index+1)/self.lenv
            self.drawProgressBar(prog)
              
    def distanceBtnTwoDocs(self, docId_1, docId_2):
        listWords_1 = set(list(self.data['tfMatrix'][docId_1]['word']))
        listWords_2 = set(list(self.data['tfMatrix'][docId_2]['word']))
        common = listWords_1.intersection(listWords_2)
        diff1_2 = listWords_1.difference(listWords_2)
        diff2_1 = listWords_2.difference(listWords_1)
        sumwt1 = self.data['sumTFIDF'][docId_1]
        sumwt2 = self.data['sumTFIDF'][docId_2]
        score_common, score_doc1, score_doc2 = 0,0,0
        for word_c in common:
            score_1 = float(self.data['tfMatrix'][docId_1].loc[self.data['tfMatrix'][docId_1]['word'] == word_c]['tf-idf'])
            score_2 = float(self.data['tfMatrix'][docId_2].loc[self.data['tfMatrix'][docId_2]['word'] == word_c]['tf-idf'])
            score_common += abs(score_1/float(sumwt1) - score_2/float(sumwt2))
        for word_d12 in diff1_2:
            score_1 = float(self.data['tfMatrix'][docId_1].loc[self.data['tfMatrix'][docId_1]['word'] == word_d12]['tf-idf'])
            score_doc1 += score_1/float(sumwt1)
        for word_d21 in diff2_1:
            score_2 = float(self.data['tfMatrix'][docId_2].loc[self.data['tfMatrix'][docId_2]['word'] == word_d21]['tf-idf'])
            score_doc2 += score_2/float(sumwt2)
        score_total = score_common + score_doc1 + score_doc2
        return(score_total)
    
    def computeDistanceBtnAllDocs(self):
        for j in range(100):
            for k in range(10):
                numx = j*10+k
                dist = self.distanceBtnTwoDocs(j,k)
                self.distanceMatrix[(j,k)] = dist
                prog=(numx+1)/1000
                self.drawProgressBar(prog)
                
        print(self.distanceMatrix[:10])
    
    def writeToFile(self,fname):
        self.data.to_csv(fname)

In [95]:
import matplotlib as mpl
from imp import reload
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk,re,pprint
import sys,glob,os
import operator, string, argparse, math, random, statistics
import matplotlib.pyplot as plt
from sklearn import metrics

class flingPretrained:
    def __init__(self,data):
        self.data = data
        self.nDocs = len(self.data)
        self.nDocsTest = 0
        self.allDistances = {}
        self.groupedCharacteristic = {'glove' : None, 'vec_tfidf-doc2vec' : None, 'vec_tfidf-glove' : None, 'doc2vec' : None}
        self.wordVecModel = {'glove':None, 'doc2vec':None}
        print("\nDBSCAN initialized!\n")
        
    def loadPretrainedWordVectors(self,vecType):
        if vecType == 'glove':
            self.wordVecModel['glove'] = self.loadGloveModel()
            print("GloVe Vectors Loaded!\n") 

    def loadGloveModel(self):
        print("Loading Glove Model\n")
        try:
            f = open('../datasets/glove.6B/glove.6B.50d.txt','r')
        except:
            f = open('datasets/glove.6B/glove.6B.50d.txt','r')
        gloveModel = {}
        for line in f:
            splitLines = line.split()
            word = splitLines[0]
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
        print(len(gloveModel)," words loaded!\n")
        return(gloveModel)
    
    def getDocVector(self,doc_Id):
        gvl=self.getGloveVectorList(listx)
        glove_dv = np.mean(gvl,axis=0)
        return(glove_dv)
    
    def addDocumentGloveVector(self):
        vecL = []
        for indx in range(self.nDocs):
            listWords_1 = set(list(self.data['tfMatrix'][int(indx)]['word']))
            gvl=self.getGloveVectorList(listWords_1)
            vecL.append(np.mean(gvl,axis=0))
        self.data['glove-vector'] = vecL

    # distance between two documents using TF-IDF
    def distanceBtnTwoDocs(self, docId_1, docId_2):
        listWords_1 = set(list(self.data['tfMatrix'][int(docId_1)]['word']))
        listWords_2 = set(list(self.data['tfMatrix'][int(docId_2)]['word']))
        common = listWords_1.intersection(listWords_2)
        diff1_2 = listWords_1.difference(listWords_2)
        diff2_1 = listWords_2.difference(listWords_1)
        sumwt1 = self.data['sumTFIDF'][docId_1]
        sumwt2 = self.data['sumTFIDF'][docId_2]
        score_common, score_doc1, score_doc2 = 0,0,0
        #print(len(common),len(diff1_2),len(diff2_1))
        for word_c in common:
            score_1 = float(self.data['tfMatrix'][docId_1].loc[self.data['tfMatrix'][docId_1]['word'] == word_c]['tf-idf'])
            score_2 = float(self.data['tfMatrix'][docId_2].loc[self.data['tfMatrix'][docId_2]['word'] == word_c]['tf-idf'])
            score_common += abs(score_1/float(sumwt1) - score_2/float(sumwt2))
        for word_d12 in diff1_2:
            score_1 = float(self.data['tfMatrix'][docId_1].loc[self.data['tfMatrix'][docId_1]['word'] == word_d12]['tf-idf'])
            score_doc1 += score_1/float(sumwt1)
        for word_d21 in diff2_1:
            score_2 = float(self.data['tfMatrix'][docId_2].loc[self.data['tfMatrix'][docId_2]['word'] == word_d21]['tf-idf'])
            score_doc2 += score_2/float(sumwt2)
        score_total = score_common + score_doc1 + score_doc2
        return(score_total)
    
    #get gloVe vectors for all words in the document
    def getGloveVectorList(self,listx):
        vecList = []
        nf = []
        for w in listx:
            try:
                vecList.append(self.wordVecModel['glove'][w])
            except:
                nf.append(w)
                #print(w,"not found in glove model!")
                continue        
        if len(vecList)==0:
            return([[0]*50])
        vecArray = np.stack(vecList, axis=0)
        return vecArray
    
    #document vector is the average of all the word vectors gloVe
    def getDocVector(self,listx):
        gvl=self.getGloveVectorList(listx)
        glove_dv = np.mean(gvl,axis=0)
        return(glove_dv)
    
    def getGloveDistance(self,docId_1,docId_2,method):
        #listWords_1 = set(list(self.data['tfMatrix'][int(docId_1)]['word']))
        #listWords_2 = set(list(self.data['tfMatrix'][int(docId_2)]['word']))
        listWords_1 = set(list(self.data['tfMatrix'].iloc[int(docId_1)]['word']))
        listWords_2 = set(list(self.data['tfMatrix'].iloc[int(docId_2)]['word']))
        if method == 'average':
            dv_1 = self.getDocVector(listWords_1)
            dv_2 = self.getDocVector(listWords_2)
            #print("dv_1",dv_1)
            #print("dv_2",dv_2)
            dist = np.linalg.norm(dv_1-dv_2)
            return dist
              
    def drawProgressBar(self, percent, barLen = 50):			#just a progress bar so that you dont lose patience
        sys.stdout.write("\r")
        progress = ""
        for i in range(barLen):
            if i<int(barLen * percent):
                progress += "="
            else:
                progress += " "
        sys.stdout.write("[ %s ] %.2f%%" % (progress, percent * 100))
        sys.stdout.flush()	

    #sample distance between n random documents 
    def getDistanceDistribution(self,numx,method):
        numHalf = int(numx/2)
        doca,docb = [],[]
        for i in range(numHalf):
            doca.append(random.randint(1,1026))
            docb.append(random.randint(1027,2053))
        distanceSample = []
        total = numHalf*numHalf
        for doc_1 in range(len(doca)):
            for doc_2 in range(len(docb)):
                if method == 'glove':
                    distanceSample.append(self.getGloveDistance(doca[doc_1],docb[doc_2],'average'))
                else:
                    distanceSample.append(self.getGloveDistance(doca[doc_1],docb[doc_2],'average'))
                cov = doc_1*numHalf + doc_2
                prog=(cov+1)/total
                self.drawProgressBar(prog)
        pltx = plot.hist(distanceSample,bins=20)
        return(pltx)
    
    def getGloveScore(self,w):
        try:
            return(self.wordVecModel['glove'][w])
        except:
            return([0*50]) 
    
    def doctfidf2vec(self,docId,mode):
        docVecList = []
        listWords = list(self.data['tfMatrix'][int(docId)]['word'])
        if mode == "tf-only":
            scores = list(self.data['tfMatrix'][int(docId)]['tf'])
        elif mode == "tf-idf":
            scores = list(self.data['tfMatrix'][int(docId)]['tf-idf'])
        lenW =len(listWords)
        gloveScores = [self.getGloveScore(el) for el in listWords]
        for j in range(lenW):
            temp = [float(scores[j])]*50
            #gloveScores[j]
            res = [a*b for (a,b) in zip(temp,gloveScores[j])]
            if len(res)==1:
                continue;
            else:
                docVecList.append(res)            
        #print([len(el) for el in docVecList])
        #vecArray = np.stack(docVecList, axis=0)
        return(np.mean(docVecList,axis=0))
    
    def createGroupedCharacteristics(self,column):
        self.dataTrain.groupby([column])
        print("\nComputing groupCharacteristics for GloVe!")
        self.groupedCharacteristic['glove-vector'] = self.dataTrain.groupby([column])['glove-vector'].apply(np.average).to_frame()
        print("\nComputing groupCharacteristics for doc2vec!")
        self.groupedCharacteristic['doc2vec'] = self.dataTrain.groupby([column])['doc2vec'].apply(np.average).to_frame()
        #print("\nComputing groupCharacteristics for tfidf-doc2vec!")
        #self.groupedCharacteristic['vec_tfidf-doc2vec'] = self.dataTrain.groupby([column])['vec_tfidf-doc2vec'].apply(np.average).to_frame()
        print("\nComputing groupCharacteristics for tfidf-GloVe!")
        self.groupedCharacteristic['vec_tfidf-glove'] = self.dataTrain.groupby([column])['vec_tfidf-glove'].apply(np.average).to_frame()
       
    def getNearestGroup(self,vec,vectorName):
        minDist = math.inf
        minGroup = None
        for colx in fdb.groupedCharacteristic[vectorName].index.values:
            vecy = fdb.groupedCharacteristic[vectorName].loc[colx].to_numpy(dtype=object)
            #distx = np.linalg.norm(vec-vecy)
            is_all_zero = np.all((vecy == 0.0))
            if not is_all_zero:
                distx = np.linalg.norm(scipy.spatial.distance.euclidean(vec,vecy))
            else:
                distx = np.linalg.norm(vec)
            print(distx)
            if distx<minDist:
                minDist = distx
                minGroup = colx                 
        return minGroup
    
    def splitTestTrain(self):
        mPt = int(self.nDocs*0.7)
        self.dataTrain = self.data[:mPt]
        self.dataTest = self.data[mPt:]
        self.nDocsTest = len(self.dataTest)
               
    def addVectorComputedGroup(self,vectorName,groupName):
        computedGroups = []
        for docId in range(self.nDocsTest):
            computedGroup = self.getNearestGroup(self.dataTest[vectorName].iloc[docId],vectorName)
            computedGroups.append(computedGroup)           
        self.dataTest[groupName] = computedGroups
        
        
    def getAccuracy(self,compareWith,vecName):
        countCorrect = 0
        for d in range(self.nDocsTest):
            if self.dataTest[vecName].iloc[d] == self.dataTest[compareWith].iloc[d]:
                countCorrect+=1
        print("Accuracy of",vecName,countCorrect/self.nDocsTest*100,"%")
            
    def tfidf2vec(self,mode,method):
        vecL = []
        if mode == 'tf-only':
            columnName = 'vec_tf-' + method
            print("\nComputing column:",columnName)
            for indx in range(self.nDocs):
                gvl=self.doctfidf2vec(indx,'tf-only')
                vecL.append(gvl)
                prog=(indx+1)/self.nDocs
                self.drawProgressBar(prog)
        else:
            columnName = 'vec_tfidf-' + method
            print("\nComputing column:",columnName)
            for indx in range(self.nDocs):
                gvl=self.doctfidf2vec(indx,'tf-idf')
                vecL.append(gvl)
                prog=(indx+1)/self.nDocs
                self.drawProgressBar(prog)
        self.data[columnName] = vecL

class vectorize:
    def __init__(self,data,factorName):
        self.data = data
        self.dataNew = []
        self.model = None
        self.swords = set(stopwords.words('english'))
        self.factorName = factorName
        for docId in range(len(self.data)):
            dv_1 = self.data[factorName][int(docId)]
            self.dataNew.append(dv_1)
        self.nDocs = len(self.dataNew)
        print(self.nDocs,"documents added!")
        
    def rem_stop_punct(self,originalText):
        splittedText = originalText.split()
        lenl = len(splittedText)
        wordFiltered = []
        tSent = []
        for r in range(lenl):
            wordx_1 = splittedText[r]
            wordx_2 = "".join(c for c in wordx_1 if c not in ('!','.',':',',','?',';','``','&','-','"','(',')','[',']','0','1','2','3','4','5','6','7','8','9')) 
            sWord = wordx_2.lower()
            if sWord not in self.swords:
                tSent.append(sWord)
        return tSent

    def tagged_document(self,list_of_list_of_words):
        for i, list_of_words in enumerate(list_of_list_of_words):
            yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

    def trainDocVectors(self):
        print("\nTraining doc2vec model.")
        self.data_for_training = list(self.tagged_document(self.dataNew))
        self.model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=30)
        self.model.build_vocab(self.data_for_training)
        self.model.train(self.data_for_training, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        return(self.model)
        
    def addDocVectors(self):
        print("\nAdding doc2vec vectors to dataset.")
        docVectors = []
        for docId in range(len(self.data)):
            docVectors.append(self.model.infer_vector(self.rem_stop_punct(self.data[self.factorName][int(docId)])))
        self.data['doc2vec'] = docVectors
        
class flingDBSCAN:
    def __init__(self,data,epsilon,minPts,method):
        self.data = data
        self.method = method
        self.minPts = minPts
        self.noisePts = []
        self.nDocs = len(self.data)
        self.clusterCharacteristic = None 
        self.clusterIndex = 0 
        self.clusterCount = 0 
        self.clusterLabel = "computedCluster"
        print("\nflingDBSCAN initialized!\n")
        self.clusterMetadata = {}
        for i in range(self.nDocs):
            self.clusterMetadata[i] = None
        if epsilon:
            self.epsilon = epsilon
        else:
            if method == 'glove':
                self.epsilon = self.getBestDistance('glove')
                print("\nBest epsilon computed on GLOVE =",self.epsilon,"\n")
            else:
                self.epsilon = self.getBestDistance('tfidf')
                print("\nBest epsilon computed on GLOVE-TFIDF =",self.epsilon,"\n")
            
    def getBestDistance(self,method):
        numx = 100
        numHalf = int(numx/2)
        doca,docb = [],[]
        print("computing best distance")
        for i in range(numHalf):
            doca.append(random.randint(1,int(self.nDocs/2)))
            docb.append(random.randint(int(self.nDocs/2)+1,self.nDocs))
        distanceSample = []
        total = numHalf*numHalf
        for doc_1 in range(len(doca)):
            for doc_2 in range(len(docb)):
                if method == 'glove':
                    distanceSample.append(self.getDistance(doc_1,doc_2,'glove'))
                else:
                    distanceSample.append(self.getDistance(doc_1,doc_2,'tfidf'))
                cov = doc_1*numHalf + doc_2
                prog=(cov+1)/total
                self.drawProgressBar(prog)
        plt.show(plt.hist(distanceSample,bins=20))
        return statistics.mean(distanceSample)
            
    def assignLabel(self,dictDist,label):
        for el in dictDist:
            self.clusterMetadata[el]=label
            
    def printClusterInfo(self):
        print("Cluster characteristics:")
        print(" -- vectors:",self.method)
        print(" -- minPts:",self.minPts)
        print(" -- EstimatedBestDistance",self.epsilon)
        print(" --",self.clusterCount,"clusters formed!")
        print(" --",self.nDocs-len(self.noisePts),"points assigned to clusters!") 
        print(" --",len(self.noisePts),"noise points!\n")
        noisePc = len(self.noisePts)/self.nDocs*100
        print(" --",noisePc,"% noise!\n")
            
    def printClusterMetadata(self,n):
        for j in range(n):
            print(j, self.clusterMetadata[j])
         
    # range query equivalent function
    def findNeighborOf(self,ptIndex,method):
        distance = {}      
        #first vector
        if method == 'glove':
            dv_1 = self.data['glove-vector'][int(ptIndex)] 
        elif method == 'tfidf':
            dv_1 = self.data['tfidf2vec-tfidf'][int(ptIndex)]
        
        #iterating over the whole data for the second vector 
        if method == 'tfidf':
            for j in range(self.nDocs):
                dv_2 = self.data['tfidf2vec-tfidf'][j]
                if j!=ptIndex:
                    distx = self.getDistance(ptIndex,j,'tfidf')
                    distance[j] = distx
        elif method == 'glove':
            for j in range(self.nDocs):
                dv_2 = self.data['glove-vector'][j]
                if j!=ptIndex:
                    distx = self.getDistance(ptIndex,j,'glove')
                    distance[j] = distx
        
        # keeping only elements at a distnce of less than epsilon
        tempDistances = {key:value for (key,value) in distance.items() if value<self.epsilon}
        newDistances = {key:value for (key,value) in tempDistances.items() if self.clusterMetadata[key]==None}
        # keeping the cluster only if we 
        if len(newDistances)>self.minPts:    
            return newDistances.keys()
        else:
            return None
            
    def dbscanCompute(self):
        print("\ninitiating DBSCAN Clustering with",self.method,"vectors\n")
        self.clusterMetadata[0]='cluster_0_'
        for k in range(self.nDocs):
            if not self.clusterMetadata[k]:
                if self.method=='glove':
                    neighbors = self.findNeighborOf(k,'glove')
                else:
                    neighbors = self.findNeighborOf(k,'tfidf')
                if neighbors:
                    self.clusterCount+=1
                    clusterName = "cluster_" + str(self.clusterCount)+"_"
                    self.clusterMetadata[k] = clusterName
                    
                    # neighboring points of original point
                    for nbPoint in neighbors:
                        if not self.clusterMetadata[nbPoint]:
                            self.clusterMetadata[nbPoint] = clusterName
                    if self.method=='glove':
                        innerNeighbors = self.findNeighborOf(k,'glove')
                    else:
                        innerNeighbors = self.findNeighborOf(k,'tfidf')
                    if innerNeighbors:
                        for nb in innerNeighbors:
                            self.clusterMetadata[nb] = clusterName
                            neighbors.append(nb)                          
                    print("\n ---- ",clusterName,"assigned to",len(neighbors),"points! ----")
                else:
                    self.noisePts.append(k)
            prog=(k+1)/self.nDocs
            self.drawProgressBar(prog)
        print("\n",self.clusterCount,"clusters formed!")

            
    def getDistance(self,docId_1,docId_2,method):
        if method == 'glove':
            dv_1 = self.data['glove-vector'][int(docId_1)]
            dv_2 = self.data['glove-vector'][int(docId_2)]
        elif method == 'tfidf':
            dv_1 = self.data['tfidf2vec-tfidf'][int(docId_1)]
            dv_2 = self.data['tfidf2vec-tfidf'][int(docId_2)]           
        dist = np.linalg.norm(dv_1-dv_2)
        return dist
    
    def addClusterLabel(self,label):
        self.clusterLabel = label
        vec = []
        for el in self.clusterMetadata.keys():
            vec.append(self.clusterMetadata[el])
        self.data[label] = vec
        
    def getNearestGroup(self,vec):
        minDist = 100
        minGroup = None
        for colx in fdb.groupedCharacteristic.columns:
            vecy = fdb.groupedCharacteristic[colx]['glove-vector']
            distx = np.linalg.norm(vec-vecy)
            if distx<minDist:
                minDist = distx
                minGroup = colx
        return minGroup
    
    def addClusterMajorityLabel(self):
        clusterMap = {}
        for docId in range(self.nDocs):
            computedGroup = self.getNearestGroup(self.data['glove-vector'][int(docId)])
            clID = self.data['glove-vector'][int(docId)]
            if clID not in self.clusterCharacteristic:
                self.clusterCharacteristic[clID]=[computedGroup]
            else:
                self.clusterCharacteristic[clID].append(computedGroup)
            prog=(docId+1)/self.nDocs
            self.drawProgressBar(prog)
        for k in self.clusterCharacteristic.key():
            res = statistics.mode(self.clusterCharacteristic[k])
            clusterMap[k] = res
        return(clusterMap)

    def addVectorComputedGroup(self,vecName,factorName):
        computedGroups = []
        for docId in range(self.nDocs):
            computedGroup = self.getNearestGroup(self.data[vecName][int(docId)])
            computedGroups.append(computedGroup)
        self.data[factorName] = computedGroups
              
    def getAccuracy(self,compareWith):
        countCorrect = 0
        for d in range(self.nDocs):
            if self.data['characteristicGroup'][d] == self.data[compareWith][d]:
                countCorrect+=1
        print("Accuracy:",countCorrect/self.nDocs*100,"%")
        
    def evaluateClusterPerformance(self,compareWith):
        return(metrics.adjusted_rand_score(self.data['characteristicGroup'],self.data[compareWith]))
        
    def drawProgressBar(self, percent, barLen = 50):			#just a progress bar so that you dont lose patience
        sys.stdout.write("\r")
        progress = ""
        for i in range(barLen):
            if i<int(barLen * percent):
                progress += "="
            else:
                progress += " "
        sys.stdout.write("[ %s ] %.2f%%" % (progress, percent * 100))
        sys.stdout.flush()	

In [36]:
for key in fdb.groupedCharacteristic.keys():
    print(fdb.groupedCharacteristic[key])
    
fdb.groupedCharacteristic

None
None
None
None


{'glove': None,
 'vec_tfidf-doc2vec': None,
 'vec_tfidf-glove': None,
 'doc2vec': None}

In [37]:
#from fling import utilities as ut
#from fling import tfidfModule as tfm
os.chdir("/Users/arnabborah/Documents/repositories/fling/")
spamtm = dataProcessor("datasets/spamTextMessages.csv",None)

In [38]:
# primary distance module run
ftf = flingTFIDF(spamtm.dataInitial,'Message')

In [39]:
ftf.smartTokenizeColumn()
ftf.getTF()
ftf.computeIDFmatrix()
ftf.getTFIDF()
ftf.createDistanceMetadata()

Adding term frequency column based on stopsRemoved
Computing list of words for IDF...

Created list of terms for IDF matrix with 8780  terms.

Computing global IDF matrix...

Computing and adding TF-IDF column based on stopsRemoved

In [96]:
# FRESH START
import gensim

#dataProcessed = pd.read_pickle('datasets/data_tfidf_processed.pkl')
fdb = flingPretrained(ftf.data)

#adding pretrained glove vectors 
fdb.loadPretrainedWordVectors('glove')
fdb.addDocumentGloveVector()

#traininf and adding doc2vec vectors
vecc = vectorize(fdb.data,'Message')
fdb.wordVecModel['doc2vec'] = vecc.trainDocVectors()
vecc.addDocVectors()

#adding combo vectors with tfidf and (glove + doc2vec) for inter sentence semantic information addition
fdb.tfidf2vec('tf-idf','glove')
#fdb.tfidf2vec('tf-idf','doc2vec')
fdb.splitTestTrain()

# train group characteristics on column 'category' and predict vector based category, and compute error
fdb.createGroupedCharacteristics('Category')
fdb.addVectorComputedGroup('glove-vector','cGroup_glove')
fdb.addVectorComputedGroup('doc2vec','cGroup_doc2vec')
fdb.addVectorComputedGroup('vec_tfidf-glove','cGroup_tfidf-glove')
#fdb.addVectorComputedGroup('vec_tfidf-doc2vec','cGroup_tfidf-doc2vec')
fdb.getAccuracy('Category','cGroup_glove')
fdb.getAccuracy('Category','cGroup_doc2vec')
fdb.getAccuracy('Category','cGroup_tfidf-glove')
#fdb.getAccuracy('Category','cGroup_tfidf-doc2vec')


DBSCAN initialized!

Loading Glove Model

400000  words loaded!

GloVe Vectors Loaded!

5572 documents added!

Training doc2vec model.

Adding doc2vec vectors to dataset.

Computing column: vec_tfidf-glove
Computing groupCharacteristics for GloVe!

Computing groupCharacteristics for doc2vec!

Computing groupCharacteristics for tfidf-GloVe!
23.842085671998344
23.26820166620397
36.8530727811163
36.44960576253381
33.15974431816349
32.764654369755895
34.865813162358194
34.46626404515776
26.217605482595943
25.697977452522522
26.380960199370094
25.851855688211916
29.09313597224676
28.636361703254583
32.173612312082824
31.76229852020855
33.00738069885386
32.614916584489904
29.38157065931997
28.913447370954657
29.54188426678488
29.097070539253632
26.92447491817617
26.424754356706654
26.819898933304714
26.2818825216383
30.8058392985419
30.364799442888287
30.223958835908263
29.782874295144772
26.35055218465449
25.831671543453787
26.123026409828913
25.604831878057965
32.69973461186322
32.3082412




27.997943198285412
27.513783726268453
32.62649439397131
32.20994631231925
24.721355154274644
24.160973638149557
18.928588191222904
18.17949493229047
23.915614227206582
23.342991091765597
23.137863273411025
22.556982302294358
29.62491706330293
29.1514119861465
29.784262305546882
29.32349040842932
29.444964816884386
28.98928674744534
32.46446855071046
32.03749877693367
28.97138729486754
28.495717529491873
28.074146257894142
27.588989020199016
24.604026284732324
24.049277570176294
31.135007471168326
30.69106105049564
31.637440404472578
31.205815581961275
38.24028709886667
37.88770336628334
27.907113384122
27.422552658434487
31.85670714087594
31.42480895074534
30.15706190473097
29.701558129863606
29.572662939595894
29.12967477540533
26.710046203075656
26.21186686495803
26.26756906838937
25.75166523638961
35.26419515796873
34.901884874455114
28.772658901817262
28.301726134755768
34.6713032395263
34.285856527697426
32.98114484341353
32.574764142380516
29.8325543519848
29.37739129993499
30.1

31.753553296054523
31.321861896969963
35.64814739945842
35.24359736108036
26.755313367675026
26.25845495662668
35.4161582452286
35.03708591125257
30.955555571310065
30.516479528333967
33.06955884397902
32.65677631510656
28.183660940393402
27.698834648461627
29.329150815766837
28.85578591705402
30.67372736084962
30.230225769987268
23.810532618649415
23.239500321324993
25.974609426014776
25.444425597644596
30.338919995662508
29.892548389302213
31.558154580780243
31.12547256747525
28.851178735264458
28.368763935549623
36.8530727811163
36.44960576253381
30.039793706407792
29.588197382618926
32.7139844684255
32.2895719911379
29.532592440929708
29.071368663425044
27.31633521400123
26.82263412191122
30.63957580137473
30.200833270102287
30.46345505103788
30.014237866170955
34.552090188961145
34.147427923935204
26.832806888994728
26.319790373704368
27.218033274551168
26.720456178712062
22.78954644690734
22.178061347029836
34.96310025493274
34.582706421795756
29.424551301634086
28.96038062174436

31.879757845241002
27.311527470031216
26.829786381639778
25.389605131322885
24.858866898362805
24.910154312268585
24.366809727742332
36.48864708012331
36.09124991086047
30.984661580187378
30.552384027593156
25.867011006954954
25.341316196452823
27.6821660238093
27.19451515662878
27.06375023349158
26.539675869761265
26.684751035052123
26.182683722295064
27.741398144112082
27.244029483742644
31.329624772952446
30.895820017652625
26.041640390055452
25.507699646306044
34.69618855259001
34.312901465810775
31.387446834195583
30.954412119327976
30.217404053611993
29.762646748242858
37.13128157781693
36.77873538573711
28.77829299350015
28.301580547558434
25.408667565636556
24.880581494134187
31.755415585409846
31.327896969567874
35.10105281737897
34.68071520993513
30.837807319691695
30.383474202310378
39.69858515990458
39.368404748631335
30.38671433523353
29.944790258678687
32.44344626970937
32.03141153003604
35.01284758879324
34.626074340808145
23.01754793269245
22.419090690469286
38.23556795

31.76126717061252
33.25542910163431
32.83801091223797
31.93364067208901
31.514883895701633
27.79622410885015
27.303718036248533
27.645691000429096
27.145991526687023
31.469988871083817
31.034499349242864
26.517250839378917
26.00427700878559
26.46121615144038
25.96879095653169
33.89269755667981
33.497675440860874
29.025118113297083
28.55762226608886
30.019978044985535
29.572943774235224
32.85039254542309
32.439590282171906
27.881449360467546
27.402599651995246
26.44961026921205
25.94160572905435
29.14921248700172
28.687662329292404
32.23693433946119
31.812234521702308
28.266068858414247
27.781028919959486
29.096095466085142
28.613362372698276
29.171915659530008
28.709219618629344
31.023722710218554
30.590090070476254
29.137840715798
28.664691414202192
37.75481138277904
37.403932018171396
33.65694623122521
33.262927264152346
32.94524806702498
32.53098258309199
27.96349563604659
27.467468245426772
31.332780309046694
30.905847761140045
35.28603945997543
34.90800253766231
25.881653833843263

28.518430565632027
32.538976237359755
32.124526654861604
30.658233365149172
30.22303519796613
24.760883529609888
24.206009020062705
27.77598581326188
27.2900664569147
28.233302983924087
27.747013160799025
26.73882899392251
26.24710584993838
30.608005377058635
30.17467774879258
32.111321345097124
31.690977797673206
28.581679516906924
28.101857743402473
29.041068121011886
28.56916600501242
22.61574810408759
22.00005111980515
32.16768768693388
31.759784910737547
37.69172557431821
37.34009475023565
26.222005409405295
25.710110787595735
30.473380562650004
30.018678277857045
26.830801431771135
26.31603790422207
28.33949537383847
27.859807594998934
32.72533138438319
32.31923887042664
30.688270231774165
30.25200128415045
32.09518055872008
31.65929010136111
32.132057076316066
31.694969344539434
26.747707631242502
26.233422013312083
30.35653814708989
29.91664761295725
25.52328108507532
24.9855075883692
25.950845293183754
25.423689261371923
30.20645903414409
29.7555852543901
32.73778079697321
32.

24.23295729594682
24.640869967391936
24.095192540024257
36.62463383482078
36.279919105165675
25.783306270589858
25.245380706528508
31.621524304756292
31.200518378064334
30.795687346020053
30.34901056312539
28.677846251478936
28.205526522232947
29.101530590305664
28.64731735933891
25.657431663010502
25.133388950767284
36.35526615977023
35.99198225834343
35.88061179311758
35.50449892382365
33.019354765919765
32.611305742891766
30.55277629309497
30.098785287538202
27.182376024705764
26.66953833019128
25.87566532248814
25.34822742565913
27.810808536369112
27.342945651147136
36.91055951688543
36.54767161832404
29.971631799293874
29.51596089722302
28.7183174988472
28.244360311989954
35.682091197960546
35.30650082792662
31.867095438208036
31.437628004012662
25.18879319261226
24.647937416501367
31.52834467496331
31.08494722138646
26.288180564997813
25.755881712774322
33.09905128460655
32.724217992678405
27.305847886922884
26.80558665312097
29.854531829117423
29.39783387797237
33.31847264786013

0.65885794
1.5267168
0.68684727
1.5557873
0.6772033
1.5445617
0.66728604
1.5370138
2.9909217
3.2841995
3.24148
3.5291896
0.6482722
1.5237852
0.6830739
1.5511435
2.4861565
2.8155394
0.678516
1.5469736
0.66033953
1.5265727
0.6641802
1.5283303
0.6690202
1.5242753
0.6730821
1.5403923
0.6591812
1.5366576
0.6549159
1.5276239
0.6753527
1.5463657
0.6619576
1.5293787
3.1809647
3.4457426
0.68989843
1.5516144
0.66589546
1.5406411
2.1610458
2.5578694
3.175067
3.4338686
0.6437631
1.5194075
2.3312845
2.6876478
0.6675722
1.5251181
8.63546
8.732061
2.6266556
2.966371
0.6675192
1.5408987
0.6643567
1.5330212
0.6534645
1.5338368
0.6509473
1.5218195
0.6838646
1.544483
0.66694874
1.5218064
3.8268883
4.0329237
0.66844344
1.5317733
0.6748816
1.5291656
0.6668691
1.5262802
4.113797
4.309382
3.536179
3.7668207
1.8903818
2.3311453
0.66573256
1.528343
2.7676597
3.0774214
4.5234838
4.718069
5.8932557
6.0221953
0.67501915
1.5321949
0.66756105
1.5387233
0.6670903
1.5384688
0.67223996
1.532876
0.6502214
1.5236048
6.5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



1.5338131
0.6683765
1.5422198
4.248428
4.4299026
2.903687
3.1761878
0.661328
1.5322832
0.6648938
1.5358363
2.4124756
2.7539723
0.65662223
1.5211625
0.66403055
1.5281435
2.069285
2.4669116
0.6570565
1.5271785
0.67147875
1.5395948
0.6747044
1.5389624
2.2087457
2.5810502
0.6554456
1.5306612
0.6755168
1.5410606
2.8234813
3.1272283
7.556224
7.6541457
0.66746765
1.5362556
0.64969355
1.5131781
0.6682578
1.5296313
0.6757997
1.534034
2.4100585
2.7526765
0.6460375
1.5181655
1.2391576
1.8352219
0.67736745
1.5397125
0.6729486
1.5374744
3.424965
3.6599584
0.66808164
1.5311283
0.6564456
1.528294
2.1279829
2.513191
0.6444512
1.5156407
3.1857107
3.4378033
0.66591024
1.5370744
0.66135436
1.5259874
6.81445
6.940391
0.6618562
1.5303694
3.131892
3.3919995
0.65838516
1.529342
0.6549847
1.5340898
0.6782973
1.5428503
0.66014135
1.5357375
0.6687095
1.5466443
0.64812404
1.5174872
0.64463484
1.5258919
0.66859645
1.5303297
0.67538637
1.5318519
0.6645819
1.5247
0.6834838
1.5447313
3.523418
3.7552369
0.6620992
1.

2.8446608
0.658975
1.5274307
0.6661822
1.5301383
0.68345326
1.5547129
7.348823
7.4601445
2.4346168
2.7822247
2.3551762
2.7058024
0.6693922
1.533879
5.7686973
5.8983665
0.663358
1.5349125
2.7317827
3.0285623
0.6517101
1.5281094
0.66665727
1.5305557
0.6703347
1.5324727
0.6735641
1.5354996
0.66164505
1.5215158
0.68082315
1.5445787
0.67095095
1.5332088
0.6752882
1.538503
2.481137
2.81965
0.66931295
1.5315709
4.228458
4.413574
2.3455505
2.6980102
0.6485903
1.5228037
0.665037
1.51966
7.0337934
7.1747074
0.66400355
1.5350635
0.6683698
1.5362977
0.67059296
1.52278
6.7732663
6.891237
0.6571054
1.5264323
0.6641587
1.5299308
0.66838735
1.5398374
0.65998214
1.5305879
0.6546687
1.5254022
0.65076256
1.522465
0.6782973
1.5428503
0.6665276
1.526766
0.65769416
1.5246942
0.66742265
1.5386342
0.6539253
1.5091316
3.0650423
3.3314443
0.6547131
1.5209527
0.66535366
1.5361674
0.65888655
1.5223137
0.66400856
1.5305663
0.6714229
1.5330858
0.6760891
1.5455705
0.66070503
1.5281836
0.6683011
1.5425872
0.66555554


1.5263747
0.65600705
1.5224204
2.1784425
2.5594544
0.65850145
1.5211077
0.6660597
1.5120707
2.9039028
3.1877217
0.66567755
1.5289277
0.6649291
1.5363888
0.6641731
1.5346524
0.66446185
1.5359707
0.663057
1.5355287
0.65558714
1.5248827
0.6770249
1.5379989
0.65844244
1.5314287
3.058774
3.3391426
6.9980474
7.1448874
0.6774192
1.5312942
0.65220493
1.5133699
7.8697696
7.9838614
0.67655945
1.5388753
0.6560428
1.532188
5.805868
5.94056
0.66490775
1.5208932
3.8271723
4.0326247
0.65824145
1.5268991
0.66135275
1.5267947
3.4155102
3.6754217
0.67366153
1.537002
0.6706317
1.5323071
0.6658661
1.530917
4.259946
4.428712
0.670762
1.5364112
0.6572999
1.5257319
3.400892
3.6658359
5.2345853
5.3787937
0.65791327
1.5283207
0.6515389
1.5190147
2.3458982
2.7204752
2.3720908
2.7006965
0.6724445
1.5316237
0.6673209
1.5315073
0.6779859
1.5380683
4.363519
4.539798
0.66799814
1.5337445
0.66075546
1.5303816
0.659954
1.5382924
0.67000633
1.528159
0.6482431
1.5144053
0.6456977
1.5200964
0.69088995
1.5552036
3.3311124

1.5224828
0.6782973
1.5428503
0.6524292
1.5137938
0.6644538
1.5292732
0.6636762
1.5399643
0.66834784
1.5405581
0.6721444
1.5398804
0.67181736
1.5380149
0.66926587
1.5482548
0.66750175
1.5390007
0.6679487
1.5363579
0.6670077
1.5361075
0.66661966
1.5331478
0.6659486
1.5308768
0.66920465
1.5182348
0.65021044
1.5198139
2.269494
2.6426291
4.0671086
4.2766666
2.7521625
3.0559185
0.663057
1.5355287
0.6586617
1.5257053
0.6643931
1.5333849
0.6667038
1.5277725
0.6620507
1.525537
0.6829734
1.5363481
0.67505866
1.5403215
0.6639721
1.5354979
0.65635645
1.5190234
6.8741813
7.0014644
0.65654534
1.5232857
0.6560625
1.5267315
0.65792084
1.5293208
0.6682144
1.5400702
0.6799387
1.5447918
0.6652285
1.5298973
0.66995436
1.5312748
2.2275863
2.6191971
0.662437
1.5282063
0.66751844
1.5407449
3.103925
3.3854601
0.6645636
1.5251672
0.6680654
1.5293386
0.677568
1.549218
0.664202
1.5276101
0.6590526
1.5234078
2.6074991
2.9237645
0.65466475
1.5306653
0.68048537
1.5521857
0.6568947
1.5244529
0.6710605
1.5420882
0.6

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [77]:
fdb.dataTest

Unnamed: 0,Category,Message,stopsRemoved,tfMatrix,sumTFIDF,glove-vector,doc2vec,vec_tfidf-glove,cGroup_glove,cGroup_doc2vec,cGroup_tfidf-glove
3900,ham,Ã mean it's confirmed... I tot they juz say o...,ã mean confirmed tot juz say oni ok,word tf tf-idf 0 ã 1 2...,18.529606,"[-0.07874371428571428, -0.07750728571428571, 0...","[-0.00029004653, 0.0072122235, 0.0012765152, -...","[-0.033669516307554326, -0.1668532815387253, 0...",ham,ham,
3901,ham,Okie,okie,word tf tf-idf 0 okie 1 2.490739,2.490739,"[-1.0514, -0.7125, -0.32979, -1.5782, -0.70277...","[0.0032638062, 0.000998768, -0.006656086, 0.00...","[-2.6187625668247585, -1.7746512543871416, -0....",spam,spam,
3902,ham,That depends. How would you like to be treated...,depends would like treated,word tf tf-idf 0 depends 1 2.842...,9.400800,"[0.5856725, -0.025622499999999993, -0.07667800...","[0.0071007977, 0.008866012, -0.006232733, 0.00...","[1.4380022035117963, -0.10766734072048496, -0....",ham,ham,
3903,ham,"Right on brah, see you later",right brah see later,word tf tf-idf 0 right 1 1.806492 ...,8.250930,"[0.41043999999999997, 0.8718945, 0.611363, -0....","[-0.0032308905, -0.0007188073, 0.0057696486, 0...","[1.1441046094744052, 2.590689262783445, 1.9755...",ham,ham,
3904,ham,Waiting in e car 4 my mum lor. U leh? Reach ho...,waiting e car mum lor u leh reach home already,word tf tf-idf 0 waiting 1 2.021...,18.849881,"[0.3746515, 0.3813084, 0.7807202999999999, 0.0...","[-0.034012422, -0.061670527, 0.015993552, 0.06...","[0.7524747499755884, 0.7093350163164593, 1.480...",ham,ham,
...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u â£ pound prize cla...,word tf tf-idf 0 ...,29.685673,"[-0.0032673076923077126, 0.2802763076923076, 0...","[0.04626287, -0.12486149, 0.010979156, 0.03321...","[-0.018272115681935517, 0.38059669250977396, 0...",ham,spam,
5568,ham,Will Ã¼ b going to esplanade fr home?,ã¼ b going esplanade fr home,word tf tf-idf 0 ã¼ 1 1...,12.328684,"[0.2450002, 0.433446, -0.009058, -0.1098192, 0...","[-0.0032531489, -0.09509008, 0.011011036, 0.04...","[0.7474427145489011, 1.10828246067343, -0.1888...",ham,ham,
5569,ham,"Pity, * was in mood for that. So...any other s...",pity * mood soany suggestions,word tf tf-idf 0 pity ...,15.080331,"[-0.16283274999999997, 0.44291, -0.20726499999...","[0.15559773, -0.024660442, 0.09976124, 0.00577...","[-0.10259994304988762, 1.0048869796671016, -0....",ham,spam,
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like i'd interested buying ...,word tf tf-idf 0 guy ...,32.770129,"[0.1625725923076923, -0.08889784615384616, 0.0...","[0.0029246681, -0.00042458618, 0.0025144704, -...","[0.4332410359621821, -0.43844977348643754, 0.1...",ham,spam,


In [58]:
fdb.groupedCharacteristic['doc2vec'].loc['ham'].to_numpy(dtype=object)

array([array([ 0.00145649, -0.01432053,  0.01307351, -0.00200391, -0.00830727,
        0.02941629, -0.00999487, -0.00073129, -0.00437657,  0.01471183,
        0.00707789,  0.01578464,  0.00084015,  0.00189465, -0.00029905,
        0.01071829,  0.00323365, -0.00443153, -0.01489976, -0.01932186,
        0.01500085, -0.01300158,  0.00967808, -0.00938102, -0.01180961,
        0.00983463, -0.00319925,  0.00618966, -0.00259511,  0.00234092,
        0.00247946, -0.02748282,  0.00033443, -0.01787248,  0.00927453,
       -0.01269134, -0.01494397, -0.01469267, -0.00249837, -0.00466744,
        0.00638671, -0.00415108,  0.01221796, -0.01780416, -0.00916571,
       -0.0219495 , -0.00583187,  0.02274373, -0.0055441 ,  0.00030472],
      dtype=float32)], dtype=object)

In [69]:
a = fdb.dataTest['glove-vector'].iloc[1]
b = fdb.groupedCharacteristic['doc2vec'].loc['ham'].to_numpy(dtype=object)
print(a,type(a))
print(b,type(b))
import scipy
print(scipy.spatial.distance.cosine(a,b))
print(np.linalg.norm(scipy.spatial.distance.cosine(a,b)))
#computedGroup = self.getNearestGroup(self.dataTest[vectorName].iloc[docId],vectorName)

[-1.0514   -0.7125   -0.32979  -1.5782   -0.70277  -0.17696   0.35851
 -0.25738  -0.45986   0.69583  -0.59439  -0.071654  0.99636  -0.37387
  0.55014  -0.55981   0.52405   0.58494   0.66762   0.84464  -0.27754
  0.093692 -0.47661  -0.14204   0.56345   0.3176   -0.20212   0.38231
  0.032153 -0.69724  -1.3483   -0.78941   0.20827   0.61859  -0.65964
  0.050033 -0.021429 -1.4299    1.0405    0.42354  -0.79416   0.19858
  0.047546  0.082816 -0.7052   -0.40723   0.39191   0.18906  -0.58996
 -0.033718] <class 'numpy.ndarray'>
[array([ 0.00710858, -0.01191311,  0.01180887,  0.01427133, -0.00948243,
        0.02567787, -0.01396888,  0.00585956, -0.02231221, -0.00297664,
        0.01101326,  0.00572352,  0.00294924,  0.00215183, -0.00795537,
        0.00165394,  0.00600958,  0.00063951, -0.01772013, -0.00849582,
       -0.00011211, -0.00393122, -0.00595212, -0.00040865, -0.01173587,
       -0.00106305, -0.01495443,  0.00643836, -0.00229097,  0.01362749,
       -0.00711745, -0.01913031,  0.00014

In [73]:
fdb.data

Unnamed: 0,Category,Message,stopsRemoved,tfMatrix,sumTFIDF,glove-vector,doc2vec,vec_tfidf-glove
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,word tf tf-idf 0 go 1 ...,38.281443,"[0.21390625000000005, 0.3857445625, -0.1334233...","[0.04005482, -0.06680761, 0.02545498, 0.024805...","[0.5959587370342363, 0.8768489600169621, -0.51..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,word tf tf-idf 0 ok 1 1.31950...,12.583182,"[-0.34427266666666667, -0.11794016666666667, 0...","[-0.014016584, -0.035048068, 0.011611654, 0.03...","[-0.6469232570438529, -0.4567087086249299, -0...."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st...,word tf tf-idf 0 entry ...,49.524838,"[-0.3973114285714286, 0.43085399999999996, -0....","[0.0028259559, 0.0051632347, 0.0085555585, 0.0...","[-1.3248417366882783, 1.1032396880662387, -0.5..."
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,word tf tf-idf 0 u 2 1.669...,16.431526,"[0.17517428571428573, 0.24041571428571432, 0.2...","[-0.01914546, -0.09453999, 0.019697232, 0.0757...","[0.5123484443079068, 0.5753741094651186, 0.311..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,word tf tf-idf 0 nah 1 2.70461...,16.678825,"[0.19229857142857143, 0.4842861428571427, 0.19...","[-0.005010659, 0.00058875955, 0.0060600745, -0...","[0.47549546283727345, 1.2971284798171407, 0.48..."
...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u â£ pound prize cla...,word tf tf-idf 0 ...,29.685673,"[-0.0032673076923077126, 0.2802763076923076, 0...","[-0.012644604, -0.062778056, 0.013478121, 0.03...","[-0.018272115681935517, 0.38059669250977396, 0..."
5568,ham,Will Ã¼ b going to esplanade fr home?,ã¼ b going esplanade fr home,word tf tf-idf 0 ã¼ 1 1...,12.328684,"[0.2450002, 0.433446, -0.009058, -0.1098192, 0...","[-0.023012374, -0.085845396, 0.046025265, 0.04...","[0.7474427145489011, 1.10828246067343, -0.1888..."
5569,ham,"Pity, * was in mood for that. So...any other s...",pity * mood soany suggestions,word tf tf-idf 0 pity ...,15.080331,"[-0.16283274999999997, 0.44291, -0.20726499999...","[0.14092924, -0.011391963, 0.114448994, 0.1104...","[-0.10259994304988762, 1.0048869796671016, -0...."
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like i'd interested buying ...,word tf tf-idf 0 guy ...,32.770129,"[0.1625725923076923, -0.08889784615384616, 0.0...","[0.0029246681, -0.00042458618, 0.0025144704, -...","[0.4332410359621821, -0.43844977348643754, 0.1..."
