In [1]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import sys
import os
import datetime

In [2]:
from Vocab_class import TermVocab
TVocab = TermVocab()

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/harsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
coll_dir = '20news-bydate-test'

In [60]:
class genSimGraph:
    def __init__(self, coll_dir, op_file):
        self.coll_dir = coll_dir
        self.op_file = op_file
        self.doc_term_dict = None
        self.vocab = TermVocab()
        self.limit_file_read_to = 10
        self.file_count = 0
        
    def generate_term_set(self):
        limit_file_read_to = self.limit_file_read_to
        coll_dir = self.coll_dir
        doc_term_dict = {}
        print('Creating Term set for docs')
        ps = PorterStemmer()
        file_count = 0
        read_file_count = 0
        for foldername in os.listdir(coll_dir):
            print('For Folder:', foldername)
            path_to_file = os.path.join(coll_dir, foldername)
            for filename in os.listdir(path_to_file):
                file_count+=1
                #print('---------------------For File:', filename)
                print('file:', foldername+'/'+filename,end = ' ')
                with open(os.path.join(path_to_file, filename), 'r') as f:
                    content = None
                    try:
                        content = f.read()
                    except:
                        print('cant read',foldername +'/'+filename)
                    read_file_count +=1
                    #print(content)
                    doc_terms = word_tokenize(content)
                    #print(doc_terms)
                    stemmed_terms = [ps.stem(term) for term in doc_terms]
                    #print('non unique',len(stemmed_terms))
                    #print('uniq', len(set(stemmed_terms)))
                    doc_term_dict[foldername+'/'+filename] = set(stemmed_terms)
                    [self.vocab.add_term(term) for term in stemmed_terms]
                    #TVocab.add_term(stemmed_terms[0])
                    #print(stemmed_terms)
                if file_count>=limit_file_read_to:
                    break
            if file_count>=limit_file_read_to:
                break
        print(file_count)
        self.file_count = file_count
        print('Read File Count:', read_file_count)
        self.doc_term_dict = doc_term_dict
        return doc_term_dict
    
    def write_to_file(self, doc1, doc2, sim, path = None):
        if path == None:
            path = self.op_file
        content = doc1 + '\t' + doc2 + '\t' + str(sim) + '\n'
        with open(path, '+a') as file:
            file.write(content)
        return True
    
    def clear_file(self, filename):
        open(filename, 'w').close()
        return True
    
    def gen_vocab(self):
        limit_file_read_to = self.limit_file_read_to
        coll_dir = self.coll_dir
        idf = {}
        self.vocab = TermVocab()
        print('Generating vocab')
        ps = PorterStemmer()
        file_count = 0
        read_file_count = 0
        for foldername in os.listdir(coll_dir):
            print('For Folder:', foldername)
            path_to_file = os.path.join(coll_dir, foldername)
            for filename in os.listdir(path_to_file):
                file_count+=1
                #print('---------------------For File:', filename)
                print('file:', foldername+'/'+filename,end = ' ')
                with open(os.path.join(path_to_file, filename), 'r') as f:
                    content = None
                    try:
                        content = f.read()
                    except:
                        print('cant read',foldername +'/'+filename)
                    
                    read_file_count +=1
                    #print(content)
                    doc_terms = word_tokenize(content)
                    #print(doc_terms)
                    stemmed_terms = set([ps.stem(term) for term in doc_terms])
                    doc_term_dict[foldername+'/'+filename] = set(stemmed_terms)
                    for term in stemmed_terms:
                        self.vocab.add_term(term)
                        if term in idf:
                            idf[term] +=1
                        else:
                            idf[term] = 1
                    #TVocab.add_term(stemmed_terms[0])
                    #print(stemmed_terms)
                if file_count>=limit_file_read_to:
                    break
            if file_count>=limit_file_read_to:
                break
        D = file_count
        for term in idf:
            idf[term] = np.log(1 +D/idf[term]) 
        print(file_count)
        self.file_count = file_count
        return idf
    
    def make_doc_vectors(self):
        limit_file_read_to = self.limit_file_read_to
        coll_dir = self.coll_dir
        doc_term_dict = {}
        tf = {}
        idf = self.gen_vocab()
        n = self.vocab.vocab_length
        print('Creating vector for docs')
        ps = PorterStemmer()
        file_count = 0
        read_file_count = 0
        for foldername in os.listdir(coll_dir):
            print('For Folder:', foldername)
            path_to_file = os.path.join(coll_dir, foldername)
            for filename in os.listdir(path_to_file):
                file_count+=1
                #print('---------------------For File:', filename)
                print('file:', foldername+'/'+filename,end = ' ')
                with open(os.path.join(path_to_file, filename), 'r') as f:
                    content = f.read()
                    read_file_count +=1
                    
                    #tokenize and stem
                    doc_terms = word_tokenize(content)
                    if len(doc_terms) < 1:
                        continue
                    stemmed_terms = [ps.stem(term) for term in doc_terms]
                    
                    #make tf vector for docs
                    doc_v = np.zeros(n)
                    for term in stemmed_terms:
                        doc_v[self.vocab.to_index(term)] += 1
                    doc_v = np.ma.log(doc_v)
                    tf[foldername+'/'+filename] = 1 + doc_v.filled(0)
                    
                    #include idf part in the vector
                    for term_index in range(self.vocab.vocab_length):
                        term = self.vocab.to_term(term_index)
                        tf[foldername+'/'+filename][term_index] *= idf[term]
                    
                if file_count>=limit_file_read_to:
                    break
            if file_count>=limit_file_read_to:
                break
        print(file_count)
        return tf
        
    def jaccard_sim(self):
        doc_term_dict = self.doc_term_dict
        self.clear_file(self.op_file)
        n = len(doc_term_dict)
        jacob_mat = np.zeros((n,n))
        for i, itr_i in zip(doc_term_dict, range(n)):
            for j, itr_j in zip(doc_term_dict, range(n)):
                if itr_j <= itr_i:
                    continue
                term_dic_i = doc_term_dict[i]
                term_dic_j = doc_term_dict[j]
                if len(term_dic_i)>0 or len(term_dic_j)>0:
                    nr = len(term_dic_i & term_dic_j)
                    dr = len(term_dic_i | term_dic_j)
                    jacob_mat[itr_i, itr_j] = nr/dr
                    if jacob_mat[itr_i, itr_j] != 0:
                        self.write_to_file(i, j, jacob_mat[itr_i, itr_j])
                itr_j +=1
            itr_i +=1
        return jacob_mat
    
    def cosine_sim(self, d1, d2):
        sim = np.dot(d1, d2)/(np.linalg.norm(d1) * np.linalg.norm(d2))
        return sim
    
    def tf_idf_sim(self):
        doc_vectors = self.make_doc_vectors()
        n = self.vocab.vocab_length
        m = self.file_count
        print('file count:', m)
        sim_mat = np.zeros((m,m))
        for d1, i in zip(doc_vectors, range(m)):
            for d2, j in zip(doc_vectors, range(m)):
                if j <= i:
                    continue
                sim = self.cosine_sim(doc_vectors[d1], doc_vectors[d2])
                sim_mat[i,j] = sim
                if sim > 0:
                    self.write_to_file(d1, d2, sim, 'tf_idf_sim.txt')
        print(sim_mat)
        return True
            

In [61]:
g_sim = genSimGraph(coll_dir, 'sim_op.txt')

In [62]:
doc_term_dict = g_sim.generate_term_set()

Creating Term set for docs
For Folder: sci.space
file: sci.space/61444 file: sci.space/61440 file: sci.space/61396 file: sci.space/61363 file: sci.space/61321 file: sci.space/61414 file: sci.space/62114 file: sci.space/62405 file: sci.space/61522 file: sci.space/61416 10
Read File Count: 10


In [63]:
g_sim.jaccard_sim()

array([[0.        , 0.20178042, 0.18181818, 0.16060606, 0.10365854,
        0.23611111, 0.13141026, 0.10309278, 0.11552347, 0.09430894],
       [0.        , 0.        , 0.17567568, 0.18897638, 0.124     ,
        0.15062762, 0.13333333, 0.13207547, 0.13432836, 0.07441016],
       [0.        , 0.        , 0.        , 0.22556391, 0.12132353,
        0.14122137, 0.12547529, 0.12820513, 0.125     , 0.09608541],
       [0.        , 0.        , 0.        , 0.        , 0.12608696,
        0.17674419, 0.1627907 , 0.14136126, 0.13812155, 0.08778626],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.11538462, 0.10628019, 0.09444444, 0.13496933, 0.09145129],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.10945274, 0.13017751, 0.1474359 , 0.07100592],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.11904762, 0.13548387, 0.06930693],
       [0.        , 0.        , 0.       

In [64]:
g_sim.tf_idf_sim()

Generating vocab
For Folder: sci.space
file: sci.space/61444 file: sci.space/61440 file: sci.space/61396 file: sci.space/61363 file: sci.space/61321 file: sci.space/61414 file: sci.space/62114 file: sci.space/62405 file: sci.space/61522 file: sci.space/61416 10
Creating vector for docs
For Folder: sci.space
file: sci.space/61444 file: sci.space/61440 file: sci.space/61396 file: sci.space/61363 file: sci.space/61321 file: sci.space/61414 file: sci.space/62114 file: sci.space/62405 file: sci.space/61522 file: sci.space/61416 10
file count: 10
[[0.         0.97727218 0.96927141 0.97953964 0.97335732 0.98065062
  0.97766819 0.97736422 0.97835111 0.9145893 ]
 [0.         0.         0.97795757 0.98943328 0.98456061 0.98943724
  0.98879728 0.98859438 0.99002432 0.92500254]
 [0.         0.         0.         0.98010821 0.97467118 0.97933804
  0.97898665 0.97865866 0.97945428 0.91735388]
 [0.         0.         0.         0.         0.98762272 0.99283332
  0.99184685 0.99191846 0.9931889  0.929

True

In [None]:
import scipy

In [66]:
from sknetwork.ranking import PageRank

ModuleNotFoundError: No module named 'sknetwork'