In [1]:
import pymongo
from multiprocessing import Pool

In [26]:
from preprocessor.preprocessing import PreprocessPostContent

In [3]:
client = pymongo.MongoClient(host='10.1.1.9', port=50000)
javaposts_db = client.javaposts
posts_collection = javaposts_db.posts
posts = posts_collection.find()

In [4]:
print(posts.count())

390000


  """Entry point for launching an IPython kernel.


In [5]:
def process(post_bulk):
    wordlist_list = []
    for post in post_bulk:
        wordlist_list.append(PreprocessPostContent().get_single_para_word_list(post['Title']))
        wordlist_list.extend(PreprocessPostContent().get_mul_para_wordlist_list(post['Body']))

    return wordlist_list

def parallel_process(posts):
    bulk = []
    block_list = []
    count = 0
    pool = Pool(50)
    for post in posts:
        bulk.append(post)
        count += 1
        if not count % 1000:
            block_list.append(bulk)
            bulk = []
    
    block_list.append(bulk)
    return_list = pool.map(process, block_list)
    pool.close()
    pool.join()
    return return_list

In [6]:
# used about 5 min
return_list = parallel_process(posts)
wordlist_list = []

for ls in return_list:
    wordlist_list.extend(ls)

In [7]:
len(wordlist_list)

1372909

In [8]:
from gensim.models import Word2Vec

In [9]:
num_features = 1000  # 词向量维度
min_word_count = 1  # 最小词频数目
num_workers = 40  # 并行线程
context = 10  # Context window size

# Initialize and train the model (this will take some time)

model = Word2Vec(wordlist_list, workers=num_workers, size=num_features, min_count=min_word_count, window=context)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "1000features"
model.save(model_name)

In [10]:
model.wv.vocab

{'How': <gensim.models.keyedvectors.Vocab at 0x7f174fdd41d0>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7f165f837fd0>,
 'where': <gensim.models.keyedvectors.Vocab at 0x7f16608a2470>,
 'can': <gensim.models.keyedvectors.Vocab at 0x7f16608a24e0>,
 'I': <gensim.models.keyedvectors.Vocab at 0x7f16608a2588>,
 'start': <gensim.models.keyedvectors.Vocab at 0x7f16608a2550>,
 'learning': <gensim.models.keyedvectors.Vocab at 0x7f16608a2518>,
 'AI': <gensim.models.keyedvectors.Vocab at 0x7f16608a25c0>,
 'if': <gensim.models.keyedvectors.Vocab at 0x7f16608a25f8>,
 'have': <gensim.models.keyedvectors.Vocab at 0x7f16608a2630>,
 'considerable': <gensim.models.keyedvectors.Vocab at 0x7f16608a2668>,
 'knowledge': <gensim.models.keyedvectors.Vocab at 0x7f16608a26a0>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f16608a26d8>,
 'Java': <gensim.models.keyedvectors.Vocab at 0x7f16608a2710>,
 'C++': <gensim.models.keyedvectors.Vocab at 0x7f16608a2748>,
 'deep': <gensim.models.keyedvectors.Vocab at 0x7

In [21]:
model.wv.most_similar("java")

[('Java', 0.691421389579773),
 ('JAVA', 0.6243633031845093),
 ('flex', 0.516598105430603),
 ('c++', 0.5053383708000183),
 ('groovy', 0.5041947364807129),
 ('python', 0.4993615746498108),
 ('scala', 0.49760356545448303),
 ('c#', 0.4690357446670532),
 ('clojure', 0.46818214654922485),
 ('netbeans', 0.46812400221824646)]

In [12]:
# idf计算的输入 ["Should be sentences","so should using it",] 
sentences = []
for wordlist in wordlist_list:
    sentences.append(" ".join(wordlist))

In [13]:
sentences

['How and where can I start learning AI if I have considerable knowledge of Java C++',
 'I have deep interest in AI and want to start learning how to implement current method I know about java and C++ are these languages sufficient',
 'If I lag some knowledge which is required before starting AI then please let me know',
 'Learning Artificial Intelligence with Python vs Java',
 "I am University student taking an Artificial Intelligence class this semester Our Professor's programming language of choice is Java but it seems that perhaps with some nudging he can change it to Python",
 "I wanted to know if there is any merit in doing so I know programming language is just programming language however given the industry's wide use of Python when implementing AI algorithms I think it makes much more sense for us to use Python It will be easier to transition from University environment to an Industrial one having done several assignments in Python and having clear understanding of all the too

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# corpus = ["This is very strange",
#           "This is very nice",
#          ]
vectorizer = TfidfVectorizer(
                        use_idf=True, # utiliza o idf como peso, fazendo tf*idf
                        norm=None, # normaliza os vetores
                        smooth_idf=False, #soma 1 ao N e ao ni => idf = ln(N+1 / ni+1)
                        sublinear_tf=False, #tf = 1+ln(tf)
                        binary=False,
                        min_df=1, max_df=1.0, max_features=None,
                        ngram_range=(1,1), preprocessor=None,
                        stop_words=None, 
                        tokenizer=None,
                        vocabulary=None
             )
X = vectorizer.fit_transform(sentences)
idf = vectorizer.idf_
idf_dict = dict(zip(vectorizer.get_feature_names(), idf))
len(idf_dict)

206154

构建一个小的子集进行原型开发 
`model` 和 `idf_dict`

In [43]:
# formula (1) in III.A  rel(q->Q)
def calculate_asymmetric(model, q_word_list, Q_word_list, idf_metric_dict):
    rel_idf_list = [0]
    idf_list = [0]
    for q_word in q_word_list:
        rel_list = [0]
        for Q_word in Q_word_list:
            try:
                val = model.wv.similarity(q_word, Q_word)
            except:
                val = 0
            rel_list.append(val)

        try:
            idf = idf_metric_dict[q_word]
        except:
            idf = 0

        rel_idf_list.append(max(rel_list) * idf)
        idf_list.append(idf)

    sum_idf = sum(idf_list)
    if not sum_idf:
        rel_q2Q = 0
    else:
        rel_q2Q = sum(rel_idf_list) / sum_idf

    return rel_q2Q

# rel(q, Q)
def calculate_symmetric(model, q_word_list, Q_word_list, idf_metric_dict):
    rel_q2Q = calculate_asymmetric(model, q_word_list, Q_word_list, idf_metric_dict)
    rel_Q2q = calculate_asymmetric(model, Q_word_list, q_word_list, idf_metric_dict)
    return (rel_q2Q + rel_Q2q) / 2

In [44]:
class Post():

    def __init__(self, title_word_list, answer_list):
        self.title_word_list = title_word_list
        self.answer_list = answer_list
        self.score = 0

    def calculate_score(self, model, query_list, idf_metric_dict):
        self.score = calculate_symmetric(model, query_list, self.title_word_list, idf_metric_dict)
        
    def __lt__(self, other):
        return self.score < other.score

In [45]:
posts = posts_collection.find()
posts.count()

  


390000

In [46]:
post_obj_list_1000 = []
count = 0
for post in posts:
    post_obj_list_1000.append(Post(PreprocessPostContent().get_single_para_word_list(post['Title']), post['answers']))
    count += 1
    if count == 1000:
        break

len(post_obj_list_1000)
# post_obj_list_1000[1].title_word_list

1000

In [47]:
post_obj_list_1000[1].title_word_list

['Learning', 'Artificial', 'Intelligence', 'with', 'Python', 'vs', 'Java']

In [152]:
query = "java vs python"
query_word_list = PreprocessPostContent().get_single_para_word_list(query)
for post_obj in post_obj_list_1000:
    print(post_obj.title_word_list)
    post_obj.calculate_score(model, query_word_list, idf_dict)

['How', 'to', 'learn', 'Java', 'Webservices']
['Java', 'Best', 'Place', 'to', 'Begin', 'Learning', 'Basic', 'Networking']
['Java', 'Programatic', 'Way', 'to', 'Determine', 'Current', 'Windows', 'User']
['Converting', 'List', 'to', 'List']
['Alternatives', 'to', 'System.exit']
['How', 'do', 'I', 'learn', 'Java5', 'or', 'Java6']
['Weka', 'java', 'API', 'Attribute', 'Selection', 'and', 'Cross', 'Validation']
['Why', 'learn', 'Perl', 'Python', 'Ruby', 'if', 'the', 'company', 'is', 'using', 'C++', 'C#', 'or', 'Java', 'as', 'the', 'application', 'language']
['How', 'to', 'start', 'learning', 'JAVA', 'for', 'use', 'with', 'Oracle', 'RDBMS']
['Thinking', 'of', 'learning', 'Maven']
['How', 'and', 'where', 'can', 'I', 'start', 'learning', 'AI', 'if', 'I', 'have', 'considerable', 'knowledge', 'of', 'Java', 'C++']
['Getting', 'java', 'and', 'flash', 'to', 'talk', 'to', 'each', 'other']
['When', 'should', 'you', 'use', 'java', 'stored', 'procedures', 'with', 'an', 'Oracle', 'database', '...', 'what

  if __name__ == '__main__':


['How', 'can', 'you', 'run', 'Javascript', 'using', 'Rhino', 'for', 'Java', 'in', 'sandbox']
['How', 'do', 'I', 'use', 'Java', 'to', 'read', 'from', 'file', 'that', 'is', 'actively', 'being', 'written', 'to']
['Propagation', 'of', 'Oracle', 'Transactions', 'Between', 'C++', 'and', 'Java']
['What', 'does', 'either', 'Java', 'GUI', 'editor', 'offer', 'for', 'rapid', 'development', 'and', 'maintainability']
['How', 'should', 'exceptions', 'be', 'planned', 'at', 'the', 'architectural', 'level']
['How', 'to', 'call', 'Java', 'code', 'from', 'C#']
['Read', 'write', 'to', 'Windows', 'registry', 'using', 'Java']
['How', 'to', 'implement', 'simple', 'auto-complete', 'functionality']
['How', 'can', 'I', 'measure', 'distance', 'and', 'create', 'bounding', 'box', 'based', 'on', 'two', 'latitude+longitude', 'points', 'in', 'Java']
['How', 'do', 'I', 'in', 'java', 'add', 'stacktrace', 'to', 'my', 'debugging', 'printout']
['Which', 'EJB', 'NUM', 'persisent', 'provider', 'should', 'I', 'use']
['How', 

["What's", 'the', 'syntax', 'for', 'mod', 'in', 'java']
['How', 'can', 'I', 'send', 'an', 'email', 'by', 'Java', 'application', 'using', 'GMail', 'Yahoo', 'or', 'Hotmail']
['How', 'do', 'you', 'add', 'button', 'to', 'the', 'email', 'message', 'window', 'toolbar', 'in', 'Lotus', 'Notes', '8.5']
['Java', 'API', 'to', 'generate', 'Java', 'source', 'files']
['How', 'to', 'avoid', 'OutOfMemoryError', 'when', 'using', 'Bytebuffers', 'and', 'NIO']
['Can', 'I', 'convert', 'the', 'following', 'code', 'to', 'use', 'generics']
['How', 'to', 'reduce', 'javax.faces.ViewState', 'in', 'JSF']
['How', 'do', 'you', 'maintain', 'java', 'webapps', 'in', 'different', 'staging', 'environments']
['If', 'you', 'have', 'Java', 'application', 'that', 'is', 'consuming', 'CPU', 'when', 'it', "isn't", 'doing', 'anything', 'how', 'do', 'you', 'determine', 'what', 'it', 'is', 'doing']
['How', 'can', 'I', 'Java', 'webstart', 'multiple', 'dependent', 'native', 'libraries']
['Java', 'Compilation', 'Is', 'there', 'way',

['Why', "doesn't", 'Java', 'autoboxing', 'extend', 'to', 'method', 'invocations', 'of', 'methods', 'of', 'the', 'autoboxed', 'types']
['How', 'to', 'remove', 'debug', 'statements', 'from', 'production', 'code', 'in', 'Java']
['How', 'to', 'get', 'whois', 'information', 'of', 'domain', 'name', 'in', 'my', 'program']
['How', 'do', 'I', 'format', 'number', 'in', 'Java']
['How', 'do', 'I', 'find', 'out', 'what', 'type', 'each', 'object', 'is', 'in', 'ArrayList']
['java', 'get', 'file', 'size', 'efficiently']
['Charting', 'library', 'for', 'Java', 'and', 'Net']
['What', 'is', 'the', 'best', 'way', 'to', 'do', 'distributed', 'transactions', 'across', 'multiple', 'databases', 'using', 'Spring', 'and', 'Hibernate']
['Physical', 'Address', 'in', 'JAVA']
['How', 'do', 'I', 'get', 'InputVerifier', 'to', 'work', 'with', 'an', 'editable', 'JComboBox']
['How', 'can', 'I', 'draw', 'curve', 'that', 'varies', 'in', 'thickness', 'along', 'its', 'path']
['How', 'do', 'you', 'ensure', 'multiple', 'threads

['Easiest', 'way', 'to', 'merge', 'release', 'into', 'one', 'JAR', 'file']
['What', 'is', 'the', 'best', 'way', 'to', 'load', 'Hibernate', 'object', 'graph', 'before', 'using', 'it', 'in', 'UI']
['How', 'do', 'I', 'use', 'JUnitPerf', 'with', 'JWebUnit', 'and', 'JUnit']
['Is', 'there', 'Java-based', 'ray', 'tracing', 'model', 'that', 'can', 'be', 'adapted', 'for', 'use', 'in', 'underwater', 'acoustics']
['Hidden', 'Features', 'of', 'Java']
['Ant', 'Junit', 'tests', 'are', 'running', 'much', 'slower', 'via', 'ant', 'than', 'via', 'IDE', 'what', 'to', 'look', 'at']
['What', 'is', 'the', 'best', 'way', 'to', 'present', 'data', 'from', 'very', 'large', 'resultset']
['Scanner', 'cannot', 'be', 'resolved', 'to', 'type']
['How', 'do', 'you', 'remotely', 'update', 'Java', 'applications']
['How', 'do', 'you', 'tell', 'whether', 'string', 'is', 'an', 'IP', 'or', 'hostname']
['How', 'to', 'accept', 'REF', 'cursor', 'in', 'JAVA', 'without', 'importing', 'Oracle', 'Package']
['How', 'to', 'retrieve'

['How', 'do', 'you', 'generate', 'and', 'analyze', 'thread', 'dump', 'from', 'running', 'JBoss', 'instance']
['Static', 'Analysis', 'tool', 'recommendation', 'for', 'Java']
['How', 'do', 'I', 'return', 'NUM', 'Forbidden', 'in', 'Spring', 'MVC']
['Linux', 'commands', 'from', 'Java']
['Does', 'it', 'matter', 'which', "vendor's", 'JDK', 'you', 'build', 'with']
['How', 'do', 'I', 'set', 'the', 'HttpOnly', 'flag', 'on', 'JSF', 'Richfaces']
['When', 'to', 'choose', 'checked', 'and', 'unchecked', 'exceptions']
['How', 'do', 'I', 'load', 'an', 'org.w3c.dom.Document', 'from', 'XML', 'in', 'string']
['Blank', 'Page', 'in', 'JSF']
['How', 'do', 'you', 'fix', 'Too', 'many', 'open', 'files', 'problem', 'in', 'Hudson']
['UI', 'design', 'alternatives', 'with', 'Groovy', 'JRuby', 'Jython', 'or', 'other', 'JVM', 'languages']
['How', 'do', 'you', 'handle', 'developer', 'individual', 'files', 'under', 'version', 'control']
['pl', 'sql', 'java', 'creating', 'dynamic', 'query']
['Differences', 'between', '

['Getting', 'mail', 'from', 'GMail', 'into', 'Java', 'application', 'using', 'IMAP']
['Starting', 'process', 'with', 'inherited', 'stdin', 'stdout', 'stderr', 'in', 'Java', 'NUM']
['Why', 'am', 'I', 'getting', 'NoClassDefFoundError', 'in', 'Java']
['Reading', 'an', 'ASCII', 'file', 'with', 'FileChannel', 'and', 'ByteArrays']
['Good', 'clustering', 'Java', 'library']
['ArrayList', 'in', 'Java', 'and', 'inputting']
['How', 'do', 'I', 'recover', 'from', 'an', 'unchecked', 'exception']
['Get', 'IFile', 'from', 'IWorkspaceRoot', 'and', 'location', 'String']
['Any', 'clever', 'ways', 'of', 'handling', 'the', 'context', 'in', 'web', 'app']
['Multi-armed', 'bandit', 'algorithms', 'in', 'Java']
['Java', 'ConnectionPool', 'connection', 'not', 'closing', 'stuck', 'in', 'sleep']
['Appropriate', 'design', 'pattern', 'for', 'an', 'event', 'log', 'parser']
['Java', 'Time', 'Zone', 'is', 'messed', 'up']
['Java', 'and', 'manually', 'executing', 'finalize']
["What's", 'the', 'nearest', 'substitute', 'fo

['Memory', 'footprint', 'issues', 'with', 'JAVA', 'JNI', 'and', 'application']
['Turn', 'an', 'array', 'of', 'pixels', 'into', 'an', 'Image', 'object', 'with', "Java's", 'ImageIO']
['Is', 'Xorshift', 'RNG', 'good', 'enough', 'for', 'Monte', 'Carlo', 'approaches', 'If', 'not', 'what', 'alternatives', 'are', 'there']
['get', 'OS-level', 'system', 'information']
['Indirectly', 'referenced', 'from', 'required', 'class', 'file']
['test', 'attribute', 'in', 'JSTL', 'tag']
['Before', 'and', 'After', 'Suite', 'execution', 'hook', 'in', 'jUnit', '4.x']
['Keyword', 'for', 'the', 'outer', 'class', 'from', 'an', 'anonymous', 'inner', 'class']
['Difference', 'between', 'int', '[]', 'array', 'and', 'int', 'array', '[]']
['Java', 'Serialization', 'with', 'non', 'serializable', 'parts']
['ADF', 'business', 'components', 'through', 'RMI', 'vs', 'EJB', 'and', 'Toplink']
['Java', 'Collections', 'using', 'wildcard']
['Does', 'Tiles', 'for', 'Struts2', 'support', 'UTF-8', 'encoded', 'templates']
['JUnit', 

['Is', 'Bouncy', 'Castle', 'API', 'Thread', 'Safe']
['Get', 'Methods', 'One', 'vs', 'Many']
['Sequence', 'Diagram', 'Reverse', 'Engineering']
['Phantom', 'Referenced', 'Objects']
['OSGi', 'Testing']
['Biggest', 'GWT', 'Pitfalls']
['Test', 'Distribution']
['Virtual', 'Machine', 'Optimization']
['Updating', 'Android', 'Tab', 'Icons']
['JPA', 'Multiple', 'Transaction', 'Managers']
['Getting', 'Spring', 'Application', 'Context']
['Is', 'Project', 'Darkstar', 'Realistic']
['Why', "aren't", 'Enumerations', 'Iterable']
['Best', 'Apache', 'Ant', 'Template']
['Tomcat', 'vs', 'Weblogic', 'JNDI', 'Lookup']
['Security', 'For', 'Voting', 'Application']
['OSCache', 'vs', 'EHCache']
['Recommended', 'Source', 'Control', 'Directory', 'Structure']
['Apache', 'Axis', 'ConfigurationException']
['Unrooted', 'Tests']
['Best', 'StAX', 'Implementation']
['Method', 'Local', 'Inner', 'Class']
['Generic', 'Method', 'Type', 'Safety']
['JComboBox', 'Selection', 'Change', 'Listener']
['Large', 'File', 'Download']
[

In [153]:
post_obj_list_1000.sort(reverse=True)

In [154]:
for i in range(10):
    print(str(post_obj_list_1000[i].title_word_list) + str(post_obj_list_1000[i].score))

['Java', 'Logging', 'vs', 'Log4J']0.8427421238761538
['Learning', 'Artificial', 'Intelligence', 'with', 'Python', 'vs', 'Java']0.7742961572968659
['SAX', 'vs', 'XmlTextReader', 'SAX', 'in', 'C#']0.7535447278262272
['Get', 'Methods', 'One', 'vs', 'Many']0.7439130046563265
['JUnit', 'vs', 'TestNG']0.7429130618391463
['Tomcat', 'vs', 'Weblogic', 'JNDI', 'Lookup']0.736205519770776
['OSCache', 'vs', 'EHCache']0.7240180284547915
['Weka', 'java', 'API', 'Attribute', 'Selection', 'and', 'Cross', 'Validation']0.5767983696168854
['Iterators', 'in', 'C++', 'vs', 'Java', 'is', 'there', 'conceptual', 'difference']0.5578987917974823
['Scala', 'vs', 'Java', 'if', "you're", 'NOT', 'going', 'to', 'use', 'Spark']0.5541921651850947


In [155]:
class Paragraph():
    def __init__(self, raw_text, word_list, vote_score, position):
        self.raw_text = raw_text # 带标签的，只经过<p>分段的数据
        self.word_list = word_list
        self.position = position # 第几段
        self.vote_score = vote_score
        self.relevance_score = 0
        self.entity_score = 0
        self.infor_entropy = 0
        self.semantic_pattern = 0
        self.format_pattern = 0
        self.pos_score = 0
        self.overall_score = 0

    def cal_relevance(self, model, query_word_list, idf_metric_dict):
        self.relevance_score = calculate_symmetric(model, query_word_list, self.word_list, idf_metric_dict)

    def cal_entropy(self, idf_metric_dict):
        idf_list = []
        for word in self.word_list:
            try:
                idf = idf_metric_dict[word]
            except:
                idf = 0

            idf_list.append(idf)

        self.infor_entropy = sum(idf_list)

    def cal_semantic_pattern(self):
        pattern = ['please check', 'pls check', 'you should',
                   'you can try', 'you could try', 'check out',
                   'in short', 'the most important', 'i\'d recommend',
                   'in summary', 'keep in mind', 'i suggest']

        lower_plain_text = self.raw_text.lower()

        for p in pattern:
            if lower_plain_text.find(p) != -1:
                self.semantic_pattern = 1
                break

    def cal_format_pattern(self):
        pattern = ['<strong>', 'strike']

        lower_plain_text = self.raw_text.lower()

        for p in pattern:
            if lower_plain_text.find(p) != -1:
                self.format_pattern = 1
                break

    def cal_pos_score(self):
        if self.position >= 1 and self.position <= 3:
            self.pos_score = 1 / self.position
        else:
            self.pos_score = 0


    def normalized(self, relevance_min, relevance_max, entropy_min, entropy_max, vote_min, vote_max):
        self.relevance_score = (self.relevance_score - relevance_min) / (relevance_max - relevance_min)
        self.infor_entropy = (self.infor_entropy - entropy_min) / (entropy_max - entropy_min)
        self.vote_score = (self.vote_score - vote_min) / (vote_max - vote_min)

    def cal_overall_score(self):
        self.overall_score += self.vote_score
        self.overall_score += self.relevance_score
        self.overall_score += self.entity_score
        self.overall_score += self.infor_entropy
        self.overall_score += self.semantic_pattern
        self.overall_score += self.format_pattern
        self.overall_score += self.pos_score
        
    def __gt__(self, other):
        return self.overall_score > other.overall_score

In [156]:
# 选择前5的Post
preprocessor = PreprocessPostContent()
paragraph_obj_list = []
for i in range(5):
    answer_list = post_obj_list_1000[i].answer_list
    print(len(answer_list))
    for answer in answer_list:
        paragraph_list = preprocessor.getParagraphs(answer['Body'])
        print("---" + str(len(paragraph_list)))
        for pos, para in enumerate(paragraph_list, 1):
            paragraph_obj_list.append(Paragraph(para, preprocessor.get_single_para_word_list(para), vote_score=answer['Score'], position=pos))


2
---7
---1
2
---3
---1
2
---2
---8
2
---1
---3
2
---4
---3


In [157]:
relevance_list = []
entropy_list = []
vote_score_list = []
for para_obj in paragraph_obj_list:
    para_obj.cal_relevance(model, query_word_list, idf_dict)
    para_obj.cal_entropy(idf_dict)
    para_obj.cal_semantic_pattern()
    para_obj.cal_format_pattern()
    para_obj.cal_pos_score()
    relevance_list.append(para_obj.relevance_score)
    entropy_list.append(para_obj.infor_entropy)
    vote_score_list.append(para_obj.vote_score)


for para_obj in paragraph_obj_list:
    para_obj.normalized(min(relevance_list), max(relevance_list), min(entropy_list), max(entropy_list), min(vote_score_list), max(vote_score_list))
    para_obj.cal_overall_score()

  if __name__ == '__main__':


In [158]:
paragraph_obj_list.sort(reverse=True)
print(len(paragraph_obj_list))
for i in range(5):
    print(" ".join(paragraph_obj_list[i].word_list))

33
I've used both but I have to agree with Justin Standard that you shouldn't really consider rewriting your existing tests to any new format Regardless of the decision it is pretty trivial to run both TestNG strives to be much more configurable than JUnit but in the end they both work equally well
However I don't believe if you are starting from scratch you should learn to develop models in Java While the underlying material is the same all you are doing is wasting time IMO learning language that is rarely used in the field and lacks sufficient support by most modern ML frameworks Also while this is rather subjective python feels better when used as data science language Working with data especially is where that difference is clear to me
I'm going to go ahead and disagree with the others From an academic perspective for AI or any CS related assignment Java will always have much more benefit as you will get to write the actual code instead of using libraries others have already writte

In [159]:
class Element():
    def __init__(self, val, pos_i, pos_j):
        self.val = val
        self.pos_i = pos_i
        self.pos_j =pos_j
    
    def __gt__(self, other):
        return self.val > other.val

In [160]:
n = 10
element_obj_list = []
for i in range(n):
    for j in range(i+1, n):
        sym_relevance = calculate_symmetric(model, paragraph_obj_list[i].word_list, paragraph_obj_list[j].word_list, idf_dict)
        element_obj_list.append(Element(sym_relevance, i, j))        

  if __name__ == '__main__':


In [161]:
element_obj_list.sort()
for i in range(len(element_obj_list)):
    print(str(i) + ":" + str(element_obj_list[i].val) + "(" + str(element_obj_list[i].pos_i) + "," + str(element_obj_list[i].pos_j) + ")")

0:0.23490978979040922(4,5)
1:0.25139217136315517(1,4)
2:0.28032632771295896(0,4)
3:0.2864641327637235(4,6)
4:0.28761822411811144(3,4)
5:0.29427786973334635(4,7)
6:0.29761951162680017(4,8)
7:0.3191869978002683(2,4)
8:0.3274806612288059(4,9)
9:0.34868076384507074(3,8)
10:0.36216607148235674(3,5)
11:0.3630996678739559(5,6)
12:0.38472380574134146(1,5)
13:0.38605428681932225(5,8)
14:0.39370728651541054(1,9)
15:0.40098928738812323(7,8)
16:0.41386331747150645(2,5)
17:0.41680630094761545(5,9)
18:0.417322425191211(2,3)
19:0.4181668706740461(8,9)
20:0.4241131744319436(1,3)
21:0.42470373909070513(0,5)
22:0.42600862712396975(0,3)
23:0.4351343034304094(0,8)
24:0.43743652663859606(6,8)
25:0.4388338127150432(3,7)
26:0.43980214085573965(7,9)
27:0.4405482179060748(2,6)
28:0.4485390380737897(0,1)
29:0.45689688146351365(2,9)
30:0.45931061873852425(3,9)
31:0.4612013383417487(1,6)
32:0.4638264515495382(1,7)
33:0.46431668095548806(2,8)
34:0.46517252224997135(1,2)
35:0.46743031313729727(3,6)
36:0.46840027269

In [162]:
mmr_return_set = set()

In [163]:
for i in range(len(element_obj_list)):
    mmr_return_set.add(element_obj_list[i].pos_i)
    mmr_return_set.add(element_obj_list[i].pos_j)
    if len(mmr_return_set) >= 5:
        break
print(mmr_return_set)

{0, 1, 4, 5, 6}


In [164]:
for i in mmr_return_set:
    print(" ".join(paragraph_obj_list[i].word_list))

I've used both but I have to agree with Justin Standard that you shouldn't really consider rewriting your existing tests to any new format Regardless of the decision it is pretty trivial to run both TestNG strives to be much more configurable than JUnit but in the end they both work equally well
However I don't believe if you are starting from scratch you should learn to develop models in Java While the underlying material is the same all you are doing is wasting time IMO learning language that is rarely used in the field and lacks sufficient support by most modern ML frameworks Also while this is rather subjective python feels better when used as data science language Working with data especially is where that difference is clear to me
Why not overload the getEmployeeName method
I'd say you're probably fine with util.logging for the needs you describe
TestNG has neat feature where you can mark tests as particular group and then easily run all tests of specific group or exclude tests o