In [242]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd


def convert_tag(tag):
    """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
    
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    try:
        return tag_dict[tag[0]]
    except KeyError:
        return None


def doc_to_synsets(doc):
    """
    Returns a list of synsets in document.

    Tokenizes and tags the words in the document doc.
    Then finds the first synset for each word/tag combination.
    If a synset is not found for that combination it is skipped.

    Args:
        doc: string to be converted

    Returns:
        list of synsets

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
    """
    

    # Your Code Here
    word = nltk.word_tokenize(doc)
    pos_tag = nltk.pos_tag(word)

    syn = []
    for token, wordnet_tag in pos_tag:
        wordnet_tag = convert_tag(pos_tag)
        synsets = wn.synsets(token,wordnet_tag)
        if len(synsets)!=0:
            syn.append(synsets[0])
    
    return syn


def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest similarity value.
    Sum of all of the largest similarity values and normalize this value by dividing it by the
    number of largest similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    
    # Your Code Here
    sim = []
    for item in s1:
        sim.append(max([item.path_similarity(items) for items in s2 if item.path_similarity(items) is not None]),default=0)
    
    return sum(sim)/len(sim)



def document_path_similarity(doc1, doc2):
    """Finds the symmetrical similarity between doc1 and doc2"""

    synsets1 = doc_to_synsets(doc1)
    synsets2 = doc_to_synsets(doc2)

    return (similarity_score(synsets1, synsets2) + similarity_score(synsets2, synsets1)) / 2

In [235]:
from nltk.corpus import wordnet as wn

s1 = doc_to_synsets('I like dogs')
s2 = doc_to_synsets('I like cats')
sim = []

for items in s1: 
    all_sim = []
    for items in s2:
        simi = item.path_similarity(items)
        if simi is not None: 
            all_sim.append(simi)
    if len(all_sim) !=0:
        sim.append(max(all_sim))

                
                
            
                       
           


[0.2, 0.2, 0.2]

In [141]:
phrase = pd.read_csv('/Users/emilyvincett/downloads/paraphrases.csv')
phrase

Unnamed: 0,Quality,D1,D2
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an..."
1,1,After more than two years' detention under the...,After more than two years in detention by the ...
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec..."
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H..."
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...
5,1,Their difference was over whether the court sh...,Their difference was over whether the court sh...
6,1,The only announced Republican to replace Davis...,So far the only declared major party candidate...
7,1,"Druce will face murder charges, Conte said.",Conte said Druce will be charged with murder.\n
8,0,"""It's a major victory for Maine, and it's a ma...",The Maine program could be a model for other s...
9,1,Microsoft said Friday that it is halting devel...,Microsoft will stop developing versions of its...


In [243]:
def most_similar_docs():
    simi = []
    for i in phrase.index:
        similarity = document_path_similarity(phrase['D1'][i],phrase['D2'][i])
        simi.append(similarity)
    phrase['similarity'] = simi
    highest = phrase[phrase['similarity']==phrase['similarity'].max()]
    return highest['D1'][phrase['similarity'].argmax()],highest['D2'][phrase['similarity'].argmax()],highest['similarity'][phrase['similarity'].argmax()]
most_similar_docs()

TypeError: append() takes no keyword arguments

In [5]:
def label_accuracy():
    
    from sklearn.metrics import accuracy_score
    simi = []
    for i in phrase.index:
        similarity = document_path_similarity(phrase['D1'][i],phrase['D2'][i])
        simi.append(similarity)
    phrase['similarity'] = simi
    phrase['classifier'] = np.where(phrase['similarity'] > 0.75,1,0)
    return accuracy_score(phrase['Quality'],phrase['classifier'])

label_accuracy()

0.7

In [76]:
import pickle
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Load the list of documents
with open('/Users/emilyvincett/downloads/newsgroups', 'rb') as f:
    newsgroup_data = pickle.load(f)

# Use CountVectorizor to find three letter tokens, remove stop_words, 
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df=20, max_df=0.2, stop_words='english', 
                       token_pattern='(?u)\\b\\w\\w\\w+\\b')
# Fit and transform
X = vect.fit_transform(newsgroup_data)

# Convert sparse matrix to gensim corpus.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Mapping from word IDs to words (To be used in LdaModel's id2word parameter)
id_map = dict((v, k) for k, v in vect.vocabulary_.items())


In [92]:
from gensim import corpora, models 
ldamodel = gensim.models.ldamodel.LdaModel(corpus,num_topics=10, id2word=id_map, passes=25, random_state=34)

In [112]:
def lda_topics():
    return ldamodel.show_topics(num_topics=10,num_words=10)
lda_topics()

[(0,
  '0.056*"edu" + 0.043*"com" + 0.033*"thanks" + 0.022*"mail" + 0.021*"know" + 0.020*"does" + 0.014*"info" + 0.012*"monitor" + 0.010*"looking" + 0.010*"don"'),
 (1,
  '0.024*"ground" + 0.018*"current" + 0.018*"just" + 0.013*"want" + 0.013*"use" + 0.011*"using" + 0.011*"used" + 0.010*"power" + 0.010*"speed" + 0.010*"output"'),
 (2,
  '0.061*"drive" + 0.042*"disk" + 0.033*"scsi" + 0.030*"drives" + 0.028*"hard" + 0.028*"controller" + 0.027*"card" + 0.020*"rom" + 0.018*"floppy" + 0.017*"bus"'),
 (3,
  '0.023*"time" + 0.015*"atheism" + 0.014*"list" + 0.013*"left" + 0.012*"alt" + 0.012*"faq" + 0.012*"probably" + 0.011*"know" + 0.011*"send" + 0.010*"months"'),
 (4,
  '0.025*"car" + 0.016*"just" + 0.014*"don" + 0.014*"bike" + 0.012*"good" + 0.011*"new" + 0.011*"think" + 0.010*"year" + 0.010*"cars" + 0.010*"time"'),
 (5,
  '0.030*"game" + 0.027*"team" + 0.023*"year" + 0.017*"games" + 0.016*"play" + 0.012*"season" + 0.012*"players" + 0.012*"win" + 0.011*"hockey" + 0.011*"good"'),
 (6,
  '0.0

In [12]:
new_doc = ["\n\nIt's my understanding that the freezing will start to occur because \
of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\nelliptical orbit. \
It is not due to shadowing effects. \n\n\nPluto can shadow Charon, and vice-versa.\n\nGeorge \
Krumins\n-- "]

In [113]:
def topic_distribution():
    Y = vect.transform(new_doc)
    corpus_2 = gensim.matutils.Sparse2Corpus(Y,documents_columns=False)

    ldamodel.show_topics(num_words=10) 
    return (ldamodel.get_document_topics(corpus_2))[0]
topic_distribution()

[(0, 0.020003106),
 (1, 0.02000333),
 (2, 0.020001281),
 (3, 0.49648553),
 (4, 0.020004045),
 (5, 0.020004135),
 (6, 0.020002974),
 (7, 0.020002646),
 (8, 0.020003127),
 (9, 0.3434898)]

In [130]:
labels = ['Health', 
          'Science', 
          'Automobiles', 
          'Politics', 
          'Government', 
          'Travel', 
          'Computers & IT', 
          'Sports', 
          'Business', 
          'Society & Lifestyle', 
          'Religion, Education']

topics = lda_topics()
results = []
for _,x in topics:
    sim = []
    for topic in labels:
        sim.append(document_path_similarity(x,topic))
    match = sorted(zip(sim,labels))[-1][1]
    results.append(match)
results

['Religion, Education',
 'Sports',
 'Sports',
 'Politics',
 'Automobiles',
 'Sports',
 'Religion, Education',
 'Society & Lifestyle',
 'Religion, Education',
 'Science']