# Measuring similarity between Text: Cosine Similarity Measure

### Installations
##### pip install nltk
##### pip install scikit-learn

### Preparing nltk

In [5]:
import nltk

# word_tokenize() needs the Punkt tokenizer data. Older NLTK releases read it
# from the 'punkt' package, while recent releases (3.9 and later) read it from
# 'punkt_tab' and raise LookupError when only 'punkt' is installed, so both
# packages are fetched. Packages that are already up to date are skipped.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aniket\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Importing other packages

In [15]:
import string
from sklearn.feature_extraction.text import TfidfVectorizer

### Cosine similarity function

In [16]:
# One Porter stemmer instance, shared by every call to stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()

# Translation table for str.translate(): a code point mapped to None is deleted,
# so every ASCII punctuation character is stripped from the text.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def stem_tokens(tokens):
    """Return a list holding the stem of each token, in the original order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, tokens))

# lowercase -> strip punctuation -> tokenize -> stem
def normalize(text):
    """Convert raw text into a list of stemmed word tokens.

    The text is lower-cased, punctuation characters are deleted through
    ``remove_punctuation_map``, the result is split with
    ``nltk.word_tokenize`` and each token is passed through ``stem_tokens``.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

# Shared TF-IDF vectorizer: tokens come from normalize(), English stop words are dropped.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    """Return the cosine similarity between two pieces of text.

    The module-level ``vectorizer`` is re-fitted on just this pair of texts on
    every call, so the TF-IDF weights depend only on ``text1`` and ``text2``.
    With TfidfVectorizer's default ``norm='l2'`` every row has unit length, so
    the dot product of the two rows is their cosine similarity.

    Parameters:
        text1: first text (str).
        text2: second text (str).

    Returns:
        The similarity score: entry [0, 1] of the 2x2 matrix of row dot products.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is matrix multiplication for both sparse matrices and sparse arrays
    # (`*` is element-wise on sparse arrays), and .toarray() is the documented
    # conversion available on both (the `.A` shorthand is not).
    return (tfidf @ tfidf.T).toarray()[0, 1]


### Code to load csv file 

In [17]:
# getRows() takes the name of a CSV file and returns the list of topics from the dataset.
import csv


def getRows(filename):
    """Read a CSV file and return the values of its "Topic" column.

    Parameters:
        filename: path of the CSV file; its first line must be a header row
            that contains a "Topic" column.

    Returns:
        list: the "Topic" value of every data row, in file order. Empty when
        the file has no data rows.

    Raises:
        KeyError: if the file has data rows but no "Topic" column.

    Side effects:
        Prints ``Processed N lines.`` where N counts the data rows plus the
        header line (0 when there are no data rows).
    """
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise be glued to the first header name and make the
    # "Topic" lookup fail. newline='' is what the csv module requires so that
    # line breaks inside quoted fields reach the parser unmodified.
    with open(filename, mode='r', encoding='utf-8-sig', errors='ignore', newline='') as csv_file:
        topics = [row["Topic"] for row in csv.DictReader(csv_file)]

    # The header is counted as a processed line once at least one row was read.
    line_count = len(topics) + 1 if topics else 0
    print(f'Processed {line_count} lines.')
    return topics



In [30]:
if __name__ == '__main__':
    print('Loading file...')
    # Topics are read from dataset.csv, resolved against the working directory.
    filename = 'dataset.csv'
    topic_list = getRows(filename)

    input_string = input('Enter the search string: ')

    # Map each topic's position in topic_list to its similarity with the query.
    cosine_list = {
        position: cosine_sim(topic, input_string)
        for position, topic in enumerate(topic_list)
    }
    # (position, score) pairs, best match first.
    sorted_list = sorted(cosine_list.items(), key=lambda pair: pair[1], reverse=True)

    # Report the ten highest-scoring topics next to their similarity score.
    print('Top 10 match results: ')
    header = "%15s" % ("Topic") + ":  %10s" % ("Cosine Sim Number")
    print(header)
    for position, score in sorted_list[:10]:
        print("%30s" % (topic_list[position]) + ':  %10s' % (str(score)))


Loading file...
Processed 44 lines.
Enter the search string: can you predict my house price?
Top 10 match results: 
          Topic:  Cosine Sim Number
        House Price Prediction:  1.0000000000000002
        Predicting House Price:  1.0000000000000002
Stock Price prediction using Knn:  0.3563004293331381
           Rainfall Prediction:  0.2605556710562624
       Stock Market Prediction:  0.20199309249791833
      Diabetes Risk Prediction:  0.20199309249791833
          Precting stock price:  0.20199309249791833
      Diabetes Risk Prediction:  0.20199309249791833
       Flight Delay Prediction:  0.20199309249791833
  IMDB Movie Rating Prediction:  0.17077611319011649
