In [1]:
import string
import pandas as pd
from scipy.sparse import csr_matrix

Source: 

- [How to remove punctuation marks from a string in Python 3.x using .translate()?
](https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate)
- [List of English Stop Words](http://xpo6.com/list-of-english-stop-words/) though I have not used them yet
- [Udacity Bag of Words Tutorial](https://www.youtube.com/watch?v=NXbR9GQbtnk&t=0s&list=PLAwxTw4SYaPkQXg8TkVdIvYv4HfLG7SiH&index=317)

In [2]:
def remove_punctuations(sentence):
    translator = sentence.maketrans('', '', string.punctuation)
    return sentence.translate(translator)

remove_punctuations('Hello it\,s me!')

'Hello its me'

In [227]:
stop_words = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]

# Bag of Words

In [228]:
class BagOfWords:
    def _add_to_vocabulary(self, words):
        for i in words:
            self.vocabulary[i] = self.word_index
            self.word_index += 1
    
    def get_words(self, sentence):
        return remove_punctuations(sentence).lower().split(' ')
    
    def fit(self, text):
        self.vocabulary = {}
        self.word_index = 0
        for i in text:
            words = self.get_words(i)
            words_not_captured = [i for i in words if i not in self.vocabulary]
            self._add_to_vocabulary(words_not_captured)
        
    def transform(self, text):
        word_matrix = []
        for i in range(len(text)):
            words = self.get_words(text[i])
            
            unique_words = set(words)
        
            for j in unique_words:
                count = words.count(j)
                word_matrix.append([i,self.vocabulary[j],count])
        
        return pd.DataFrame(word_matrix, columns = ['sentence','word','occurance'])

In [229]:
sentences = ['Hello World!','Random Words!','Harry Potter and the Chamber of Secrets','Harry Potter and the goblet of fire','Harry Potter and chamber of secrets','Mad Mad Mad Mad World']

In [230]:
engine = BagOfWords()
engine.fit(sentences)

In [188]:
engine.vocabulary

{'and': 6,
 'chamber': 8,
 'fire': 12,
 'goblet': 11,
 'harry': 4,
 'hello': 0,
 'mad': 16,
 'of': 9,
 'potter': 5,
 'random': 2,
 'secrets': 10,
 'the': 7,
 'words': 3,
 'world': 1}

In [231]:
engine.transform(sentences)

Unnamed: 0,sentence,word,occurance
0,0,0,1
1,0,1,1
2,1,3,1
3,1,2,1
4,2,5,1
5,2,8,1
6,2,7,1
7,2,10,1
8,2,6,1
9,2,9,1


# Working with Dataframe

In [190]:
df_sentences = pd.DataFrame(sentences)
print(df_sentences)
df_sentences[0].values.tolist()

                                         0
0                             Hello World!
1                            Random Words!
2  Harry Potter and the Chamber of Secrets
3      Harry Potter and the goblet of fire
4      Harry Potter and chamber of secrets
5                    Mad Mad Mad Mad World


['Hello World!',
 'Random Words!',
 'Harry Potter and the Chamber of Secrets',
 'Harry Potter and the goblet of fire',
 'Harry Potter and chamber of secrets',
 'Mad Mad Mad Mad World']

In [191]:
engine = BagOfWords()
engine.fit(df_sentences[0].values.tolist())

### Dict to Pandas

In [192]:
vocabulary = engine.vocabulary

In [193]:
b = pd.DataFrame([vocabulary.keys(),vocabulary.values()]).transpose()
b.columns = ['word','value']
b

Unnamed: 0,word,value
0,hello,0
1,world,1
2,random,2
3,words,3
4,harry,4
5,potter,5
6,and,6
7,the,7
8,chamber,8
9,of,9


In [194]:
word_vector = engine.transform(sentences)

In [195]:
print(word_vector)

    sentence  word  occurance
0          0     0          1
1          0     1          1
2          1     3          1
3          1     2          1
4          2     5          1
5          2     8          1
6          2     7          1
7          2    10          1
8          2     6          1
9          2     9          1
10         2     4          1
11         3     5          1
12         3    12          1
13         3    11          1
14         3     7          1
15         3     6          1
16         3     9          1
17         3     4          1
18         4     5          1
19         4     8          1
20         4    10          1
21         4     6          1
22         4     9          1
23         4     4          1
24         5    16          4
25         5     1          1


# Convert into sparse matrix for computations

In [196]:
sparse_matrix = csr_matrix((word_vector['occurance'],(word_vector['sentence'],word_vector['word'])))
print(sparse_matrix)

  (0, 0)	1
  (0, 1)	1
  (1, 2)	1
  (1, 3)	1
  (2, 4)	1
  (2, 5)	1
  (2, 6)	1
  (2, 7)	1
  (2, 8)	1
  (2, 9)	1
  (2, 10)	1
  (3, 4)	1
  (3, 5)	1
  (3, 6)	1
  (3, 7)	1
  (3, 9)	1
  (3, 11)	1
  (3, 12)	1
  (4, 4)	1
  (4, 5)	1
  (4, 6)	1
  (4, 8)	1
  (4, 9)	1
  (4, 10)	1
  (5, 1)	1
  (5, 16)	4


# For Similarity Between Sentences

In [197]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [198]:
similarity = cosine_similarity(sparse_matrix)
similarity

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.17149859],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  1.        ,  0.71428571,  0.9258201 ,
         0.        ],
       [ 0.        ,  0.        ,  0.71428571,  1.        ,  0.6172134 ,
         0.        ],
       [ 0.        ,  0.        ,  0.9258201 ,  0.6172134 ,  1.        ,
         0.        ],
       [ 0.17149859,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ]])

Make Diagonal Elements Zero because they are always similar

In [222]:
np.fill_diagonal(similarity,0)
similarity

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.17149859],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.71428571,  0.9258201 ,
         0.        ],
       [ 0.        ,  0.        ,  0.71428571,  0.        ,  0.6172134 ,
         0.        ],
       [ 0.        ,  0.        ,  0.9258201 ,  0.6172134 ,  0.        ,
         0.        ],
       [ 0.17149859,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

In [223]:
similarity[similarity>0.5]

for i in range(len(similarity)):
    condition_check = similarity[i]>0.5
    print(i,np.where(condition_check)[0].tolist(),similarity[i][condition_check].tolist())

0 [] []
1 [] []
2 [3, 4] [0.7142857142857141, 0.9258200997725515]
3 [2, 4] [0.7142857142857141, 0.6172133998483676]
4 [2, 3] [0.9258200997725515, 0.6172133998483676]
5 [] []
