In [154]:

#Converting categorical data into numbers with Pandas and Scikit-learn
#feature extraction. 
#When it involves a lot of manual work, this is often referred to as feature engineering.

In [155]:
import numpy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [156]:
content= ["hello How ARE YOU there", "How is  ARE everything everyting","I am not sure how this is possible", "How are you"]

In [157]:
vectorizer = CountVectorizer(min_df=1)
X_train = vectorizer.fit_transform(content)

In [158]:
vectorizer.get_feature_names()

[u'am',
 u'are',
 u'everything',
 u'everyting',
 u'hello',
 u'how',
 u'is',
 u'not',
 u'possible',
 u'sure',
 u'there',
 u'this',
 u'you']

In [159]:
print X_train.shape
print X_train.toarray()

(4, 13)
[[0 1 0 0 1 1 0 0 0 0 1 0 1]
 [0 1 1 1 0 1 1 0 0 0 0 0 0]
 [1 0 0 0 0 1 1 1 1 1 0 1 0]
 [0 1 0 0 0 1 0 0 0 0 0 0 1]]


In [160]:
new_post = "How ARE YOU there"
new_post_vec = vectorizer.transform([new_post])
print new_post_vec

  (0, 1)	1
  (0, 5)	1
  (0, 10)	1
  (0, 12)	1


In [161]:
print new_post_vec.toarray()

[[0 1 0 0 0 1 0 0 0 0 1 0 1]]


In [162]:
#Similarity Calculations; Calculate Eculidean Distance between the count vectors of the new post and ll the old posts as below:

In [163]:
import scipy as sp
def dist_raw(v1,v2):
    delta= v1-v2
    return sp.linalg.norm(delta.toarray()) #norm() calculates the Eculidean norm i.e. shortest distance

def dist_norm(v1,v2):
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normalized = v2/sp.linalg.norm(v2.toarray())
    delta= v1_normalized-v2_normalized
    return sp.linalg.norm(delta.toarray()) #norm() calculates the Eculidean norm i.e. shortest distance

In [186]:
import sys
best_doc = None
print best_dist
best_i = None
num_samples = len(content)

def best_match(X_train,new_post_vec):
    best_dist = sys.maxint
    for i in range(0, num_samples):
        post = content[i]
        if post == new_post:
            continue
        post_vec = X_train.getrow(i)
        #d = dist_raw(post_vec, new_post_vec)
        d = dist_norm(post_vec, new_post_vec)
        print "===Post %i with dist = %.2f: %s"%(i,d,post)
        if d< best_dist:
            best_dist = d
            best_i = i
    print "Best post is %i with dist = %.4f"%(best_i,best_dist)
best_match(X_train, new_post_vec)

9223372036854775807
===Post 0 with dist = 0.46: hello How ARE YOU there
===Post 1 with dist = 1.05: How is  ARE everything everyting
===Post 2 with dist = 1.27: I am not sure how this is possible
===Post 3 with dist = 0.52: How are you
Best post is 0 with dist = 0.4595


In [165]:
print X_train.getrow(0).toarray()
print new_post_vec.toarray()

[[0 1 0 0 1 1 0 0 0 0 1 0 1]]
[[0 1 0 0 0 1 0 0 0 0 1 0 1]]


In [187]:
#Removing less important words
#Remove more frequent words that do not help to distinguish netween different texts. 
#MODIFY YOUR Vectorizer

vectorizer2 = CountVectorizer(min_df =1, stop_words='english')
sorted(vectorizer2.get_stop_words())[0:10]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost']

In [188]:
X_train2 = vectorizer2.fit_transform(content)
vectorizer2.get_feature_names()
new_post_vec2 = vectorizer2.transform([new_post])
best_match(X_train2, new_post_vec2)

===Post 0 with dist = 1.00: hello How ARE YOU there
===Post 1 with dist = 1.00: How is  ARE everything everyting
===Post 2 with dist = 1.00: I am not sure how this is possible
===Post 3 with dist = 0.00: How are you
Best post is 3 with dist = 0.0000


In [173]:
# Use NLTK to reduce words to their stem i.e. origin
import nltk.stem
s= nltk.stem.SnowballStemmer('english')
s.stem("graphics")

u'graphic'

In [189]:
'''Use StemmedCountVectorizer to do:
1. lower casing the raw post in the preprossing step done in parent calss.
2. Extracting all individual words in the tokenization step in parent class.
3. Converting each word into its stemmed version.'''

class StemmedCountVectorizer(CountVectorizer):
    english_stemmer = nltk.stem.SnowballStemmer('english')
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer,self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


FootNotes:
    
    What does a rater sees when he rates an android app? == Extrinsic Features
    What an android app inherits that influences app rating? == Intrinsic Features
    
    
    Vectors to predict: 1. 5-star count, 4-star count, 3-star-count, 2-star count, 1-star count.
    Because, average app-rating depends upon the values of these values. Also on current rating of the app.
    