In [154]:

#Converting categorical data into numbers with Pandas and Scikit-learn
#feature extraction. 
#When it involves a lot of manual work, this is often referred to as feature engineering.

In [69]:
import numpy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [70]:
# Toy corpus of four short posts used to fit the vectorizers below. The mixed
# casing and the misspelt "everyting" are left untouched because the feature
# lists printed further down are derived from exactly these strings.
content= ["hello How ARE YOU there", "How is  ARE everything everyting","I am not sure how this is possible", "How are you"]

In [71]:
# Bag-of-words model; min_df=1 keeps every term that occurs in at least one document.
vectorizer = CountVectorizer(min_df=1)
# Learn the vocabulary from `content` and build the sparse document-term count matrix.
X_train = vectorizer.fit_transform(content)

In [72]:
# Display the learned vocabulary (lower-cased and sorted, as the output below shows).
# NOTE(review): newer scikit-learn releases expose this as get_feature_names_out();
# confirm against the installed version before re-running.
vectorizer.get_feature_names()

[u'am',
 u'are',
 u'everything',
 u'everyting',
 u'hello',
 u'how',
 u'is',
 u'not',
 u'possible',
 u'sure',
 u'there',
 u'this',
 u'you']

In [73]:
# Shape is (number of posts, vocabulary size).
print(X_train.shape)
# Densify only for display; this toy matrix is tiny.
print(X_train.toarray())

(4, 13)
[[0 1 0 0 1 1 0 0 0 0 1 0 1]
 [0 1 1 1 0 1 1 0 0 0 0 0 0]
 [1 0 0 0 0 1 1 1 1 1 0 1 0]
 [0 1 0 0 0 1 0 0 0 0 0 0 1]]


In [74]:
# Encode an unseen post with the vocabulary already learned by `vectorizer`:
# transform() is used here, not fit_transform(), so the columns line up with X_train.
new_post = "How ARE how YOU there"
new_post_vec = vectorizer.transform([new_post])
# Printing the sparse vector lists only its non-zero (row, column) entries.
print(new_post_vec)

  (0, 1)	1
  (0, 5)	2
  (0, 10)	1
  (0, 12)	1


In [161]:
# Dense view of the new post's count vector, one column per vocabulary term.
print(new_post_vec.toarray())

[[0 1 0 0 0 1 0 0 0 0 1 0 1]]


In [162]:
# Similarity calculations: compute the Euclidean distance between the count vector of the new post and those of all the old posts, as below:

In [163]:
import scipy as sp
def dist_raw(v1,v2):
    """Return the Euclidean distance between two sparse count vectors.

    v1, v2 -- sparse vectors of equal shape (anything supporting ``-`` and
              ``.toarray()``); the raw counts are compared without normalisation.
    """
    # Subtract while still sparse, then densify once for the norm computation.
    difference = (v1 - v2).toarray()
    return sp.linalg.norm(difference)

def dist_norm(v1,v2):
    """Return the Euclidean distance between two sparse vectors after scaling each to unit length.

    v1, v2 -- sparse vectors of equal shape (supporting ``/``, ``-`` and ``.toarray()``).

    A vector whose norm is 0 has no direction and is left unscaled instead of
    being divided by zero, so the result is always finite: 1.0 against any
    non-zero vector and 0.0 against another zero vector.
    """
    def _unit(vec):
        # Scale to unit length; skip the division for an all-zero vector, where
        # it would produce an inf factor (and NaN for explicitly stored zeros).
        length = sp.linalg.norm(vec.toarray())
        return vec / length if length > 0 else vec

    delta = _unit(v1) - _unit(v2)
    return sp.linalg.norm(delta.toarray())  # norm() is the Euclidean norm, i.e. the straight-line distance

In [76]:
import sys
# Module-level placeholders. NOTE(review): best_match() below assigns its own
# local `best_i`, so these two are never updated by it; confirm they are still needed.
best_doc = None
best_i = None
# Number of posts in the toy corpus; best_match() reads this global as its loop bound.
num_samples = len(content)

def best_match(X_train,new_post_vec):
    """Print the normalised distance from ``new_post_vec`` to each training post, then the closest one.

    Reads the module-level ``content``, ``new_post`` and ``num_samples`` and
    calls the module-level ``dist_norm``.

    X_train      -- sparse document-term matrix; row ``i`` is paired with ``content[i]``.
    new_post_vec -- sparse one-row vector of the post being matched.

    Returns None; all results are printed.
    """
    # Bind both trackers up front so the summary below cannot hit an unbound
    # local when every post is skipped or no distance compares as smaller.
    best_i = None
    best_dist = float("inf")
    for i in range(num_samples):
        post = content[i]
        if post == new_post:
            # An exact copy of the query would trivially win; leave it out.
            continue
        # dist_raw(...) can be substituted here to compare un-normalised counts.
        d = dist_norm(X_train.getrow(i), new_post_vec)
        print("===Post %i with dist = %.2f: %s" % (i, d, post))
        if d < best_dist:
            best_dist = d
            best_i = i
    if best_i is None:
        print("No comparable post found")
        return
    print("Best post is %i with dist = %.4f" % (best_i, best_dist))
# Rank the toy corpus against the new post using the plain count vectors.
best_match(X_train, new_post_vec)

===Post 0 with dist = 0.56: hello How ARE YOU there
===Post 1 with dist = 0.99: How is  ARE everything everyting
===Post 2 with dist = 1.20: I am not sure how this is possible
===Post 3 with dist = 0.50: How are you
Best post is 3 with dist = 0.5042


In [77]:
# Show the count vector of post 0 and of the new post one above the other.
print(X_train.getrow(0).toarray())
print(new_post_vec.toarray())

[[0 1 0 0 1 1 0 0 0 0 1 0 1]]
[[0 1 0 0 0 2 0 0 0 0 1 0 1]]


In [78]:
# Removing less important words
# Remove very frequent words that do not help to distinguish between different texts.
# A second vectorizer that drops scikit-learn's built-in English stop words:
vectorizer2 = CountVectorizer(min_df =1, stop_words='english')
# Peek at the first ten stop words in alphabetical order.
sorted(vectorizer2.get_stop_words())[0:10]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost']

In [79]:
# Re-fit on the same corpus with stop words removed, then re-run the match.
X_train2 = vectorizer2.fit_transform(content)
# NOTE(review): the result of this call is discarded (it is not the last expression of the cell).
vectorizer2.get_feature_names()
new_post_vec2 = vectorizer2.transform([new_post])
# NOTE(review): every word of new_post appears to be a stop word, so new_post_vec2 is
# presumably all zeros and the distances printed below carry little information; confirm.
best_match(X_train2, new_post_vec2)

===Post 0 with dist = 1.00: hello How ARE YOU there
===Post 1 with dist = 1.00: How is  ARE everything everyting
===Post 2 with dist = 1.00: I am not sure how this is possible
===Post 3 with dist = 0.00: How are you
Best post is 3 with dist = 0.0000


In [80]:
# Use NLTK to reduce words to their stem, i.e. their root form
import nltk.stem
s= nltk.stem.SnowballStemmer('english')
# Quick check of the stemmer on a single word.
s.stem("graphics")

u'graphic'

In [83]:
'''Use StemmedCountVectorizer to do:
1. Lower-casing the raw post in the preprocessing step (done in the parent class).
2. Extracting all individual words in the tokenization step (done in the parent class).
3. Converting each word into its stemmed version.'''
# Module-level stemmer, kept so that any code referring to this name keeps working.
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer also reduces every token to its Snowball stem.

    The parent class's analyzer runs first; each token it yields is then
    passed through the English Snowball stemmer.
    """
    # One stemmer shared by all instances of the class.
    english_stemmer = nltk.stem.SnowballStemmer('english')
    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens."""
        analyzer = super(StemmedCountVectorizer,self).build_analyzer()
        # Bind the class-level stemmer explicitly. A bare `english_stemmer` inside
        # the lambda would resolve to the module global rather than this class
        # attribute, leaving the attribute unused and the class dependent on the global.
        stem = self.english_stemmer.stem
        return lambda doc: (stem(w) for w in analyzer(doc))

In [92]:
# Fit the stemming vectorizer (with English stop words removed) on the toy corpus.
stem_vectorizer = StemmedCountVectorizer(min_df =1, stop_words='english')
X_train3 = stem_vectorizer.fit_transform(content)
print(stem_vectorizer.get_feature_names())
print(X_train3.toarray())

# Encode the new post with the stemmed vocabulary and rank the corpus against it.
new_post_vec3 = stem_vectorizer.transform([new_post])
best_match(X_train3, new_post_vec3)
# A single formatted string prints the same line as the old two-item print statement.
print("new post: %s" % new_post)

[u'everyt', u'hello', u'possibl', u'sure']
[[0 1 0 0]
 [1 0 0 0]
 [0 0 1 1]
 [0 0 0 0]]
===Post 0 with dist = 1.00: hello How ARE YOU there
===Post 1 with dist = 1.00: How is  ARE everything everyting
===Post 2 with dist = 1.00: I am not sure how this is possible
===Post 3 with dist = 0.00: How are you
Best post is 3 with dist = 0.0000
new post: How ARE how YOU there


FootNotes:
    
    What does a rater see when rating an Android app? == Extrinsic Features
    What does an Android app inherit that influences its rating? == Intrinsic Features
    
    
    Vectors to predict: 5-star count, 4-star count, 3-star count, 2-star count, 1-star count.
    This is because the average app rating depends on these counts, and also on the current rating of the app.
    

In [56]:
#Read CSV
import pandas as pd
from pandas import *
from numpy import *
import numpy as np
import os
from pandas import DataFrame
import numpy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp
# Use NLTK to reduce words to their stem i.e. origin
import nltk.stem
# Use NLTK to reduce words to their stem i.e. origin
import nltk.stem

In [107]:
# Location of the app data CSV, given relative to the notebook's working directory.
app_file = '../data/big-data-csv.csv'
appdf = pd.read_csv(app_file,sep=',')
# Preview the first two rows.
appdf.head(2)

Unnamed: 0.1,Unnamed: 0,Category,Score,Description,Price,PublicationDate,AppSize,Name,ContentRating,LastUpdateDate,Instalations,IsTopDeveloper,HaveInAppPurchases,IsFree,Developer
0,0,NEWS_AND_MAGAZINES,5.0,Read the most popular newspapers from Sweden ...,0,2015-07-08T03:00:00.000Z,2.9,Sweden News,Everyone 10+,2015-07-08T03:00:00.000Z,50 - 100,False,False,True,News Now
1,1,MEDIA_AND_VIDEO,2.882353,Sweden Tv channels guide. Tv Sweden include lo...,0,2015-07-25T03:00:00.000Z,2.8,Tv Sweden,Everyone,2015-07-25T03:00:00.000Z,"5,000 - 10,000",False,False,True,QSC





In [7]:
# Keep the 'Category' column as its own Series; the second best_match() further down reads it as a global.
col_cat = appdf.Category
col_cat.head(2)

0    NEWS_AND_MAGAZINES
1       MEDIA_AND_VIDEO
Name: Category, dtype: object

In [4]:
# List the column names of the loaded frame.
appdf.columns.values.tolist()

['Unnamed: 0',
 'Category',
 'Score',
 'Description',
 'Price',
 'PublicationDate',
 'AppSize',
 'Name',
 'ContentRating',
 'LastUpdateDate',
 'Instalations',
 'IsTopDeveloper',
 'HaveInAppPurchases',
 'IsFree',
 'Developer']

In [8]:
# NOTE(review): the count computed on the next line is discarded; only the last
# expression of a cell is displayed, which is why just the array appears below.
len(col_cat.unique())
col_cat.unique()

array(['NEWS_AND_MAGAZINES', 'MEDIA_AND_VIDEO', 'ENTERTAINMENT', 'FINANCE',
       'MUSIC_AND_AUDIO', 'TRAVEL_AND_LOCAL', 'EDUCATION', 'BUSINESS',
       'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'SOCIAL',
       'COMMUNICATION', 'PHOTOGRAPHY', 'LIFESTYLE', 'HEALTH_AND_FITNESS',
       'TOOLS', 'PRODUCTIVITY', 'WEATHER', 'BOOKS_AND_REFERENCE',
       'GAME_TRIVIA', 'MEDICAL', 'GAME_PUZZLE', 'GAME_CASUAL', 'SHOPPING',
       'GAME_MUSIC', 'GAME_ACTION', 'GAME_ARCADE', 'GAME_SIMULATION',
       'GAME_CARD', 'GAME_CASINO', 'LIBRARIES_AND_DEMO',
       'GAME_EDUCATIONAL', 'GAME_SPORTS', 'GAME_WORD', 'GAME_RACING',
       'GAME_ROLE_PLAYING', 'GAME_BOARD', 'COMICS', 'GAME_STRATEGY',
       'GAME_ADVENTURE'], dtype=object)

In [87]:
'''Use StemmedCountVectorizer to do:
1. Lower-casing the raw post in the preprocessing step (done in the parent class).
2. Extracting all individual words in the tokenization step (done in the parent class).
3. Converting each word into its stemmed version.'''
import nltk.stem
s= nltk.stem.SnowballStemmer('english')
# NOTE(review): the stemmed word is neither stored nor displayed here, because
# this is not the last expression of the cell.
s.stem("graphics")
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems each token produced by the parent analyzer."""
    def build_analyzer(self):
        """Return a callable that maps a document to a generator of stemmed tokens."""
        base_analyzer = super(StemmedCountVectorizer,self).build_analyzer()
        # A fresh English Snowball stemmer is created for every analyzer that is built.
        stemmer = nltk.stem.SnowballStemmer('english')

        def stemmed_tokens(doc):
            # Tokenise with the parent's analyzer, then stem lazily token by token.
            return (stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens

In [124]:
def vectorize_column(dataframe,column_name,vectorizer=None):
    """Vectorize one text column of ``dataframe`` into a bag-of-words feature table.

    dataframe   -- pandas DataFrame holding the column.
    column_name -- name of the column to vectorize.
    vectorizer  -- object offering ``fit_transform`` and ``get_feature_names``;
                   a ``CountVectorizer(min_df=1)`` is created when omitted.

    Returns ``(feature_dataframe, feature_matrix, fitted_vectorizer)``, or None
    (after printing "No column found") when ``column_name`` is not a column.
    """
    if vectorizer is None:
        print("No Vectorizer is explicitly specified. Using CountVectorizer as default one. ")
        column_vectorizer = CountVectorizer(min_df=1)
    else:
        column_vectorizer = vectorizer
    if column_name not in dataframe.columns:
        print("No column found")
        # Made explicit: callers that unpack three values must check for None first.
        return None
    fmatrix = column_vectorizer.fit_transform(dataframe[column_name])
    # Ask for the vocabulary once and reuse it for both the printout and the column labels.
    feature_names = column_vectorizer.get_feature_names()
    print(feature_names)
    # NOTE(review): toarray() materialises a dense rows-by-vocabulary array, which can
    # be very large for free-text columns.
    dataframe_f = pd.DataFrame(fmatrix.toarray(), columns=feature_names)
    # shape gives (rows, columns) directly and also works for an empty frame.
    print("formed dataframe of size:( %s , %s )" % dataframe_f.shape)
    return dataframe_f, fmatrix, column_vectorizer


In [111]:
#column_vectorizer = CountVectorizer(min_df=1)
#column_vectorizer = CountVectorizer(min_df =1, stop_words='english')
# Vectorize the 'Category' column with the stemming vectorizer.
stem_vectorizer = StemmedCountVectorizer(min_df =1, stop_words='english')
newfeature, fmatrix, column_vectorizer = vectorize_column(appdf, 'Category', stem_vectorizer)
#print column_vectorizer.get_feature_names()
# Preview the first five rows of the resulting feature table.
newfeature.head(5)

n_samples in column: 100000
vectorized into matrix of shape (100000, 41)
formed dataframe of size:( 100000 , 41 )


Unnamed: 0,books_and_refer,busi,comic,communic,educ,entertain,financ,game_act,game_adventur,game_arcad,...,person,photographi,product,shop,social,sport,tool,transport,travel_and_loc,weather
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
def dist_norm(v1,v2):
    """Return the Euclidean distance between two sparse vectors after scaling each to unit length.

    v1, v2 -- sparse vectors of equal shape (supporting ``/``, ``-`` and ``.toarray()``).

    A vector whose norm is 0 has no direction and is left unscaled instead of
    being divided by zero, so the result is always finite: 1.0 against any
    non-zero vector and 0.0 against another zero vector.
    """
    def _unit(vec):
        # Scale to unit length; skip the division for an all-zero vector, where
        # it would produce an inf factor (and NaN for explicitly stored zeros).
        length = sp.linalg.norm(vec.toarray())
        return vec / length if length > 0 else vec

    delta = _unit(v1) - _unit(v2)
    return sp.linalg.norm(delta.toarray())  # norm() is the Euclidean norm, i.e. the straight-line distance

#### Analysis of the 'Category' data column
Each application has exactly one category, so every category is equidistant from all the others.
Although every pair of categories is equally similar, the category name itself might not affect the rating equally.
That is why the categories are included as training features.

In [94]:
import sys
# NOTE(review): best_match() below assigns a local `best_i`, so neither of these
# module-level placeholders is updated by it; confirm they are still needed.
best_doc = None
best_i = None

def best_match(column_vectorizer, fmatrix,text_to_compare, n_samples=100):
    """Print the normalised distance from a query text to the first rows of ``fmatrix``, then the closest row.

    Reads the module-level ``col_cat`` for the text of each row and calls the
    module-level ``dist_norm``.

    column_vectorizer -- fitted vectorizer used to encode the query.
    fmatrix           -- sparse feature matrix; row ``i`` is paired with ``col_cat[i]``.
    text_to_compare   -- one-element list holding the query text.
    n_samples         -- maximum number of leading rows to compare (default 100,
                         the value that used to be hard-coded); it is capped at the
                         number of rows in ``fmatrix`` so smaller matrices do not overrun.

    Returns None; all results are printed.
    """
    # Bind both trackers up front so the summary below cannot hit an unbound
    # local when every row is skipped or no distance compares as smaller.
    best_i = None
    best_dist = float("inf")
    vect_to_compare = column_vectorizer.transform(text_to_compare)
    for i in range(min(n_samples, fmatrix.shape[0])):
        text_in_column = col_cat[i]
        if text_in_column == text_to_compare[0]:
            # Rows whose text equals the query would trivially win; leave them out.
            continue
        d = dist_norm(fmatrix.getrow(i), vect_to_compare)
        print("===Category of app- %i with dist = %.2f: %s" % (i, d, text_in_column))
        if d < best_dist:
            best_dist = d
            best_i = i
    if best_i is None:
        print("No comparable text found")
        return
    print("Best text in category is %i with dist = %.4f" % (best_i, best_dist))
# best_match() indexes its query with [0] and hands it to transform(), so the
# query is wrapped in a one-element list; this prints the wrapper's type.
print(type([col_cat[4]]))
#best_match(column_vectorizer,fmatrix, [col_cat[4]])

100000
<type 'list'>


#### Analysis of the Description field


In [112]:
#column_vectorizer = CountVectorizer(min_df=1)
#column_vectorizer = CountVectorizer(min_df =1, stop_words='english')
# NOTE(review): the recorded output of this cell is a MemoryError. vectorize_column()
# densifies the term matrix with toarray(), which is presumably what exhausts memory
# for the free-text descriptions; confirm before re-running on the full data.
stem_vectorizer = StemmedCountVectorizer(min_df =1, stop_words='english')
newfeature, fmatrix, column_vectorizer = vectorize_column(appdf, 'Description', stem_vectorizer)
#print column_vectorizer.get_feature_names()
newfeature.head(5)

n_samples in column: 100000
vectorized into matrix of shape

MemoryError: 

#### Analysis of Name Field
    Suggest some prices for a higher number of sales
    w1: parameterized loudness of words in context

In [137]:
col_name = appdf.Name
# Show the app names that contain the substring "000".
print(col_name[col_name.str.contains('000')])


#stem_vectorizer = StemmedCountVectorizer(min_df =1, stop_words='english')
#newfeature, fmatrix, column_vectorizer = vectorize_column(appdf, 'Name', stem_vectorizer)
#print column_vectorizer.get_feature_names()
#newfeature.head(5)


1000     5000+ Cute Love SMS Collection
1003      30000+ Funny Jokes Collection
1007     90000+ SMS Messages Collection
1008     75000+ SMS Messages Collection
1578             DuPont™ Tychem® 4000 S
2591        Learn Ukrainian 6,000 Words
3945                      Sudoku 10'000
3948                 Sudoku 10'000 Free
3953                 Sudoku 10'000 Plus
4350                     1000 Aventuras
6369     フルル大辞典 ～即引き！略語・用語・薬辞典 10,000語～
6401                 MedCalc 3000 中文精华版
7228        プリンス育成☆マジLOVE3000％ for うたプリ
7322                20000 Leagues Slots
9173             Indian Recipes 10.000+
9320      2000 Piadas Engraçadas Brasil
10048    50000 Status Quotes Collection
10251                      鬼監督の1000本ノック
11380             1000 Recipes in Hindi
12334           Desert Race Toyota 1000
13523                     Taxxi 4209000
13630                مسجات +15000 رسالة
13702                    Solitaire 1000
14624         Learn Chinese 6,000 Words
14627      Learn Chinese 10000 Mandarin


#### Analysis of 'Instalations'
    The values are ranges, e.g. "50 - 100".

In [225]:
# NOTE(review): nothing later in this cell reads this module-level name;
# separate_instalation_column() looks the column up itself.
col_name = appdf.Instalations
def separate_instalation_column(dataframe, column_name,return_data_type_as=None):
    """Split a range column such as "5,000 - 10,000" into its lower and upper bounds.

    dataframe           -- DataFrame holding the column.
    column_name         -- name of a string column of the form "<low> - <high>".
    return_data_type_as -- pass numpy's ``float64`` to get float Series with missing
                           values replaced by 0.0; any other value returns the
                           cleaned strings unchanged.

    Returns ``(ls, hs)``: two Series holding the lower and the upper bound.
    """
    # Read from the frame that was passed in. Previously the argument was ignored
    # and the global `appdf` was used, so the function only worked on that one frame.
    parts = dataframe[column_name].str.split('-')  # split once, reuse for both bounds
    ls = parts.str.get(0).str.strip(' ').str.replace(',','')
    hs = parts.str.get(1).str.strip(' ').str.replace(',','')

    # Identity check against numpy's float64 type object, as before.
    if return_data_type_as is float64:
        ls = ls.astype(float).fillna(0.0)
        hs = hs.astype(float).fillna(0.0)
    return ls, hs
    
ls, hs = separate_instalation_column(appdf,'Instalations', float64)
# Use item assignment to add the columns: `appdf.installs_ls = ls` only sets a
# Python attribute on the object and does not create a DataFrame column.
appdf['installs_ls'] = ls
appdf['installs_hs'] = hs
# Element-wise sum of the lower and upper bounds for the first five apps.
print(appdf.installs_ls.head(5) + appdf.installs_hs.head(5))

0        150
1      15000
2       6000
3    1500000
4       6000
dtype: float64
