### Importing libraries

In [11]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')

import seaborn as sns
color = sns.color_palette()
sns.set(rc={'figure.figsize':(25,15)})

import plotly as px
import plotly.graph_objs as go

import plotly.figure_factory as ff

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV


import math
import pickle
import re
from collections import Counter



import warnings
warnings.filterwarnings('ignore')


### loading datasets

In [12]:

# App metadata and app descriptions are read straight from the project's GitHub repository.
appDf = pd.read_csv(
    'https://raw.githubusercontent.com/abdulSalamKagaji97/AbdulSalamKagaji_DATA606/main/data/apps.csv'
)
appDescDf = pd.read_csv(
    'https://raw.githubusercontent.com/abdulSalamKagaji97/AbdulSalamKagaji_DATA606/main/data/AppDescriptions.csv'
)

# Join the two tables on the shared "App" (app name) column; pandas defaults to an inner join.
mergeDf = pd.merge(appDf, appDescDf, on="App")


### feature selection

In [13]:
# Keep only the columns used by the rest of the notebook, in display order.
selected_columns = [
    'App', 'Descriptions', 'Category', 'Rating', 'Reviews', 'Size',
    'Installs', 'Type', 'Price', 'Content Rating', 'Genres',
]
mergeDf = mergeDf[selected_columns]
mergeDf

Unnamed: 0,App,Descriptions,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,Filters for photos and videos Beautification t...,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design
1,Photo Editor & Candy Camera & Grid & ScrapBook,Filters for photos and videos Beautification t...,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design
2,Coloring book moana,Welcome this app for Coloring Book Encanto Car...,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play
3,Coloring book moana,Welcome this app for Coloring Book Encanto Car...,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play
4,"U Launcher Lite – FREE Live Cool Themes, Hide ...","Android themes & 4D Live wallpaper, Hide apps",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design
...,...,...,...,...,...,...,...,...,...,...,...
9786,Sya9a Maroc - FR,Moroccan driving licence tests en français and...,FAMILY,4.5,38,53.0,"5,000+",Free,0,Everyone,Education
9787,Fr. Mike Schmitz Audio Teachings,Download and Listen Fr. Mike Schmitz Audio Mes...,FAMILY,5.0,4,3.6,100+,Free,0,Everyone,Education
9788,Parkinson Exercices FR,Home Exercises for people with Parkinson's dis...,MEDICAL,,3,9.5,"1,000+",Free,0,Everyone,Medical
9789,The SCP Foundation DB fr nn5n,SCP Foundation: Idle Manager is a game that si...,BOOKS_AND_REFERENCE,4.5,114,,"1,000+",Free,0,Mature 17+,Books & Reference


### Cleaning Dataset

In [14]:
# Count the missing values in every column (isna is the same check as isnull).

mergeDf.isna().sum()


App                  0
Descriptions         0
Category             0
Rating            1468
Reviews              0
Size              1247
Installs             0
Type                 0
Price                0
Content Rating       0
Genres               0
dtype: int64

### pre-processing data values in the dataset


In [15]:
# dropping null values
# Rebind to a fresh frame instead of dropna(inplace=True): mergeDf is a
# column-selected slice, and mutating it in place is what raises pandas'
# SettingWithCopyWarning.
mergeDf = mergeDf.dropna().copy()

# pre-processing data values in the dataset
# Strip every character that is not a digit or a dot (e.g. "10,000+" -> "10000").
# The pattern is a raw string so "\d" is a regex class rather than an invalid
# string escape, and astype(str) keeps this cell re-runnable once the columns
# have already been converted to numbers.
NON_NUMERIC_PATTERN = r"[^\d.]"
mergeDf['Installs'] = (
    mergeDf['Installs'].astype(str)
    .str.replace(NON_NUMERIC_PATTERN, "", regex=True)
    .astype('int64')
)
mergeDf['Price'] = (
    mergeDf['Price'].astype(str)
    .str.replace(NON_NUMERIC_PATTERN, "", regex=True)
    .astype(float)
)
mergeDf

Unnamed: 0,App,Descriptions,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,Filters for photos and videos Beautification t...,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design
1,Photo Editor & Candy Camera & Grid & ScrapBook,Filters for photos and videos Beautification t...,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design
2,Coloring book moana,Welcome this app for Coloring Book Encanto Car...,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play
3,Coloring book moana,Welcome this app for Coloring Book Encanto Car...,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play
4,"U Launcher Lite – FREE Live Cool Themes, Hide ...","Android themes & 4D Live wallpaper, Hide apps",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design
...,...,...,...,...,...,...,...,...,...,...,...
9783,Chemin (fr),Christian songs for praise and liturgical use ...,BOOKS_AND_REFERENCE,4.8,44,0.6,1000,Free,0.0,Everyone,Books & Reference
9784,FR Calculator,The most intuitive feet / inch calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education
9786,Sya9a Maroc - FR,Moroccan driving licence tests en français and...,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education
9787,Fr. Mike Schmitz Audio Teachings,Download and Listen Fr. Mike Schmitz Audio Mes...,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education


### Hyperparameter Tuning

In [16]:
# Search space for the classifier step ('clf') of the pipeline below.
# 'log' is intentionally not listed: it is the deprecated alias of 'log_loss'
# (deprecated in scikit-learn 1.1, removed in 1.3), so keeping both either
# fits the same candidates twice or produces failed fits on newer releases.
param_grid = [{ 'clf__loss': ['hinge', 'log_loss'],
                'clf__penalty': ['l2', 'l1'],
                'clf__max_iter': [5,10,15],
                'clf__alpha': [0.0001,0.1,1.0,5.0,10.0]
                                         }]

# Bag-of-words counts -> TF-IDF weighting -> linear classifier trained with SGD.
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(random_state=42)),
                ])

print("Hyper parameter tuning : Start")
# 2-fold cross-validated grid search scored on accuracy; refit=True retrains the
# best candidate on all the data. Inputs are the descriptions, labels the app names.
grid_classifier = GridSearchCV(sgd, param_grid, cv=2, scoring='accuracy', refit=True)
grid_classifier.fit(mergeDf['Descriptions'], mergeDf['App'])

print("Hyper parameter tuning : End")


Hyper parameter tuning : Start
Hyper parameter tuning : End


In [17]:
# Report the winning value of every tuned hyperparameter, one per line.
print("Efficient Hyper parameters : ")

for name, value in grid_classifier.best_params_.items():
    print(f"\t {name} : {value}")



Efficient Hyper parameters : 
	 clf__alpha : 0.0001
	 clf__loss : log_loss
	 clf__max_iter : 10
	 clf__penalty : l2


### creating training dataset 


In [18]:

# Inputs: every description followed by every app name, so the model can be
# queried with either kind of text.
x = list(mergeDf['Descriptions']) + list(mergeDf['App'])

# Targets: one "||"-delimited string per row, starting with the app name and
# followed by the remaining attributes in the order listed here.
label_series = mergeDf['App']
for column in ['Size', 'Rating', 'Installs', 'Type', 'Price',
               'Genres', 'Content Rating', 'Descriptions']:
    label_series = label_series + "||" + mergeDf[column].astype(str)

# The label list is repeated once so it lines up with both halves of x.
y = list(label_series) * 2
print(len(y))

14268


### data transformation and model training pipeline


In [19]:
# Final model: the same three-stage text pipeline, with the classifier fixed to
# the hyperparameters reported by the grid search.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log_loss', penalty='l2', alpha=0.0001,
                          max_iter=10, random_state=42)),
]
sgd = Pipeline(pipeline_steps)
sgd.fit(x, y)

# Smoke test: predict for a short query and split the label back into its fields.
y_pred_svm = sgd.predict(["scp"])
y_pred_svm[0].split("||")

['Haircut Tutorials/Haircut Videos',
 '7.1',
 '4.6',
 '10000',
 'Free',
 '0.0',
 'Beauty',
 'Everyone',
 'Step by step hairstyles tutorials, hairstyles for girls, hairstyles videos']

### Saving trained model

In [20]:
# Persist the fitted pipeline. The with-block closes the file handle, so the
# pickle is fully flushed to disk before any later cell reads it back.
with open("sgdModelMultiOut.p", "wb") as model_file:
    pickle.dump(sgd, model_file)

### Testing Saved Model

In [20]:
# loading saved model
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# this notebook (or another trusted source) produced.
with open("sgdModelMultiOut.p", 'rb') as model_file:
    sgd = pickle.load(model_file)

# dummy input
inputDescription = "Canva "
y_pred_svm = sgd.predict([inputDescription])

# classified output: split the predicted label back into its "||"-separated fields
y_pred_svm[0].split("||")

['Canva: Poster, banner, card maker & graphic design',
 '24.0',
 '4.7',
 '10000000',
 'Free',
 '0',
 'Art & Design',
 'Everyone',
 'Your background remover, photo editor & video editor']

### Determining top similar apps

In [23]:
# Query text that every training utterance is compared against below;
# it reuses the input from the saved-model test.
actual_intent = inputDescription

# Tokenizer: a token is one or more consecutive word characters.
WORD = re.compile(r"\w+")

def text_to_vector(text):
    """Return a Counter mapping each word token in ``text`` to how often it occurs."""
    return Counter(match.group() for match in WORD.finditer(text))

def get_cosine(vec1, vec2):
    """Cosine similarity between two term -> count mappings.

    Returns the dot product over the shared terms divided by the product of
    the two Euclidean norms, or 0.0 when that product is zero (for example
    when either mapping is empty).
    """
    shared_terms = set(vec1) & set(vec2)
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squares_1 = sum(vec1[term] ** 2 for term in vec1)
    squares_2 = sum(vec2[term] ** 2 for term in vec2)
    norm_product = math.sqrt(squares_1) * math.sqrt(squares_2)

    # Guard clause: a zero norm would otherwise mean dividing by zero.
    if not norm_product:
        return 0.0
    return float(dot_product) / norm_product

# Number of distinct apps to report.
TOP_N_SIMILAR = 5

# The query never changes inside the loop, so tokenize it once.
query_vector = text_to_vector(actual_intent)

# Score every training utterance against the query, in the same order as x.
similarity_checked_utterance = [
    [get_cosine(query_vector, text_to_vector(utterance)), utterance]
    for utterance in x
]
similarity_score_list_temp = [score for score, _ in similarity_checked_utterance]
similarity_score_list = sorted(similarity_score_list_temp, reverse=True)

# Positions of x from most to least similar. sorted() is stable (also with
# reverse=True), so tied utterances keep their original order and the first
# position seen for a repeated text is its first occurrence in x.
ranked_positions = sorted(
    range(len(x)),
    key=lambda position: similarity_score_list_temp[position],
    reverse=True,
)

# Walk the ranking and collect labels until TOP_N_SIMILAR *distinct apps* are
# found. Stopping on the number of apps rather than the number of matched
# utterances matters because an app's name and its description are separate
# entries in x that share one label in y.
similarity_checked_output_temp = []   # unique utterances visited, best first
similarity_checked_output = []        # distinct labels, best first
seen_utterances = set()
for position in ranked_positions:
    utterance = x[position]
    if utterance in seen_utterances:
        continue
    seen_utterances.add(utterance)
    similarity_checked_output_temp.append(utterance)

    if y[position] not in similarity_checked_output:
        similarity_checked_output.append(y[position])
        if len(similarity_checked_output) == TOP_N_SIMILAR:
            break

print("Similar apps to: "+inputDescription)
print("*"*5)
print("\n ***** \n".join(similarity_checked_output[:TOP_N_SIMILAR]))

Similar apps to: Canva 
*****
Canva: Poster, banner, card maker & graphic design||24.0||4.7||10000000||Free||0.0||Art & Design||Everyone||Your background remover, photo editor & video editor
 ***** 
Photo Editor & Candy Camera & Grid & ScrapBook||19.0||4.1||10000||Free||0.0||Art & Design||Everyone||Filters for photos and videos Beautification tools for flawless selfies
 ***** 
Coloring book moana||14.0||3.9||500000||Free||0.0||Art & Design;Pretend Play||Everyone||Welcome this app for Coloring Book Encanto Cartoon
 ***** 
U Launcher Lite – FREE Live Cool Themes, Hide Apps||8.7||4.7||5000000||Free||0.0||Art & Design||Everyone||Android themes & 4D Live wallpaper, Hide apps
 ***** 
Sketch - Draw & Paint||25.0||4.5||50000000||Free||0.0||Art & Design||Teen||Sketch and paint on your device with the feel and freedom of drawing on paper


### References

-  https://stackoverflow.com/questions/37161563/how-to-graph-grid-scores-from-gridsearchcv
