# Can we predict the category of an app from the app's description?

We wanted to see if we could create a Machine Learning model that accurately predicts the category of an app from the app's description. To do this, we created a web scraper to collect data from the Google Play Store. After cleaning our data, we applied NLTK text preprocessing, feature engineering and model fitting to create an optimal ML model.

<br>

## Web Scraping

We created a web scraper to collect 60 app descriptions per category for 18 categories. After collecting our data we saved it to a file in dictionary format. 

In [None]:
import re
import pandas as pd
import numpy as np
# Load the scraped descriptions. The file appears to hold a Python dict saved
# with np.save (hence .item() to unwrap it), which means it is pickled:
# NumPy >= 1.16.3 refuses to load pickled objects unless allow_pickle=True.
# NOTE(review): unpickling can execute arbitrary code -- only load a file that
# this project produced itself.
read_dictionary = np.load('my_file.npy', allow_pickle=True).item()

data  = read_dictionary

# Show the category names, then the descriptions stored under one category.
print(data.keys())
data['EDUCATION']

<br>

## NLTK: Natural Language Tool Kit

In [None]:
import nltk
import sklearn

from nltk.collocations import *
from nltk import FreqDist, word_tokenize
import string, re
from nltk.stem.snowball import SnowballStemmer

# Tokenizing regex: a run of ASCII letters, optionally followed by an
# apostrophe and lowercase letters (e.g. "don't"). Read by text_cleaner.
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"

# stop words
from nltk.corpus import stopwords
# NOTE(review): the result of this call is discarded; the list is built again
# (and kept) two lines below.
stopwords.words("english")

# Set form of the English stop-word list, used for membership tests in text_cleaner.
stop_words = set(stopwords.words('english'))

# stem words
# English Snowball stemmer, used by text_cleaner to reduce tokens to stems.
stemmer = SnowballStemmer("english")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings; it is fitted on the cleaned
# descriptions in the TF-IDF section below.
tfidf = TfidfVectorizer()


def text_cleaner(description):
    '''Normalize one app description into a space-separated string of stems.

    The text is tokenized with the module-level regex ``pattern``, each token
    is lower-cased, tokens found in ``stop_words`` are dropped, and the rest
    are stemmed with ``stemmer`` before being joined with single spaces.
    '''
    lowered = (token.lower() for token in nltk.regexp_tokenize(description, pattern))
    stems = [stemmer.stem(token) for token in lowered if token not in stop_words]
    return ' '.join(stems)



def dict_cleaner(dictionary):
    '''Clean every description held in a category -> descriptions mapping.

    The mapping's values are visited in iteration order and each description
    is passed through ``text_cleaner``; the cleaned strings are returned as
    one flat list (the category keys are not kept).
    '''
    return [
        text_cleaner(raw_text)
        for descriptions in dictionary.values()
        for raw_text in descriptions
    ]


#use our function on our data
# description_list is a flat list of cleaned description strings, ordered by
# the iteration order of data (category by category).
description_list = dict_cleaner(data)
# Inspect the first cleaned description.
description_list[0]

<br>

## TF-IDF

We use the TF-IDF transform to re-weight our data so that it reflects how important each word is to a description relative to the whole collection of descriptions.

In [None]:
# Fit the vectorizer on the cleaned descriptions; the result is a sparse
# matrix with one row per description and one column per term.
response = tfidf.fit_transform(description_list)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Dense copy of the matrix with the vocabulary terms as column names.
df = pd.DataFrame(response.toarray(), columns=feature_names)

print(df.shape)
df.head()

<br>

## Corpus Statistics

How many non-zero elements are there?

In [None]:
# Average number of stored (non-zero) entries per row of the TF-IDF matrix,
# i.e. per description -- despite the variable name, this is a per-row average.
non_zero_cols = response.nnz / float(response.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Reviews: {}".format(non_zero_cols))

# Average share of a row's columns that are zero. The value is a fraction in
# [0, 1], not a percentage, even though the name and message say "percent".
percent_sparse = 1 - (non_zero_cols / float(response.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

<br>

## Test-Train-Split

In [None]:
# Build one category label per stored item so every row can be tagged with its category
def create_labels(dictionary):
    '''Return a flat list that repeats each key once per item in its value.

    Keys are visited in the mapping's iteration order; a key whose value has
    length k contributes k consecutive copies of itself.
    '''
    return [
        category
        for category, items in dictionary.items()
        for _ in range(len(items))
    ]


# One label per description. create_labels and dict_cleaner both walk data in
# its iteration order, which is what keeps labels aligned with the rows of df.
labels = create_labels(data)
# Appended as the last column; the split cell relies on that position.
df['labels'] = labels

print(df.shape)
df.head()

<br>

In [None]:
#we set our labels to y and set our features to x
# Target: the category labels column.
y = df.labels

# Features: every column except the last one (the 'labels' column appended earlier).
X = df.iloc[:,:-1]

#set our train and test data
from sklearn.model_selection import train_test_split  
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


<br>

## K-Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# k-nearest-neighbours classifier with scikit-learn's default settings.
knn = KNeighborsClassifier()

# Train on the training split, then predict on both splits so that training
# and testing accuracy can be compared.
knn.fit(X_train, y_train)
knn_train_preds = knn.predict(X_train)
knn_test_preds = knn.predict(X_test)

knn_train_score = accuracy_score(y_train, knn_train_preds)
knn_test_score = accuracy_score(y_test, knn_test_preds)

print("KNN")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(knn_train_score, knn_test_score))
# NOTE(review): micro-averaged F1 should coincide with accuracy for
# single-label multiclass predictions -- consider 'macro' or 'weighted' if a
# distinct metric is wanted.
print("F1 Score: {}".format(f1_score(y_test, knn_test_preds, average='micro')))

<br>

## Naive-Bayes Multinomial Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, trained on the TF-IDF features.
nb_classifier = MultinomialNB()

# Train on the training split, then predict on both splits.
nb_classifier.fit(X_train, y_train)
nb_train_preds = nb_classifier.predict(X_train)
nb_test_preds = nb_classifier.predict(X_test)

nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)

# Report accuracy on both splits and the micro-averaged F1 on the test split.
print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("F1 Score: {}".format(f1_score(y_test, nb_test_preds, average='micro')))

<br>

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees. No random_state is set, so scores can vary
# between runs.
rf_classifier = RandomForestClassifier(n_estimators=50)

# Train on the training split, then predict on both splits.
rf_classifier.fit(X_train, y_train)
rf_train_preds = rf_classifier.predict(X_train)
rf_test_preds = rf_classifier.predict(X_test)

rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

# Report accuracy on both splits and the micro-averaged F1 on the test split.
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))
print("F1 Score: {}".format(f1_score(y_test, rf_test_preds, average='micro')))

<br>

## SVM

In [None]:
from sklearn import svm

# Support vector classifier; probability=True additionally enables
# predict_proba on the fitted model.
svm_clf = svm.SVC(probability=True)

svm_clf.fit(X_train, y_train)
# Predict with svm_clf itself (not rf_classifier) so that the scores printed
# below describe the SVM rather than the random forest.
svm_train_preds = svm_clf.predict(X_train)
svm_test_preds = svm_clf.predict(X_test)

svm_train_score = accuracy_score(y_train, svm_train_preds)
svm_test_score = accuracy_score(y_test, svm_test_preds)

# Report accuracy on both splits and the micro-averaged F1 on the test split.
print('SVM')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(svm_train_score, svm_test_score))
print("F1 Score: {}".format(f1_score(y_test, svm_test_preds, average='micro')))

<br>

## TPOT for automated model selection

In [None]:
from tpot import TPOTClassifier


# Automated pipeline search: 5 generations with a population of 20, scored
# with 3-fold cross-validation, max_eval_time_mins=10 and verbose logging.
tpot = TPOTClassifier(generations=5, cv = 3 ,population_size=20,\
                      max_eval_time_mins=10, verbosity=3)


#we ran the classifier, which will tell us the best model to use.
tpot.fit(X_train, y_train)


# The bare string below is a note recording the outcome of the search.
# NOTE(review): the pipeline quoted here (C=1, dual=True) differs from the
# settings used in the LinearSVC section (C=10.0, dual=False, tol=0.1) --
# confirm which one is intended.
'''After changing the generation and population parameters to get better results, 
   we came up with the final best result. 
   The Automated model selection gave us the best model to use which yielded: 
   
   
   exported_pipeline = LinearSVC(C=1, dual=True, loss="squared_hinge", penalty="l2")'''



<br>

## LinearSVC

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with L2 penalty and squared hinge loss, solved in the primal
# (dual=False), with C=10.0 and a loose tolerance of 0.1.
lsvc_classifier = LinearSVC(C=10.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)

# Train on the training split, then predict on both splits.
lsvc_classifier.fit(X_train, y_train)
lsvc_train_preds = lsvc_classifier.predict(X_train)
lsvc_test_preds = lsvc_classifier.predict(X_test)

lsvc_train_score = accuracy_score(y_train, lsvc_train_preds)
lsvc_test_score = accuracy_score(y_test, lsvc_test_preds)

# Report accuracy on both splits and the micro-averaged F1 on the test split.
print('LinearSVC')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(lsvc_train_score, lsvc_test_score))
print("F1 Score: {}".format(f1_score(y_test, lsvc_test_preds, average='micro')))

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Gradient boosting with 100 trees of depth up to 6 and a learning rate of
# 0.01; each tree is fitted on a 25% subsample of the rows (subsample=0.25)
# and considers 20% of the features per split (max_features=0.2).
# No random_state is set, so scores can vary between runs.
gb_clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=6, max_features=0.2, min_samples_leaf=3, min_samples_split=15, n_estimators=100, subsample=0.25)

# Train on the training split, then predict on both splits.
gb_clf.fit(X_train, y_train)
gb_train_preds = gb_clf.predict(X_train)
gb_test_preds = gb_clf.predict(X_test)

gb_train_score = accuracy_score(y_train, gb_train_preds)
gb_test_score = accuracy_score(y_test, gb_test_preds)

# Report accuracy on both splits and the micro-averaged F1 on the test split.
print('Gradient Boosting')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(gb_train_score, gb_test_score))
print("F1 Score: {}".format(f1_score(y_test, gb_test_preds, average='micro')))

<br>

## AdaBoosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost ensemble with scikit-learn's default settings.
adaboost_clf = AdaBoostClassifier()

# Train on the training split, then predict on both splits.
adaboost_clf.fit(X_train, y_train)
adaboost_train_preds = adaboost_clf.predict(X_train)
adaboost_test_preds = adaboost_clf.predict(X_test)

adaboost_train_score = accuracy_score(y_train, adaboost_train_preds)
adaboost_test_score = accuracy_score(y_test, adaboost_test_preds)

# Report accuracy on both splits and the micro-averaged F1 on the test split.
print('AdaBoosting')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(adaboost_train_score, adaboost_test_score))
print("F1 Score: {}".format(f1_score(y_test, adaboost_test_preds, average='micro')))

## XGBoosting

In [None]:
import os
# NOTE(review): KMP_DUPLICATE_LIB_OK is presumably set to avoid the
# "duplicate OpenMP runtime" abort that some installs hit when xgboost is
# loaded; it has to be set before the import below. Confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier


# Gradient-boosted trees with xgboost's default hyper-parameters.
# NOTE(review): recent xgboost releases expect integer-encoded class labels;
# if fit() rejects the string categories, encode y with LabelEncoder first.
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
xgb_train_preds = xgb_clf.predict(X_train)
xgb_test_preds = xgb_clf.predict(X_test)

xgb_train_score = accuracy_score(y_train, xgb_train_preds)
xgb_test_score = accuracy_score(y_test, xgb_test_preds)

# The header names the model actually evaluated in this cell (it previously
# printed the AdaBoost label).
print('XGBoosting')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(xgb_train_score, xgb_test_score))
print("F1 Score: {}".format(f1_score(y_test, xgb_test_preds, average='micro')))

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier


# Two differently-configured linear SVMs that serve as voters in the ensemble.
# LinearSVC is imported in the LinearSVC section above.
svc1 = LinearSVC(C=25.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.001)
svc2 = LinearSVC(C=1.0, dual=True, loss="hinge", penalty="l2", tol=1e-05)



# Hard-voting ensemble: each listed estimator casts one vote per sample and
# the majority class wins.
# NOTE(review): VotingClassifier is documented to fit clones of the listed
# estimators, so the earlier fits of gb_clf / nb_classifier / knn /
# rf_classifier should not be reused here -- confirm for the installed version.
vc_clf = VotingClassifier(estimators=[('svc1', svc1), 
                                     ('svc2', svc2),
                                     ('gb_clf', gb_clf),
                                    ('nb', nb_classifier),
                                    ('knn', knn),
                                    ('rf', rf_classifier)], voting='hard')

vc_clf.fit(X_train, y_train)

In [None]:
# Predict with the fitted voting ensemble on both splits.
vc_train_preds = vc_clf.predict(X_train)
vc_test_preds = vc_clf.predict(X_test)

vc_train_score = accuracy_score(y_train, vc_train_preds)
vc_test_score = accuracy_score(y_test, vc_test_preds)

# Report accuracy on both splits and the micro-averaged F1 on the test split.
print('Voting Classifier')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(vc_train_score, vc_test_score))
print("F1 Score: {}".format(f1_score(y_test, vc_test_preds, average='micro')))

## PCA Experiment

In [None]:
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 95% of the variance.
pca = PCA(.95)

# Fit the projection on the training split only, then apply that same
# projection to the test split.
pca_train = pca.fit_transform(X_train)
pca_test = pca.transform(X_test)

# n_components_ only exists once the model has been fitted; reading it before
# fit raises AttributeError, so report it here.
print(pca.n_components_)

