In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import numpy as np

In [2]:
categories = ['EDUCATION', 'BUSINESS', 
       'DATING', 'SPORTS', 
       'WEATHER', 'FOOD_AND_DRINK', 
       'HEALTH_AND_FITNESS', 'BEAUTY', 
       'MUSIC_AND_AUDIO', 'NEWS_AND_MAGAZINES', 
       'SOCIAL', 'SHOPPING', 
       'PRODUCTIVITY', 'PHOTOGRAPHY', 
       'MEDICAL', 'PARENTING', 
       'EDUCATION', 'BEAUTY',
       'COMMUNICATION', 'TOOLS']

In [None]:
def get_links(data):
    '''Given html data this will extract all the links'''
    link_list = []
    for link in data.findAll('a'):
        get_link = link.get('href')
        link_list.append(get_link)
    return link_list

In [None]:
def webpage_html(url):
    r = requests.get(url)
    html = r.text
    return html

In [None]:
def google_scraper(categories):
    '''Will cycle through every category and then request the site, pass the soup to 
        get_links and then clean the list and append results to a new list in a dict
        format that has the format of {cat_name : [link list]}'''
    dicty = dict()
    for category in categories:
        url = f'https://play.google.com/store/apps/category/{category}/collection/topselling_free'
        html = webpage_html(url)
        soup = BeautifulSoup(html)
        links = get_links(soup)
        del links[0:107]
        clean = list(set(links))
        new_list = [app for app in clean if app.find('details') != -1]
        dicty[category] = new_list
        time.sleep(3)
    return dicty


In [3]:
cat = categories

In [None]:
stuff = google_scraper(cat)

In [None]:
stuff

In [None]:
def parse(listing_html, criteria):
    result = listing_html.find(class_=re.compile(r'.*%s' % criteria))
    if result:
        return result.text

In [None]:
def get_description(category_dict):
    dicty = dict()
    url = 'https://play.google.com'
    for category, link_list in category_dict.items():
        listy = list()
        for link in link_list:
            html = webpage_html(url+link)
            soup = BeautifulSoup(html)
            html1 = soup.find('div', {'class':"LXrl4c"})
            description = parse(html1, 'DWPxHb')
            time.sleep(1)
            listy.append(description)
        dicty[category] = listy
    return dicty              

In [None]:
all_desc = get_description(stuff)

In [4]:
# # Save
# np.save('my_file.npy', all_desc) 

# # # Load
read_dictionary = np.load('my_file.npy').item()

In [5]:
edu  = read_dictionary

In [6]:
read_dictionary.keys()

dict_keys(['EDUCATION', 'BUSINESS', 'DATING', 'SPORTS', 'WEATHER', 'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'BEAUTY', 'MUSIC_AND_AUDIO', 'NEWS_AND_MAGAZINES', 'SOCIAL', 'SHOPPING', 'PRODUCTIVITY', 'PHOTOGRAPHY', 'MEDICAL', 'PARENTING', 'COMMUNICATION', 'TOOLS'])

In [7]:
import nltk
import sklearn

from nltk.collocations import *
from nltk import FreqDist, word_tokenize
import string, re
from nltk.stem.snowball import SnowballStemmer

In [8]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"

In [9]:
# stop words
from nltk.corpus import stopwords
stopwords.words("english")

stop_words = set(stopwords.words('english'))

In [10]:
# stem words
stemmer = SnowballStemmer("english")

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [12]:
review_list = []
for c, d in edu.items():
    for review in d:
        art_tokens_raw = nltk.regexp_tokenize(review, pattern)
        art_tokens = [i.lower() for i in art_tokens_raw]
        art_tokens_stopped = [w for w in art_tokens if not w in stop_words]
        art_stemmed = [stemmer.stem(word) for word in art_tokens_stopped]
        cleaned = ' '.join(art_stemmed)
        review_list.append(cleaned)

In [13]:
response = tfidf.fit_transform(review_list )

df = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names())


In [14]:
x = read_dictionary.keys()
new_list = []
for c in x: 
    s = [c] * 60
    new_list += s

In [15]:
len(new_list)

1080

In [16]:
df.shape

(1080, 13205)

In [17]:
df['labels'] = new_list

In [18]:
df.tail()

Unnamed: 0,aa,aac,aaptiv,aarp,aask,ab,abajo,abandon,abc,abcmous,...,zoomterrain,zoomwithushav,zoosk,zte,zulili,zulu,zulufor,zulupermiss,zumba,labels
1075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS
1079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOOLS


In [None]:
new_desc = """Trello gives you perspective over all your projects, at work and at home.

Bring Trello to Mac in a dedicated workspace. The Trello Desktop App features native notifications, powerful enhancements and more - away from the distractions of your browser.

Stay productive with a beautiful minimal interface that doesn’t get in the way of your work.
Create new cards instantly from anywhere with a Quick Add window.
Get notified whenever there’s new activity in any of your Trello boards.
Work on multiple boards at once with multiple windows.
Set a global shortcut that opens the main window from anywhere.
Navigate between your starred boards with a quick shortcut.
Use Touch Bar to view starred boards, create new cards, and open new windows.

All Trello shortcuts work just like the web, including keyboard shortcuts, drag & drop and more.

---

Some useful Trello shortcuts:
Ctrl+Alt+Space - Open Quick Add window to quickly create a card (customizable)
Cmd+Shift+N - Create another window
Right-click on card - quick edit menu
Cmd+Alt+C - Copy URL of current open card or board
Cmd+Alt+V - Open any Trello card or board by pasting it into the app from your clipboard
Cmd+Alt+T - Open app from anywhere (customizable)
Cmd+1-9 - Quick access to your Starred Boards
Cmd+Shift+B - Open Boards page
Cmd+D - Open Default Board

---

Whether it's managing a team, writing an epic screenplay, or just making a grocery list, Trello is your sidekick for getting things done and staying organized.

"Trello is an awesome project management tool that makes collaboration easy and, dare I say, even fun."
LIFEHACKER

With Trello you can:

• Create boards to organize anything you're working on
• Use them solo or invite coworkers, friends and family to collaborate
• Customize workflows for different projects
• Add checklists of "To-Dos" on cards
• Assign tasks to yourself and coworkers
• Comment on items with your friends
• Upload photos and videos
• Attach files
• Display cards in a calendar view with the Calendar Power-Up
• Trello is free to use forever with options to upgrade to Gold for loads of extra fun and functionality"""

# Test-Train Split

In [19]:
y = df.labels

In [20]:
X = df.iloc[:,:-1]

In [21]:
# Sample code
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

# TPOT for automated model selection

In [23]:
from tpot import TPOTClassifier

In [65]:
config = {
    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },
    'sklearn.svm.LinearSVC' : {
        'C' : [1, 5, 10, 15, 20, 25],
        'dual' : [True, False],
        'loss' : ["squared_hinge", "hinge" ],
        'penalty' : ['l1', 'l2']
    },
    'sklearn.neighbors.KNeighborsClassifier' : {
    },
    'sklearn.ensemble.RandomForestClassifier' : {
        
    }
}

In [67]:
tpot = TPOTClassifier(generations=3, cv = 2 ,population_size=10,\
                      max_eval_time_mins=10, verbosity=3, config_dict=config)

In [68]:
tpot.fit(X_train, y_train)

4 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=40, style=ProgressStyle(descripti…

_pre_test decorator: _random_mutation_operator: num_test=0 Input X must be non-negative
_pre_test decorator: _random_mutation_operator: num_test=0 Input X must be non-negative
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True
_pre_test decorator: _random_mutation_operator: num_test=0 Input X must be non-negative
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=True
_pre_test decorator: _random_mutation_operator: num_test=0 Input X must be non-negative
Generation 1 - Current Pareto front scores:
-1	0.7361025514606188	LinearSVC(input_matrix, LinearSVC__C=20, LinearSVC__dual=True, LinearSVC__loss=squared_hinge, LinearSVC__penalty=l2)

_pre_test decorator:

TPOTClassifier(config_dict={'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.svm.LinearSVC': {'C': [1, 5, 10, 15, 20, 25], 'dual': [True, False], 'loss': ['squared_hinge', 'hinge'], 'penalty': ['l1', 'l2']}, 'sklearn.neighbors.KNeighborsClassifier': {}, 'sklearn.ensemble.RandomForestClassifier': {}},
        crossover_rate=0.1, cv=2, disable_update_check=False,
        early_stop=None, generations=3, max_eval_time_mins=10,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=None, periodic_checkpoint_folder=None,
        population_size=10, random_state=None, scoring=None, subsample=1.0,
        use_dask=False, verbosity=3, warm_start=False)

In [69]:
tpot.export('class4-pipeline.py')

True

In [None]:
# 0.8009259259259259 First pipeline trial
# 0.8055555555555556 Second pipeline trial
# 0.7685185185185185 Third Pipeline trial

In [28]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier


In [29]:
last = LinearSVC(C=10.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)

In [30]:
last.fit(X_train, y_train)

LinearSVC(C=10.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.1,
     verbose=0)

In [36]:
svc1 = LinearSVC(C=25.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.001)

In [37]:
svc2 = LinearSVC(C=1.0, dual=True, loss="hinge", penalty="l2", tol=1e-05)

In [38]:
gb_clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=6, max_features=0.2, min_samples_leaf=3, min_samples_split=15, n_estimators=100, subsample=0.25)

# Voting Classifier

In [35]:
from sklearn.ensemble import VotingClassifier

In [51]:
model = VotingClassifier(estimators=[('svc1', svc1), 
                                     ('svc2', svc2),
                                     ('gb_clf', gb_clf),
                                    ('nb', nb_classifier),
                                    ('knn', knn),
                                    ('rf', rf_classifier)], voting='hard')

In [49]:
import joblib

In [62]:
# model1 with: svc1, svc2, last, gb_clf, nb, knn, rf
# model2 with: svc1, svc2, gb_clf, nb, knn, rf
# last -> SVC model 
#         LinearSVC(C=10.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)

model.fit(X_train, y_train)

# model1 => 0.8101851851851852 <- voting
# model2 => 0.8148148148148148 <-voting
# last => 0.8194444444444444 <-svc

KeyboardInterrupt: 

In [57]:
# To Store Models:
# joblib.dump(model, 'model2')


# To load model
# model1 = joblib.load('model1')

# Random Forest and others

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score

In [41]:
knn = KNeighborsClassifier()

# Naive-Bayes Multinomial Classifier

In [42]:
nb_classifier = MultinomialNB()

In [None]:
nb_classifier.fit(X_train, y_train)
nb_train_preds = nb_classifier.predict(X_train)
nb_test_preds = nb_classifier.predict(X_test)

nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))

# Random Forest Classifier

In [43]:
rf_classifier = RandomForestClassifier(n_estimators=50)

rf_classifier.fit(X_train, y_train)
rf_train_preds = rf_classifier.predict(X_train)
rf_test_preds = rf_classifier.predict(X_test)

rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Random Forest
Training Accuracy: 1.0 		 Testing Accuracy: 0.7315


# SKLEARN experiment