In [1]:
import pandas as pd
import pymysql
import matplotlib.style as style
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import neon_cred as neon
import pymysql
import seaborn as sns
import numpy as np
style.use('ggplot')
from IPython.display import display, HTML
import matplotlib.cm as cm
import os
from joblib import Parallel, delayed
import re
from tqdm import tqdm
%matplotlib inline
import time

# Gensim
import gensim
import nltk; nltk.download('stopwords'); nltk.download('words')
from nltk.corpus import stopwords
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import all_stops as stops

# spacy for lemmatization
import spacy



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/PocketmathUser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/PocketmathUser/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Get the scraped data

In [2]:
# Scraped bundle descriptions; later cells read the "bundle_id" and "description" columns.
bundle_desc = pd.read_csv("bundle_desc.csv")
    

## Preprocessing
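1. Regex-based trimming (e-mail addresses, extra whitespace, apostrophes)
2. Tokenization
3. Lemmatization (keeping nouns, adjectives, verbs and adverbs)
4. Bigram detection (a trigram helper is also defined below)
5. Stop-word removal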

In [4]:
# Start from a *copy* of the shared stop-word list. Extending
# stops.extended_stop_words directly mutates the imported module's list, so
# every re-run of this cell would append the same domain words again.
stop_words = list(stops.extended_stop_words)
stop_words.extend(["free", "android", "privacy_policy", "add", "device", "feature",
                   "phone", "good", "application", "mobile", "app", "set", "features",
                   "win", "choose", "yan", "user", "dan", "version", "san", "day",
                   "subscription", "subscribe", "para","yang"])

# English vocabulary (as a set for fast lookups); used below to drop tokens and
# lemmas that are not dictionary words.
words = set(nltk.corpus.words.words())


# spaCy English model with the parser and NER components disabled (for efficiency).
# Install with: python3 -m spacy download en
# POS tag reference: https://spacy.io/api/annotation
# NOTE(review): this is an absolute, machine-specific path; confirm whether the
# model can be loaded by package name in the target environment instead.
nlp = spacy.load('/anaconda3/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-2.0.0', disable=['parser', 'ner'])

# Lemmatization keeps only nouns, adjectives, verbs and adverbs.
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

In [4]:
def some_regex(x):
    """Strip e-mail-like tokens, collapse whitespace and drop apostrophes.

    Args:
        x: The text to clean (a ``str``).

    Returns:
        The cleaned string.
    """
    # Patterns are raw strings: '\S' and '\s' inside a normal string literal are
    # invalid escape sequences and trigger a warning on modern Python.
    # 1) Remove any run of non-space characters containing '@', plus one
    #    trailing whitespace character if present.
    x = re.sub(r'\S*@\S*\s?', '', x)
    # 2) Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    x = re.sub(r'\s+', ' ', x)
    # 3) Drop apostrophes so that e.g. "it's" becomes "its".
    x = re.sub(r"'", "", x)
    return x

In [5]:
# Run the regex clean-up over every description, overwriting the column.
bundle_desc["description"] = bundle_desc["description"].map(some_regex)

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_non_english_words(texts):
    """Keep only the tokens that appear in the global ``words`` vocabulary.

    Args:
        texts: Iterable of documents, each an iterable of token strings.

    Returns:
        A list with one filtered token list per input document.
    """
    english_only = []
    for tokens in texts:
        english_only.append([token for token in tokens if token in words])
    return english_only


def remove_stop_words(texts, stop_list=None):
    """Drop stop words and very short tokens from every document.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        stop_list: Optional iterable of stop words. Defaults to the global
            ``stop_words`` built earlier in the notebook.

    Returns:
        A list with one token list per input document, containing only tokens
        that are longer than two characters and not in the stop list.
    """
    if stop_list is None:
        stop_list = stop_words
    # The stop words are held in a list, so testing membership against it
    # directly is a linear scan per token. Hash them once per call instead.
    blocked = frozenset(stop_list)
    return [[word for word in doc
             if len(word) > 2 and word not in blocked] for doc in texts]


def make_bigrams(texts):
    """Run every document through the global ``bigram_mod`` phrase model.

    Args:
        texts: Iterable of documents (token lists).

    Returns:
        A list with ``bigram_mod[doc]`` for each input document, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged


def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``.

    NOTE(review): ``trigram_mod`` is not created in the cells visible around
    this definition; confirm it is built before this helper is called.

    Args:
        texts: Iterable of documents (token lists).

    Returns:
        A list with ``trigram_mod[bigram_mod[doc]]`` for each input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(desc):
    """Lemmatize one tokenised document with the global spaCy pipeline ``nlp``.

    The tokens are joined with single spaces and parsed; a lemma is kept only
    when the token's POS tag is in ``allowed_postags`` and the lemma itself is
    in the ``words`` vocabulary.

    Args:
        desc: Iterable of token strings for a single document.

    Returns:
        List of the retained lemmas, in document order.
    """
    parsed = nlp(" ".join(desc))
    kept = []
    for token in parsed:
        if token.pos_ not in allowed_postags:
            continue
        lemma = token.lemma_
        if lemma in words:
            kept.append(lemma)
    return kept

In [7]:
# Tokenise every description into a list of lowercase tokens.
# deacc=True strips accent marks from the tokens; punctuation is already
# discarded by simple_preprocess's tokenizer regardless of this flag.
descriptions_list = list(bundle_desc["description"])
data_words = [
    gensim.utils.simple_preprocess(str(desc), deacc=True)
    for desc in descriptions_list
]

In [8]:
# Build the bigram model from the raw tokenised descriptions.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,  # higher threshold -> fewer phrases
)

# Wrap the trained model in a Phraser, the faster way to apply it to a document.
bigram_mod = gensim.models.phrases.Phraser(bigram)


In [9]:
# Lemmatize the whole corpus (one lemmatization() call per document) and
# report the wall-clock time taken.
start = time.time()
data_words_lemmatized = list(map(lemmatization, data_words))
end = time.time()
print(end - start, "seconds")

1085.7185957431793 seconds


In [10]:
# Merge detected bigrams in the lemmatized documents.
# NOTE(review): bigram_mod was trained on data_words (raw tokens) but is applied
# here to the lemmatized tokens; confirm this is intended.
data_words_bigrams = make_bigrams(texts=data_words_lemmatized)

# Drop stop words and tokens of one or two characters; this runs after bigram
# merging, so merged tokens are checked against the stop list as well.
data_words_nostops = remove_stop_words(texts=data_words_bigrams)


In [11]:
from collections import Counter

# Corpus-wide token frequencies after stop-word removal.
word_list = [word for doc in data_words_nostops for word in doc]
x = Counter(word_list)

# Show only the head of the distribution: most_common() without an argument
# returns every distinct token, which floods the cell output.
x.most_common(50)

[('game', 249221),
 ('play', 89783),
 ('photo', 58244),
 ('time', 56313),
 ('car', 54438),
 ('find', 49092),
 ('fun', 42995),
 ('level', 39975),
 ('video', 38286),
 ('object', 38264),
 ('real', 38038),
 ('friend', 35706),
 ('easy', 35517),
 ('news', 33966),
 ('city', 33442),
 ('drive', 32560),
 ('enjoy', 31879),
 ('love', 30687),
 ('share', 29408),
 ('simulator', 27254),
 ('picture', 25895),
 ('live', 25421),
 ('search', 25364),
 ('screen', 24828),
 ('wallpaper', 24542),
 ('image', 24409),
 ('puzzle', 24267),
 ('color', 24145),
 ('favorite', 23576),
 ('theme', 23453),
 ('mode', 23236),
 ('support', 22617),
 ('control', 22546),
 ('design', 22481),
 ('music', 22351),
 ('experience', 22342),
 ('create', 21919),
 ('beautiful', 21810),
 ('word', 21792),
 ('player', 21757),
 ('radio', 21712),
 ('include', 21531),
 ('challenge', 21528),
 ('save', 21251),
 ('amazing', 20695),
 ('simple', 20601),
 ('truck', 20429),
 ('sound', 19920),
 ('super', 19708),
 ('skill', 19513),
 ('work', 19306),
 ('fa

In [12]:
# Re-join each cleaned token list into one space-separated string per document.
processed = list(map(" ".join, data_words_nostops))

In [13]:
# Put the processed text next to its bundle id and cleaned description.
# The two added columns are aligned to the new frame by index label.
lemmatized_df = pd.DataFrame({"lemmatized_desc": processed}).assign(
    bundle_id=bundle_desc["bundle_id"],
    description=bundle_desc["description"],
)

In [14]:
# Save the processed frame (without the index column), then preview the first rows.
lemmatized_df.to_csv(
    "lemmatized_df.csv",
    index=False,
)
lemmatized_df.head(n=10)

Unnamed: 0,lemmatized_desc,bundle_id,description
0,block engaging puzzle game simple distinctive ...,1000548274,1010 Block is an engaging puzzle game with a s...
1,hooked fish daily catch experience thrill catc...,1000773093,Get hooked with Rapala® Fishing - Daily Catch!...
2,cannon hero arrive join rocket boy save evil f...,1000958628,THE CANNON HERO HAS ARRIVED. Join Rocket Boy a...
3,time flagship news brand medium group bring la...,1000991178,"Hindustan Times, the flagship news brand of th..."
4,chance delve construction build stuff game fee...,1001013071,This is your chance to delve into the world of...
5,yon twitter apple tahsil apple alan auto_renew,1001115878,TV8 yan ekran uygulamasıyla yayınlarımıza anın...
6,love blackjack play blackjack game store fuss ...,1001387844,Do you love Blackjack? Play the best FREE Blac...
7,,1001402307,本應用包含百家講壇開播至今所有的音頻內容，並且將不斷更新。 ◽《百家講壇》是中央電視臺科教頻...
8,play drift game store unique car fast furious ...,1001425491,Play the #1 drifting game on the App Store wit...
9,epic zombie hibernate year ago ground construc...,1001455591,An Epic Come back of the Zombies hibernating f...
