# Welcome to Dataskillet!

In [1]:
import os
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
import modin.pandas as pd



In [2]:
from dataskillet import DataSource

In [3]:
%load_ext autoreload
%autoreload 2

Lets download some csvs

In [4]:
os.mkdir('testdrive_csvs')

FileExistsError: [Errno 17] File exists: 'testdrive_csvs'

In [5]:
!curl https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/master/google-play-store-apps/googleplaystore.csv -o testdrive_csvs/googleplaystore.csv
!curl https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/master/google-play-store-apps/googleplaystore_user_reviews.csv -o testdrive_csvs/googleplaystore_user_reviews.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1328k  100 1328k    0     0  1860k      0 --:--:-- --:--:-- --:--:-- 1857k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7489k  100 7489k    0     0  5977k      0  0:00:01  0:00:01 --:--:-- 5977k


# The dataset contains google play apps and their user reviews about them

In [6]:
googleplaystore = pd.read_csv('testdrive_csvs/googleplaystore.csv')
googleplaystore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [7]:
googleplaystore_user_reviews = pd.read_csv('testdrive_csvs/googleplaystore_user_reviews.csv')
googleplaystore_user_reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


# Creating the DataSource

DataSource is the analogus of database. It stores information about tables and accepts queries to them.

On creation, we need to specify a directory to store table metadata for the DataSource.

In [8]:
metadata_dir = 'testdrive_metadata'
os.mkdir(metadata_dir)

FileExistsError: [Errno 17] File exists: 'testdrive_metadata'

In [28]:
ds = DataSource(metadata_dir=metadata_dir)

The metadata dir now stores information about tables. However we currently have no tables. Lets create some

In [29]:
ds.query('SHOW TABLES')



Unnamed: 0,name,fpath
0,googleplaystore,testdrive_csvs/googleplaystore.csv
1,googleplaystore_user_reviews,testdrive_csvs/googleplaystore_user_reviews.csv


# Creating tables

A `Table` is an abstraction over a dataframe. It loads the dataframe on-demand. On creation it applies simple preprocessings. The preprocessings are created once and stored in metadata, so they are always performed the same way when a dataframe is loaded.

In [30]:
ds.query(f'CREATE TABLE ("testdrive_csvs/googleplaystore.csv")')

Exception: Table googleplaystore already exists in data source, use DROP TABLE to remove it if you want to recreate it.

In [None]:
ds.query(f'CREATE TABLE ("testdrive_csvs/googleplaystore_user_reviews.csv")')

In [None]:
ds.query('SHOW TABLES')

Now that we have some tables, the information about them is stored in metadata. 

If we recreate the datasource using the same `metadata_dir`, we don't need to add the tables again.

In [31]:
ds = DataSource(metadata_dir=metadata_dir)

In [32]:
ds.query('SHOW TABLES')



Unnamed: 0,name,fpath
0,googleplaystore,testdrive_csvs/googleplaystore.csv
1,googleplaystore_user_reviews,testdrive_csvs/googleplaystore_user_reviews.csv


If we need it, we can clear the metadata and recreate the DataSource using `DataSource.create_new(metadata_dir)`

# Querying

In [33]:
ds.query('SELECT * FROM googleplaystore LIMIT 5')



Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [34]:
ds.query("SELECT app FROM googleplaystore LIMIT 5")



Unnamed: 0,app
0,Photo Editor & Candy Camera & Grid & ScrapBook
1,Coloring book moana
2,"U Launcher Lite – FREE Live Cool Themes, Hide ..."
3,Sketch - Draw & Paint
4,Pixel Draw - Number Art Coloring Book


In [35]:
ds.query("SELECT CAST(price as int) as price_int FROM googleplaystore WHERE price = '0' LIMIT 5")



Unnamed: 0,price_int
0,0
1,0
2,0
3,0
4,0


In [36]:
ds.query("SELECT app, category FROM googleplaystore WHERE price = '0'")



Unnamed: 0,app,category
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN
1,Coloring book moana,ART_AND_DESIGN
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN
3,Sketch - Draw & Paint,ART_AND_DESIGN
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN
...,...,...
10836,Sya9a Maroc - FR,FAMILY
10837,Fr. Mike Schmitz Audio Teachings,FAMILY
10838,Parkinson Exercices FR,MEDICAL
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE


In [37]:
ds.query("SELECT * FROM googleplaystore WHERE category = 'FAMILY' AND price = '0' AND CAST(reviews as int) > 5 ORDER BY reviews DESC LIMIT 100")



ValueError: invalid literal for int() with base 10: '3.0M'

In [38]:
ds.query("SELECT app, sentiment, sentiment_polarity FROM googleplaystore INNER JOIN googleplaystore_user_reviews ON googleplaystore.app = googleplaystore_user_reviews.app LIMIT 10")



Unnamed: 0,app,sentiment,sentiment_polarity
0,Coloring book moana,Negative,-0.25
1,Coloring book moana,Negative,-0.725
2,Coloring book moana,Neutral,0.0
3,Coloring book moana,,
4,Coloring book moana,Positive,0.5
5,Coloring book moana,Negative,-0.8
6,Coloring book moana,,
7,Coloring book moana,Neutral,0.0
8,Coloring book moana,Positive,0.5
9,Coloring book moana,Positive,0.5


In [116]:
sql = """
SELECT app, avg(sentiment_polarity) as avg_sentiment_polarity
FROM (
    SELECT app, sentiment, sentiment_polarity 
    FROM googleplaystore INNER JOIN googleplaystore_user_reviews ON googleplaystore.app = googleplaystore_user_reviews.app 
) sub
GROUP BY app
HAVING CAST(avg_sentiment_polarity as float) > 0.4
LIMIT 10
"""
ds.query(sql)



Unnamed: 0,app,avg_sentiment_polarity
0,10 Best Foods for You,0.470733
4,2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif,0.449566
8,2RedBeans,0.412199
11,365Scores - Live Scores,0.438312
14,3D Live Neon Weed Launcher,0.568182
19,7 Minute Workout,0.410278
22,850 Sports News Digest,0.54286
43,APE Weather ( Live Forecast),0.432323
44,"APUS Launcher - Theme, Wallpaper, Hide Apps",0.447473
52,ASUS Sound Recorder,0.516771


In [118]:
sql = """
SELECT count(app) as count_apps
FROM (
    SELECT app
    FROM googleplaystore 
) sub
"""
ds.query(sql)



10841

In [130]:
sql = """
SELECT category, count(app) as count_apps
FROM (
    SELECT category, app FROM googleplaystore 
) sub
GROUP BY category
"""
ds.query(sql)



Unnamed: 0,category,count_apps
0,1.9,1
1,ART_AND_DESIGN,65
2,AUTO_AND_VEHICLES,85
3,BEAUTY,53
4,BOOKS_AND_REFERENCE,231
5,BUSINESS,460
6,COMICS,60
7,COMMUNICATION,387
8,DATING,234
9,EDUCATION,156


In [None]:
sql = """
SELECT count(category) as uniq_categories
FROM (
    SELECT DISTINCT category FROM googleplaystore 
) sub
"""
ds.query(sql)

