In [None]:
%pip install pandas
%pip install numpy
%pip install plotly-express

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Loading Dataset

In [3]:
def load_dataset():
    df = pd.read_csv("/home/aziz/Downloads/googleplaystore.csv")
    df.drop(columns=["Last Updated", "Current Ver", "Android Ver"], axis=1, inplace=True)
    return df


apps = load_dataset()

In [4]:
apps[apps["Size"] == "1,000+"]
apps.drop(10472, axis=0, inplace=True)

### The summarize of dataset

In [5]:
def print_summarize_dataset(dataset):
    print("This is full info about dataset")
    print(dataset.info())
    print("This is statistics")
    print(dataset.describe())


print_summarize_dataset(apps)

This is full info about dataset
<class 'pandas.core.frame.DataFrame'>
Index: 10840 entries, 0 to 10840
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10840 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9366 non-null   float64
 3   Reviews         10840 non-null  object 
 4   Size            10840 non-null  object 
 5   Installs        10840 non-null  object 
 6   Type            10839 non-null  object 
 7   Price           10840 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10840 non-null  object 
dtypes: float64(1), object(9)
memory usage: 931.6+ KB
None
This is statistics
            Rating
count  9366.000000
mean      4.191757
std       0.515219
min       1.000000
25%       4.000000
50%       4.300000
75%       4.500000
max       5.000000


In [6]:
def convert_mb(row):
    if type(row) == str:
        if row[-1].lower() == "k":
            row = row.replace('k', '')
            return float(row) / 1024
        elif row[-1].lower() == "m":
            row = row.replace('M', '')
            return float(row)
        elif row.isalpha():
            return None
    return row

In [7]:
def clean_dataset(df):
    filter = {'"': '', ',': '', ';': ' ', 'Everyone 10+': 'Everyone', 'Adults only 18+': 'Adults only',
              'Mature 17+': 'Mature', 'Varies with device': 'NaN', 'Unrated': 'NaN', 'and up': ''}
    df.replace(filter, inplace=True, regex=True)
    df['Rating'] = df['Rating'].apply(lambda rating: 0 if pd.isna(rating) else rating)
    df['Reviews'] = df['Reviews'].astype(float)
    df['Size'] = df['Size'].apply(convert_mb)
    mean_size = df[df['Size'] != "Varies with device"]['Size']
    mean_size = mean_size.astype(float).mean()
    df['Size'] = df['Size'].apply(lambda x: mean_size if x == "Varies with device" else x)
    df.dropna(subset=['Size'], inplace=True)
    df['Installs'] = df['Installs'].str.replace(r'[+]', '', regex=True)
    df['Installs'] = df['Installs'].astype(float)
    try:
        df['Price'] = df['Price'].str.replace('$', '').astype(float)
    except ValueError:
        df['Price'] = df['Price'].replace('$', '').astype(float)
    df['Content Rating'] = df['Content Rating'].str.replace(r'[+]', '', regex=True)
    df['Content Rating'] = df['Content Rating'].str.replace('NaN', 'Unrated')
    df['Popularity'] = df.Rating * 10 ** 7 * df.Reviews * 10 ** 2 * df.Installs
    return df


### Cleaned dataset

In [8]:
df = clean_dataset(apps)
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Popularity
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000.0,Free,0.0,Everyone,Art & Design,6.519000e+15
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000.0,Free,0.0,Everyone,Art & Design Pretend Play,1.885650e+18
2,U Launcher Lite – FREE Live Cool Themes Hide Apps,ART_AND_DESIGN,4.7,87510.0,8.7,5000000.0,Free,0.0,Everyone,Art & Design,2.056485e+21
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25.0,50000000.0,Free,0.0,Teen,Art & Design,4.851990e+22
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000.0,Free,0.0,Everyone,Art & Design Creativity,4.158100e+17
...,...,...,...,...,...,...,...,...,...,...,...
10835,FR Forms,BUSINESS,0.0,0.0,9.6,10.0,Free,0.0,Everyone,Business,0.000000e+00
10836,Sya9a Maroc - FR,FAMILY,4.5,38.0,53.0,5000.0,Free,0.0,Everyone,Education,8.550000e+14
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4.0,3.6,100.0,Free,0.0,Everyone,Education,2.000000e+12
10838,Parkinson Exercices FR,MEDICAL,0.0,3.0,9.5,1000.0,Free,0.0,Everyone,Medical,0.000000e+00


### Correlation of dataset

In [9]:
def compute_correlations_matrix(dataset):
    corr_df = dataset.corr(numeric_only=True)
    fig = px.imshow(corr_df, text_auto=True, width=900, height=900, title='Correlation of dataset')
    fig.show()


compute_correlations_matrix(df)

In [10]:
def print_histograms(dataset):
    fig = px.histogram(dataset, x='Category', y='Installs', barmode='group',
                       title='Total installations of per category')
    fig.show()


print_histograms(df)

In [11]:
def print_scatter_matrix():
    fig = px.scatter(df, x='Genres', y='Reviews', title='Total Reviews of Genres')
    fig.show()


print_scatter_matrix()

### Top 10 paid family categories apps

In [12]:
family_category_paid_apps = apps[(apps['Category'] == 'FAMILY') & (apps['Price'] != 0)]
family_category_paid_apps.sort_values(by='Installs', axis=0, ascending=False, inplace=True)
fig = px.bar(family_category_paid_apps.head(15), x='App', y='Installs', color='App',
             title='The most populars paid apps of Family category')
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





### The most popular genres

In [13]:
family_category_paid_genre = apps[(apps['Category'] == 'FAMILY') & (apps['Price'] != 0)]
family_category_paid_genre.sort_values(by='Installs', axis=0, ascending=False, inplace=True)
fig = px.pie(family_category_paid_genre.head(20), names='Genres', color='Genres', title='The most popular genres',
             width=900, height=900)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### The number of apps installation per category

In [14]:
per_df = df.groupby(by='Category').sum().sort_values(by='Installs', ascending=False)['Installs'].head(15)
fig = px.pie(per_df, names=per_df.index, values='Installs', title='A pie with the number of installation per category',
             color=per_df.index, width=800, height=800)
fig.show()

### Mean price of per category

In [15]:
mean_price_per_category = df.groupby(by="Category").mean(numeric_only=True)['Price'].sort_values(ascending=False).head(
    10)
fig = px.bar(mean_price_per_category, x=mean_price_per_category.index, y='Price', color=mean_price_per_category,
             title='Mean price of per category')
fig.show()

### The most expensive apps of per category

In [16]:
most_expensive = df.sort_values(by='Price', axis=0, ascending=False)
fig = px.sunburst(most_expensive, path=['Category', 'App'], values='Price', color='Genres',
                  title='The most expensive apps of per category', width=800, height=800)
fig.show()