In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv("../input/google-play-store-apps/googleplaystore.csv")
df.head(3)

In [None]:
df.columns

In [None]:
df.info()
df.shape

In [None]:
def missing_value_of_data(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, ordered by descending number of
        missing values, with two columns: ``Total`` (count of nulls) and
        ``Percentage`` (share of rows that are null, rounded to 2 decimals).
    """
    n_rows = len(df)
    null_counts = df.isna().sum().sort_values(ascending=False)
    null_share = (null_counts / n_rows * 100).round(2)
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Total', 'Percentage'],
    )
    return summary


missing_value_of_data(df)

Each column will be explored individually.
1. [App](#app)
2. [Category](#cat)
3. [Rating](#rating)
4. [Reviews](#reviews)
5. [Type](#type)
6. [Size](#size)
7. [Installs](#installs)
8. [Content Rating](#content_rating)
9. [Genres](#genres)
10. [Current Version](#current_version)
11. [Android Version](#android_version)

# <a id=app>App</a>

In [None]:
df["App"].nunique()

In [None]:
df["App"].value_counts()

In [None]:
df[df["App"] == "ROBLOX"]

In [None]:
# Keep only the first row for every app name; the original index labels are preserved.
df = df.drop_duplicates(subset="App")

In [None]:
df["App"].value_counts()

In [None]:
df.shape

# <a id=cat>Category</a>

In [None]:
df["Category"].value_counts()

One of the rows contains an unexpected category value: "1.9".

In [None]:
df[df["Category"] == "1.9"]

The data in this row is not correct (the values appear to be shifted into the wrong columns), so the affected fields are corrected manually below.

In [None]:
# Manual corrections for the malformed row with index label 10472.
row_10472_fixes = {
    "Category": "PHOTOGRAPHY",
    "Installs": "1,000,000+",
    "Price": "0",
    "Last Updated": "July 20, 2018",
    "Current Ver": "1.0.19",
    "Android Ver": "4.0 and up",
}
for column_name, corrected_value in row_10472_fixes.items():
    df.at[10472, column_name] = corrected_value

In [None]:
df[df["App"] == "Life Made WI-Fi Touchscreen Photo Frame"]

In [None]:
df["Category"].value_counts()

# <a id=rating>Rating</a>

In [None]:
df["Rating"].value_counts()

In [None]:
df[df["Rating"] == 19.0]

In [None]:
# Row 10472 carries an invalid rating of 19.0 (Play Store ratings run from 1 to 5).
# Exclude out-of-range values when averaging so the bad value does not
# contaminate the replacement that is written back into that same row.
average_rating = df.loc[df["Rating"] <= 5, "Rating"].mean()
df.at[10472, "Rating"] = round(average_rating, 1)

In [None]:
df.loc[df["App"] == "Life Made WI-Fi Touchscreen Photo Frame"]

In [None]:
df["Rating"].isnull().sum()

In [None]:
# Fill each missing rating with the mean rating of the app's category,
# rounded to one decimal place.
# The "Rating" column must be selected before calling transform: without it
# the lambda is applied to every column of the frame (including text columns,
# where mean() is not defined) and the result is a whole DataFrame rather
# than a single Series that can be assigned to df["Rating"].
df["Rating"] = df.groupby("Category")["Rating"].transform(
    lambda ratings: ratings.fillna(round(ratings.mean(), 1))
)

# <a id=reviews>Reviews</a>

In [None]:
df["Reviews"].value_counts()

# <a id=type>Type</a>

In [None]:
df["Type"].value_counts()

In [None]:
df.loc[df["Type"] == "0"]

In [None]:
df.at[10472, "Type"] = "Free"
df["Type"].value_counts()

In [None]:
df["Type"].isnull().sum()

Now let's find the remaining NaN and change it to "Free", because most of the apps are free.


In [None]:
# Rows whose "Type" is missing.
df[df["Type"].isna()]

In [None]:
df.at[9148, "Type"] = "Free"

In [None]:
# Pie chart of the share of each app type.
sizes = df["Type"].value_counts()
labels = sizes.index
explode = (0, 0.1)  # pull the second slice slightly out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=30,
)
# Equal aspect ratio so the pie is drawn as a circle.
ax1.axis('equal')
ax1.set_title("Percentage of Free and Paid Apps")

plt.show()

# <a id=size>Size</a>

In [None]:
df["Size"].value_counts()

In [None]:
df["Size"].isnull().sum()

In [None]:
df["Size"].unique()

The size of each app is stored as text with a unit suffix for kilo ("k") or mega ("M"). These values should be converted to floats expressed in the same unit (kilobytes).


In [None]:
def size_convert(i):
    """Convert a size label such as ``"19M"`` or ``"201k"`` to kilobytes.

    Parameters
    ----------
    i : object
        Raw value from the ``Size`` column. Strings ending in ``M``/``m``
        are treated as megabytes and strings ending in ``K``/``k`` as
        kilobytes.

    Returns
    -------
    float or object
        The size in kilobytes as a ``float`` when ``i`` is a string made of a
        number followed by a recognised unit suffix. Any other value
        (for example ``"Varies with device"``, ``"1,000+"``, or a non-string
        such as NaN) is returned unchanged.
    """
    # Factor that expresses each unit suffix in kilobytes.
    kilobytes_per_unit = {"M": 1000, "K": 1}

    # Non-strings (e.g. NaN) cannot carry a unit suffix; pass them through.
    if not isinstance(i, str):
        return i

    text = i.strip()
    # Match the suffix case-insensitively: the kilo suffix may be written
    # in lower case ("201k"), which an upper-case-only check would miss.
    factor = kilobytes_per_unit.get(text[-1:].upper())
    if factor is None:
        return i

    try:
        return float(text[:-1]) * factor
    except ValueError:
        # The part before the suffix is not a plain number; leave it as is.
        return i
df["Size"] = df["Size"].apply(size_convert)

# <a id=installs>Installs</a>

In [None]:
df["Installs"].isnull().sum()

In [None]:
df["Installs"].unique()

The values are stored as strings, so they should be converted into integers: first remove the "+" and "," characters, then cast to int.

In [None]:
# Strip the thousands separators and the trailing "+" and cast to int in one pass.
df["Installs"] = df["Installs"].apply(
    lambda raw: int(raw.replace(',', '').replace('+', ''))
)

In [None]:
# Replace each distinct install count by its rank (0 = smallest bucket) so the
# buckets are evenly spaced on the x-axis of the plot.
sorted_value = sorted(df["Installs"].unique())
install_rank = {installs: rank for rank, installs in enumerate(sorted_value)}
# Assign the mapped column back to the frame. Calling
# df["Installs"].replace(..., inplace=True) mutates an intermediate Series,
# which is deprecated and does not update df under pandas Copy-on-Write.
df["Installs"] = df["Installs"].map(install_rank)

fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(x="Installs", y="Rating", color='r', data=df, ax=ax)
ax.set_xlabel("Installs (rank of install bucket)")
ax.set_title('Rating vs Installs', size=20);

In [None]:
df.info()

# <a id=content_rating>Content Rating</a>

In [None]:
df["Content Rating"].isnull().sum()

In [None]:
df["Content Rating"].unique()

In [None]:
# Rows whose "Content Rating" is missing.
df[df["Content Rating"].isna()]

In [None]:
df.at[10472, "Content Rating"] = "Everyone"

# <a id=genres>Genres</a>

In [None]:
df["Genres"].unique()

In [None]:
df.loc[df["Genres"] == "February 11, 2018"]

In [None]:
df.at[10472, "Genres"] = "Photography"

# <a id=current_version>Current Version</a>

In [None]:
df["Current Ver"].nunique()

In [None]:
df["Current Ver"].isnull().sum()

In [None]:
# Fill missing versions with "1.0". The result is assigned back to the frame:
# df["Current Ver"].fillna(..., inplace=True) operates on an intermediate
# Series, which is deprecated and leaves df unchanged under pandas
# Copy-on-Write.
df["Current Ver"] = df["Current Ver"].fillna("1.0")

# <a id=android_version>Android Version</a>

In [None]:
df["Android Ver"].unique()

In [None]:
df["Android Ver"].value_counts()

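To get a clean result, every value should have the form "4.0 and up". The following version ranges do not follow that pattern:

| Android Ver | Count |
|---|---|
| 4.0.3 - 7.1.1 | 2 |
| 5.0 - 8.0 | 2 |
| 7.0 - 7.1.1 | 1 |
| 4.1 - 7.1.1 | 1 |
| 5.0 - 7.1.1 | 1 |
| 5.0 - 6.0 | 1 |
| 2.2 - 7.1.1 | 1 |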

In [None]:
def and_version(i):
    """Normalise an Android version label to the "<version> and up" form.

    * ``"4.4W and up"`` becomes ``"4.4 and up"``.
    * A range such as ``"4.0.3 - 7.1.1"`` becomes ``"4.0.3 and up"`` (the
      part before the first space is kept).
    * Anything else is returned unchanged.
    """
    label = str(i)

    # Special case: the "W" variant is folded into the plain 4.4 label.
    if label == "4.4W and up":
        return "4.4 and up"

    # No dash means the value is not a range; nothing to rewrite.
    if "-" not in label:
        return i

    minimum_version = i.split(" ")[0]
    return str(minimum_version) + " and up"

df["Android Ver"] = df["Android Ver"].apply(and_version)

In [None]:
df["Android Ver"].value_counts()

In [None]:
# Rows whose "Android Ver" is missing.
df.loc[df["Android Ver"].isna()]

In [None]:
# Manual corrections for the malformed row with index label 10472
# (setting the same values again is harmless).
df.at[10472, "Price"] = "0"
df.at[10472, "Last Updated"] = "July 20, 2018"
df.at[10472, "Current Ver"] = "1.0.19"
df.at[10472, "Android Ver"] = "4.0 and up"
# Fill the remaining missing Android versions. The result is assigned back to
# the frame: df["Android Ver"].fillna(..., inplace=True) works on an
# intermediate Series, which is deprecated and leaves df unchanged under
# pandas Copy-on-Write.
df["Android Ver"] = df["Android Ver"].fillna("4.0 and up")
df.info()

## Thank you and good luck everyone! Please support, comment and vote :)