# agaricus-lepiota Data

In [1]:
# import the library
import pandas as pd

# Column labels handed to read_csv through `names=`, listed in the order the
# fields are assigned.
col_names = [
    'classes',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises?',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

# Load the data file; any "?" field is parsed as a missing value (NaN).
agaricus_df = pd.read_csv(
    'agaricus-lepiota.data',
    names=col_names,
    na_values="?",
)

In [2]:
# Per-row tally of missing entries: flag the NaN cells, then add the flags
# across the columns of each row.
agaricus_df.isna().sum(axis='columns')

0       0
1       0
2       0
3       0
4       0
       ..
8119    1
8120    1
8121    1
8122    1
8123    1
Length: 8124, dtype: int64

In [3]:
# Per-column tally of missing entries (sum() reduces down the rows by default).
agaricus_df.isna().sum()

classes                        0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises?                       0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [4]:
# Remove every column that contains at least one missing value.
# dropna() returns a new frame, which is bound back to the same name instead
# of mutating the existing object through inplace=True.
agaricus_df = agaricus_df.dropna(axis='columns')

In [5]:
# Preview the first five rows of the cleaned dataframe.
agaricus_df.head(n=5)

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


# Google Play Store Data

In [6]:
# Load googleplaystore.csv into a dataframe; the first line of the file
# supplies the column names (read_csv default).
google_df = pd.read_csv(filepath_or_buffer='googleplaystore.csv')

In [7]:
# Preview the first five rows.
google_df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [8]:
# For each column, print True when it holds at least one missing value.
print(google_df.isna().any(axis='index'))

App               False
Category          False
Rating             True
Reviews           False
Size              False
Installs          False
Type               True
Price             False
Content Rating     True
Genres            False
Last Updated      False
Current Ver        True
Android Ver        True
dtype: bool


In [9]:
# Per-column tally of missing entries (sum() reduces down the rows by default).
google_df.isna().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [10]:
# Report each column whose number of missing values exceeds 100.
# The counts are computed once for the whole frame, then filtered.
missing_counts = google_df.isnull().sum()
for column_name, n_missing in missing_counts[missing_counts > 100].items():
    print('Column: {} has {} missing values i.e. more than 100'.format(column_name, n_missing))

Column: Rating has 1474 missing values i.e. more than 100


In [11]:
# Dimensions of the frame before any rows/columns are dropped.
row = len(google_df.index)
col = len(google_df.columns)

print('Row:', row)
print('Column:', col)

Row: 10841
Column: 13


In [12]:
# Drop the Rating column.
# The result is bound back to the same name rather than using inplace=True,
# and errors='ignore' keeps the cell re-runnable: once Rating is gone, running
# the cell again is a no-op instead of raising KeyError.
google_df = google_df.drop(columns='Rating', errors='ignore')

In [13]:
# Preview the first five rows.
google_df.head(n=5)

Unnamed: 0,App,Category,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [14]:
# Remove every remaining row that contains at least one missing value.
# dropna() returns a new frame, which is bound back to the same name instead
# of mutating the existing object through inplace=True.
google_df = google_df.dropna()

In [15]:
# Dimensions of the frame that remain after the drops.
row = len(google_df.index)
col = len(google_df.columns)

print('Row:', row)
print('Column:', col)

Row: 10829
Column: 12
