## Missing Data
There will be times when the data you have imported has missing values, NaN or zero values. You can take care of that by following any or some of these steps.

In [1]:
#import pandas
import pandas as pd

## Reading a CSV

In [2]:
# Load the car-advertisement sample dataset directly from GitHub.
# header=0 tells pandas to take the column names from the first row of the CSV.
car_dataset_url = 'https://raw.githubusercontent.com/ankitind/sample_datasets/master/car_ad.csv'
car_ads = pd.read_csv(filepath_or_buffer=car_dataset_url, header=0)




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9576 entries, 0 to 9575
Data columns (total 10 columns):
car             9576 non-null object
price           9576 non-null float64
body            9576 non-null object
mileage         9576 non-null int64
engV            9142 non-null float64
engType         9576 non-null object
registration    9576 non-null object
year            9576 non-null int64
model           9576 non-null object
drive           9065 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 748.2+ KB


## Let's understand the data first

In [6]:
# Summary statistics of the numeric columns, computed only over rows
# that have no missing value in any column.
car_ads.dropna(how='any').describe()

Unnamed: 0,price,mileage,engV,year
count,8739.0,8739.0,8739.0,8739.0
mean,15733.542261,140.095434,2.588607,2006.609681
std,24252.90481,97.892213,5.41667,6.968947
min,0.0,0.0,0.1,1959.0
25%,5000.0,71.0,1.6,2004.0
50%,9250.0,130.0,2.0,2008.0
75%,16800.0,195.5,2.5,2012.0
max,547800.0,999.0,99.99,2016.0


In [21]:
# Same complete-rows subset, but summarising the text (object) columns:
# count, number of unique values, most frequent value and its frequency.
car_ads.dropna(how='any').describe(include=['object'])

Unnamed: 0,car,body,engType,registration,model,drive
count,8739,8739,8739,8739,8739,8739
unique,83,6,4,2,827,3
top,Volkswagen,sedan,Petrol,yes,E-Class,front
freq,860,3321,4065,8236,182,4973


In [10]:
# .shape returns a (rows, columns) tuple for the DataFrame
car_ads.shape


(9576, 10)

In [15]:
# .dtypes lists the data type pandas assigned to each column
car_ads.dtypes

car              object
price           float64
body             object
mileage           int64
engV            float64
engType          object
registration     object
year              int64
model            object
drive            object
dtype: object

### Finding rows (observations) with null or NaN values

In [249]:

# Select the rows whose 'drive' value is missing and summarise that subset.
car_ads.loc[car_ads['drive'].isnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 511 entries, 4 to 9566
Data columns (total 10 columns):
car             511 non-null object
price           511 non-null float64
body            511 non-null object
mileage         511 non-null int64
engV            403 non-null float64
engType         511 non-null object
registration    511 non-null object
year            511 non-null int64
model           511 non-null object
drive           0 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 43.9+ KB


In [None]:
# Preview the rows whose 'drive' value is missing.
# .head() limits the display to the first five matches instead of
# dumping every matching row into the notebook output.
car_ads[pd.isnull(car_ads['drive'])].head()




### Columns that have all non-zero values

In [266]:
# Reduce down each column: True only when every value in the column is truthy
# (i.e. the column contains no zero / empty value).
car_ads.all(axis=0)

car              True
price           False
body             True
mileage         False
engV             True
engType          True
registration     True
year             True
model            True
drive            True
dtype: bool

### Columns that have any non-zero values

In [267]:
# Reduce down each column: True when at least one value in the column is truthy
# (i.e. the column is not entirely zero / empty).
car_ads.any(axis=0)

car             True
price           True
body            True
mileage         True
engV            True
engType         True
registration    True
year            True
model           True
drive           True
dtype: bool

### Which rows or columns have NaN

In [269]:
# Build a boolean mask of missing cells, then flag every column
# that contains at least one missing value.
pd.isnull(car_ads).any()

car             False
price           False
body            False
mileage         False
engV             True
engType         False
registration    False
year            False
model           False
drive            True
dtype: bool

### Which rows or columns have no NaN

In [271]:
# Build a boolean mask of present (non-missing) cells, then flag every column
# in which all values are present.
pd.notnull(car_ads).all()

car              True
price            True
body             True
mileage          True
engV            False
engType          True
registration     True
year             True
model            True
drive           False
dtype: bool

### Drop rows with NaN using `dropna()`

In [289]:
# how='all' drops a row only when every value in it is NaN;
# how='any' drops a row as soon as one of its values is NaN.
car_ads_no_missing_dropped_all = car_ads.dropna(axis=0, how='all')
car_ads_no_missing_dropped_any = car_ads.dropna(axis=0, how='any')

# Print both summaries, separated by a divider, to compare the row counts.
car_ads_no_missing_dropped_any.info()
print("---")
car_ads_no_missing_dropped_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8739 entries, 0 to 9575
Data columns (total 10 columns):
car             8739 non-null object
price           8739 non-null float64
body            8739 non-null object
mileage         8739 non-null int64
engV            8739 non-null float64
engType         8739 non-null object
registration    8739 non-null object
year            8739 non-null int64
model           8739 non-null object
drive           8739 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 751.0+ KB
---
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9576 entries, 0 to 9575
Data columns (total 10 columns):
car             9576 non-null object
price           9576 non-null float64
body            9576 non-null object
mileage         9576 non-null int64
engV            9142 non-null float64
engType         9576 non-null object
registration    9576 non-null object
year            9576 non-null int64
model           9576 non-null object
drive          

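### Using Threshold in dataframes
The `thresh=` keyword argument of `dropna()` keeps only the rows/columns that have at least that many non-NaN values. With `thresh=9500` and `axis='columns'` on this 9576-row dataset, every column with more than 76 missing values is dropped.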

In [318]:
# Keep only the columns that hold at least 9500 non-NaN values.
car_ads_no_missing_dropped_col = car_ads.dropna(axis=1, thresh=9500)

# Preview the frame before and after the column drop.
print(car_ads.head(5))
print("---")
print(car_ads_no_missing_dropped_col.head(5))

             car    price       body  mileage  engV engType registration  \
0           Ford  15500.0  crossover       68   2.5     Gas          yes   
1  Mercedes-Benz  20500.0      sedan      173   1.8     Gas          yes   
2  Mercedes-Benz  35000.0      other      135   5.5  Petrol          yes   
3  Mercedes-Benz  17800.0        van      162   1.8  Diesel          yes   
4  Mercedes-Benz  33000.0      vagon       91   NaN   Other          yes   

   year    model  drive  
0  2010     Kuga   full  
1  2011  E-Class   rear  
2  2008   CL 550   rear  
3  2012    B 180  front  
4  2013  E-Class    NaN  
---
             car    price       body  mileage engType registration  year  \
0           Ford  15500.0  crossover       68     Gas          yes  2010   
1  Mercedes-Benz  20500.0      sedan      173     Gas          yes  2011   
2  Mercedes-Benz  35000.0      other      135  Petrol          yes  2008   
3  Mercedes-Benz  17800.0        van      162  Diesel          yes  2012   
4  