# **CHAPTER 7**
# **Data Cleaning and Preparation**

## **7.1 Handling Missing Data**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small Series of strings with one missing entry (np.nan) at position 2.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [3]:
# Display the Series.
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Boolean mask: True where the entry is missing.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Overwrite the first entry with None, which pandas also counts as missing.
string_data[0]=None

In [6]:
# Re-check the mask: position 0 (None) is now reported as missing too.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### **Filtering Out Missing Data**

In [13]:
from numpy import nan as NA

In [16]:
# Float Series with missing values at positions 1 and 3.
# (The second element was previously spelled `Na`, an undefined name; the
# imported alias is `NA`.)
data=pd.Series([1,NA,3.5,NA,7])

In [17]:
# Drop the missing entries; the surviving values keep their original index labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
# Boolean mask: True where a value is present.
data.notnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [19]:
# Boolean indexing with the notnull() mask gives the same result as dropna().
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [20]:
# 4x3 float frame: one complete row, two partially missing rows,
# and one row that is entirely NA.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [21]:
# Display the frame.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [23]:
# With default arguments, dropna() drops every row that contains at least one NA.
cleaned=data.dropna()

In [24]:
# Display the result: only the fully populated row remains.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [25]:
# how='all' drops a row only when every value in it is NA.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [26]:
# Add a column labelled 4 that consists entirely of NaN values.
data[4]=NA

In [27]:
# Display the frame with the new all-NaN column.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [28]:
# axis=1 with how='all' drops the columns whose values are all NaN.
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [35]:
# 7x3 frame of random draws.
# NOTE(review): no seed is set, so the values differ on every run; the outputs
# recorded further down in this notebook come from different runs and do not
# agree with each other.
df=pd.DataFrame(np.random.randn(7,3))

In [36]:
# Set column 1 to NA in rows 0, 1, 2 and 3.
df.iloc[:4,1]=NA

In [37]:
# Set column 2 to NA in rows 0 and 1.
df.iloc[:2,2]=NA

In [33]:
# Display the frame.
df

Unnamed: 0,0,1,2
0,0.011762,,
1,-1.133186,,
2,1.409459,,1.983643
3,1.307372,,-0.852935
4,0.562114,-0.317223,-1.925702
5,-0.170206,0.449144,-1.205368
6,-0.061716,0.155743,-0.527393


In [34]:
# Keep only the rows that have no missing values.
df.dropna()

Unnamed: 0,0,1,2
4,0.562114,-0.317223,-1.925702
5,-0.170206,0.449144,-1.205368
6,-0.061716,0.155743,-0.527393


In [38]:
# thresh=2 keeps only the rows that have at least two non-NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.286676,,0.027618
3,-0.872796,,0.422522
4,-0.847144,2.436101,-0.18791
5,-0.616959,-1.014195,-0.89731
6,0.049603,-0.959031,-0.638575


### **Filling In Missing Data**

In [39]:
# Replace every NA value with 0 (returns a new frame; df itself is unchanged).
df.fillna(0)

Unnamed: 0,0,1,2
0,0.548111,0.0,0.0
1,-0.02317,0.0,0.0
2,-1.286676,0.0,0.027618
3,-0.872796,0.0,0.422522
4,-0.847144,2.436101,-0.18791
5,-0.616959,-1.014195,-0.89731
6,0.049603,-0.959031,-0.638575


In [40]:
# A dict maps column label -> fill value: NAs in column 1 become 0.5,
# NAs in column 2 become 0.
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,0.548111,0.5,0.0
1,-0.02317,0.5,0.0
2,-1.286676,0.5,0.027618
3,-0.872796,0.5,0.422522
4,-0.847144,2.436101,-0.18791
5,-0.616959,-1.014195,-0.89731
6,0.049603,-0.959031,-0.638575


In [41]:
# Make the fill permanent by rebinding df to the filled frame.
# This replaces `_=df.fillna(0,inplace=True)`: assigning to `_` interferes
# with IPython's "last output" variable, and inplace=True is discouraged
# because it offers no benefit over rebinding.
df=df.fillna(0)

In [42]:
# Display df: the NA values are now 0.
df

Unnamed: 0,0,1,2
0,0.548111,0.0,0.0
1,-0.02317,0.0,0.0
2,-1.286676,0.0,0.027618
3,-0.872796,0.0,0.422522
4,-0.847144,2.436101,-0.18791
5,-0.616959,-1.014195,-0.89731
6,0.049603,-0.959031,-0.638575


In [49]:
# Fresh 6x3 frame of random draws for the forward-fill examples.
# NOTE(review): no seed is set, so the values differ on every run and the
# recorded outputs will not reproduce.
df=pd.DataFrame(np.random.randn(6,3))

In [50]:
# Blank out column 1 from row 2 onward and column 2 from row 4 onward.
df.iloc[2:,1]=NA
df.iloc[4:,2]=NA

In [51]:
# Display the frame.
df

Unnamed: 0,0,1,2
0,-0.165535,-1.314856,0.74642
1,0.485068,0.457917,-0.705274
2,-0.105295,,0.551196
3,-1.843462,,-0.316525
4,0.412626,,
5,-0.176053,,


In [48]:
# Forward-fill: propagate the last valid value down each column into the NaNs
# that follow it (bfill() is the backward counterpart).
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.773331,1.570069,-0.517912
1,0.828808,0.447881,-1.07343
2,0.890076,0.447881,-0.487047
3,-0.471305,0.447881,-0.345904
4,1.192764,0.447881,-0.345904
5,0.070399,0.447881,-0.345904


In [53]:
# limit=2 caps forward filling at two consecutive NaNs per gap, so any further
# NaNs in a longer run stay missing.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.165535,-1.314856,0.74642
1,0.485068,0.457917,-0.705274
2,-0.105295,0.457917,0.551196
3,-1.843462,0.457917,-0.316525
4,0.412626,,-0.316525
5,-0.176053,,-0.316525


In [54]:
# Float Series with missing values at positions 1 and 3.
data = pd.Series([1.0, NA, 3.5, NA, 7.0])

In [56]:
# Replace NaN with the mean of the non-missing values: (1 + 3.5 + 7) / 3 = 3.83.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## **7.2 Data Transformation**

### **Removing Duplicates**

In [57]:
# Seven-row frame in which the last row repeats the one before it.
data = pd.DataFrame(
    {
        'k1': ['one', 'two'] * 3 + ['two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)

In [58]:
# Display the frame.
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [59]:
# Boolean Series: True for a row that repeats an earlier row (any earlier row,
# not only the one directly above it).
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [60]:
# Return a frame with the repeated rows removed; the first occurrence is kept.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [61]:
# Add a column 'v1' holding the values 0 through 6.
data['v1']=range(7)

In [62]:
# Display the frame with the new column.
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [63]:
# Consider only column 'k1' when identifying duplicates.
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [64]:
# Compare on 'k1' and 'k2'; keep='last' retains the last occurrence of each
# combination instead of the first.
data.drop_duplicates(['k1','k2'],keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### **Transforming Data Using a Function or Mapping**