# 🧨2. Data Transformation

In [8]:
import pandas as pd
import numpy as np 

In [9]:
Data = pd.DataFrame({'k_1':['one','two','three']*2+['two','three'],'k_2':[1,2,3,4,5,6,5,6]})
Data

Unnamed: 0,k_1,k_2
0,one,1
1,two,2
2,three,3
3,one,4
4,two,5
5,three,6
6,two,5
7,three,6


In [10]:
#duplicate value
Data.duplicated() 

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool

In [11]:
#removing duplicate data 
Data.drop_duplicates()

Unnamed: 0,k_1,k_2
0,one,1
1,two,2
2,three,3
3,one,4
4,two,5
5,three,6


In [12]:
Data['V_1']=np.arange(8)
Data

Unnamed: 0,k_1,k_2,V_1
0,one,1,0
1,two,2,1
2,three,3,2
3,one,4,3
4,two,5,4
5,three,6,5
6,two,5,6
7,three,6,7


In [13]:
#by using subset we can delete dulicate row 
Data.drop_duplicates(subset=['k_1'])

Unnamed: 0,k_1,k_2,V_1
0,one,1,0
1,two,2,1
2,three,3,2


In [14]:
#only keep here the last unique value 
#must be stay unique values in both row 
Data.drop_duplicates(subset=['k_2','k_2'],keep='last')

#here we can see 4 and 5th index is deleted  

Unnamed: 0,k_1,k_2,V_1
0,one,1,0
1,two,2,1
2,three,3,2
3,one,4,3
6,two,5,6
7,three,6,7


In [15]:
Data=pd.DataFrame({'food':['bacon','pulled cow','bacon','pastrami','corned beef','bacon','pastrami','honey ham','nova lox'],
                   'ounces':[4,3,12,8,7.5,8,3,5,8]})
Data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled cow,3.0
2,bacon,12.0
3,pastrami,8.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,8.0


In [16]:
#creating a set for maping 
meat_to_animal={
    'bacon':'cow',
    'pulled beef':'cow',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'mutton',
    'nova lox':'salmon'
}

In [17]:
#by maping we can give name of each food 
Data['animal'] = Data['food'].map(meat_to_animal)
Data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,cow
1,pulled cow,3.0,
2,bacon,12.0,cow
3,pastrami,8.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,cow
6,pastrami,3.0,cow
7,honey ham,5.0,mutton
8,nova lox,8.0,salmon


In [18]:
Data_2= pd.DataFrame([1,1,1,2,3,4,5,1])
Data_2

Unnamed: 0,0
0,1
1,1
2,1
3,2
4,3
5,4
6,5
7,1


In [19]:
#replacing data
#[1,2] = row 
#[None,np.nan] = replacing data
Data_2.replace([1,2],[None,np.nan])

Unnamed: 0,0
0,
1,
2,
3,
4,3.0
5,4.0
6,5.0
7,


In [20]:
#best way is using dictionary for replace data
#1 & 2 is key
# None & np.nan is value 
Data_2.replace({1:None ,
                2:np.nan})

Unnamed: 0,0
0,
1,
2,
3,
4,3.0
5,4.0
6,5.0
7,


In [21]:
#Renaming Axis Indexes
Data = pd.DataFrame(np.arange(16).reshape((4,4)),
                    index=['apple','black','orange','doctor'],
                    columns=['country','Bangladesh','dhaka','axeor'])
Data

Unnamed: 0,country,Bangladesh,dhaka,axeor
apple,0,1,2,3
black,4,5,6,7
orange,8,9,10,11
doctor,12,13,14,15


In [22]:
#creating a function
def transformation(i):
  return i[:3].upper() 

In [23]:
Data.index = Data.index.map(transformation)
Data

Unnamed: 0,country,Bangladesh,dhaka,axeor
APP,0,1,2,3
BLA,4,5,6,7
ORA,8,9,10,11
DOC,12,13,14,15


In [24]:
Data.rename(columns= str.upper)

Unnamed: 0,COUNTRY,BANGLADESH,DHAKA,AXEOR
APP,0,1,2,3
BLA,4,5,6,7
ORA,8,9,10,11
DOC,12,13,14,15


#     🧲2.1 Discretization and Binning

In [25]:
ages = [8,10,25,27,12,15,20,65,30]

In [29]:
bins = [5,10,15,25,35,60,100]

In [30]:
#ages are random valus 
#bins are range
#by pd.cut we make a categorical data between ages and bins 
age_categories = pd.cut(ages,bins)
age_categories

[(5, 10], (5, 10], (15, 25], (25, 35], (10, 15], (10, 15], (15, 25], (60, 100], (25, 35]]
Categories (6, interval[int64, right]): [(5, 10] < (10, 15] < (15, 25] < (25, 35] < (35, 60] <
                                         (60, 100]]

In [31]:
#categories indexing num
age_categories.codes

array([0, 0, 2, 3, 1, 1, 2, 5, 3], dtype=int8)

In [32]:
#we can see the valus of categories
age_categories.categories[2]

Interval(15, 25, closed='right')

In [35]:
#how many value exist in a category 
pd.value_counts(age_categories)

(5, 10]      2
(10, 15]     2
(15, 25]     2
(25, 35]     2
(60, 100]    1
(35, 60]     0
dtype: int64

In [42]:
group_names = ['child','youth','adult','midage','old','Senior']

In [43]:
#this valus r converted into names
label_count = pd.cut(ages,bins,labels=group_names)
label_count

['child', 'child', 'adult', 'midage', 'youth', 'youth', 'adult', 'Senior', 'midage']
Categories (6, object): ['child' < 'youth' < 'adult' < 'midage' < 'old' < 'Senior']

In [44]:
pd.value_counts(label_count)

child     2
youth     2
adult     2
midage    2
Senior    1
old       0
dtype: int64

In [46]:
#taking random set
Data = np.random.uniform(size=12)
Data

array([0.06744401, 0.73598825, 0.0993001 , 0.45102449, 0.61750687,
       0.07480322, 0.66860447, 0.74117192, 0.3295165 , 0.03194881,
       0.0703912 , 0.58177249])

In [47]:
#The precision=3 option limits the decimal precision to three digits
pd.cut(Data,3,precision=3)

[(0.0312, 0.268], (0.505, 0.741], (0.0312, 0.268], (0.268, 0.505], (0.505, 0.741], ..., (0.505, 0.741], (0.268, 0.505], (0.0312, 0.268], (0.0312, 0.268], (0.505, 0.741]]
Length: 12
Categories (3, interval[float64, right]): [(0.0312, 0.268] < (0.268, 0.505] < (0.505, 0.741]]

In [50]:
Data = np.random.standard_normal(1000)

In [53]:
quartiles = pd.cut(Data,4,precision=5)
quartiles

[(-0.056421, 1.34566], (-0.056421, 1.34566], (-0.056421, 1.34566], (-1.4585, -0.056421], (-0.056421, 1.34566], ..., (1.34566, 2.74774], (-1.4585, -0.056421], (-0.056421, 1.34566], (-1.4585, -0.056421], (-1.4585, -0.056421]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.8662, -1.4585] < (-1.4585, -0.056421] <
                                           (-0.056421, 1.34566] < (1.34566, 2.74774]]

In [54]:
pd.value_counts(quartiles)

(-1.4585, -0.056421]    419
(-0.056421, 1.34566]    418
(1.34566, 2.74774]       85
(-2.8662, -1.4585]       78
dtype: int64


# 🧲2.2 Detecting and Filtering Outliers

🙌An outlier is an observation that lies an abnormal distance from other values in a random sample from a population

In [60]:
Data= pd.DataFrame(np.random.standard_normal((1000,4)))
Data

Unnamed: 0,0,1,2,3
0,0.199376,-0.135904,1.623214,1.361583
1,1.116802,-0.723841,0.948127,-0.465608
2,-0.359839,0.467387,0.691446,1.465105
3,0.322101,0.339612,0.148619,-0.977162
4,0.914980,-0.634637,1.792981,-0.127364
...,...,...,...,...
995,0.896205,-0.921624,0.275709,-0.879474
996,-0.816697,2.079645,-0.263244,-0.395467
997,0.896777,0.630916,-0.129683,-0.894396
998,0.729816,-0.166779,0.574505,-0.418157


In [63]:
Data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.002929,0.03191,0.017803,0.003296
std,0.981971,0.991163,0.969263,0.970169
min,-2.763655,-2.743913,-2.990582,-3.148702
25%,-0.7169,-0.609999,-0.614357,-0.637509
50%,0.00644,0.034561,0.049222,-0.007025
75%,0.645255,0.67127,0.65371,0.646994
max,3.104615,3.197736,3.422964,2.704315


In [65]:
col = Data[2]
col

0      1.623214
1      0.948127
2      0.691446
3      0.148619
4      1.792981
         ...   
995    0.275709
996   -0.263244
997   -0.129683
998    0.574505
999    0.549896
Name: 2, Length: 1000, dtype: float64

In [67]:
#An outlier is an observation that lies an abnormal distance from other values 
#in a random sample from a population
#and make every values into positive
col[col.abs() >3 ]

431    3.422964
Name: 2, dtype: float64

In [75]:
#in a data frame we comparing the outlier
Data[(Data.abs()>3).any(axis='columns')]

Unnamed: 0,0,1,2,3
38,-0.514173,1.435674,0.094019,-3.04774
336,3.104615,-0.061077,-0.609002,0.04721
431,-0.794841,-0.518359,3.422964,1.326699
742,-0.905955,3.197736,0.645844,-0.656925
917,-0.299174,-0.378838,0.606169,-3.148702


In [76]:
Data[Data.abs()>3] = np.sign(Data)*3
Data

Unnamed: 0,0,1,2,3
0,0.199376,-0.135904,1.623214,1.361583
1,1.116802,-0.723841,0.948127,-0.465608
2,-0.359839,0.467387,0.691446,1.465105
3,0.322101,0.339612,0.148619,-0.977162
4,0.914980,-0.634637,1.792981,-0.127364
...,...,...,...,...
995,0.896205,-0.921624,0.275709,-0.879474
996,-0.816697,2.079645,-0.263244,-0.395467
997,0.896777,0.630916,-0.129683,-0.894396
998,0.729816,-0.166779,0.574505,-0.418157


In [77]:
#the statement np.sign(data) produces 1 and -1 values based on whether the values is data are positive 
np.sign(Data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,-1.0,1.0,-1.0
2,-1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,-1.0
4,1.0,-1.0,1.0,-1.0
