In [1]:
#Data Transformation
import numpy as np
import pandas as pd

In [2]:
#Removing duplicates

dframe = pd.DataFrame({'color':["white","white","red","red","white"],
                      'value':[2,1,3,3,2]})
print(dframe,"\n")
print(dframe.duplicated()) #print bool for duplicated row
# duplicated row = exactly same color, value pair


   color  value
0  white      2
1  white      1
2    red      3
3    red      3
4  white      2 

0    False
1    False
2    False
3     True
4     True
dtype: bool


In [3]:
dframe[dframe.duplicated()] #shows only duplicated entries

Unnamed: 0,color,value
3,red,3
4,white,2


In [4]:
dframe.drop_duplicates() #shows non duplicated entries.

Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3


In [5]:
#Mapping: process of changning lable values

#dictionaries are used for mapping
map ={
    'label1' : "value1",
    'label2' : "value2"
}

#replace(): replaces values
#map() creates a new column
#rename() : replaces the index values

In [6]:
#Replacing values via mapping
frame = pd.DataFrame({"item":["ball","mug","pen","pencil","ashtray"],
                     "color":["white","rosso","verde","black","yellow"],
                     "price":[5.56,4.2,1.3,0.56,2.75]})
print(frame,"\n")
#want to change the value of some of the colors
newcolors = {
    "rosso":"red",
    "verde" :"green"
}

print(frame.replace(newcolors),"\n")

    color     item  price
0   white     ball   5.56
1   rosso      mug   4.20
2   verde      pen   1.30
3   black   pencil   0.56
4  yellow  ashtray   2.75 

    color     item  price
0   white     ball   5.56
1     red      mug   4.20
2   green      pen   1.30
3   black   pencil   0.56
4  yellow  ashtray   2.75 



In [7]:
#can replace entries as well. e.g replace NaN with 0
ser = pd.Series([1,3,np.nan,4,6,np.nan,3])
print(ser,"\n")
print(ser.replace(np.nan,0),"\n")

0    1.0
1    3.0
2    NaN
3    4.0
4    6.0
5    NaN
6    3.0
dtype: float64 

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64 



In [8]:
#Adding values via mapping (instead of substituion)
frame5 = pd.DataFrame({"item":["ball","mug","pen","pencil","ashtray"],
                     "color":["white","rosso","verde","black","yellow"]})
print(frame)
#now you want ot add the price list column that is available
price = {
    "ball":5.56,
    "mug":4.2,
    "bottle":1.3,
    "Scissors":3.41,
    "pen":1.3,
    "pencil":0.56,
    "ashtray":2.75
}

frame["price"] = frame['item'].map(price) # add price as frame["price"]
        # by mapping the names to frame["item"] and adding prices dict

print(frame)

    color     item  price
0   white     ball   5.56
1   rosso      mug   4.20
2   verde      pen   1.30
3   black   pencil   0.56
4  yellow  ashtray   2.75
    color     item  price
0   white     ball   5.56
1   rosso      mug   4.20
2   verde      pen   1.30
3   black   pencil   0.56
4  yellow  ashtray   2.75


In [9]:
#Rename the index of the axes
reindex = {
    0: "first",
    1: "second",
    2: "third",
    3: "fourth",
    4: "fifth"
}
print(frame,"\n")
frame.rename(reindex) #indexes are renamed

    color     item  price
0   white     ball   5.56
1   rosso      mug   4.20
2   verde      pen   1.30
3   black   pencil   0.56
4  yellow  ashtray   2.75 



Unnamed: 0,color,item,price
first,white,ball,5.56
second,rosso,mug,4.2
third,verde,pen,1.3
fourth,black,pencil,0.56
fifth,yellow,ashtray,2.75


In [10]:
recolumn = {"item":"object",
           "price": "value"}
frame.rename(index = reindex, columns = recolumn) #replace column and index

Unnamed: 0,color,object,value
first,white,ball,5.56
second,rosso,mug,4.2
third,verde,pen,1.3
fourth,black,pencil,0.56
fifth,yellow,ashtray,2.75


In [11]:
frame.rename(index ={1:"kkkkk"},columns={"item":"jjjjj"})

Unnamed: 0,color,jjjjj,price
0,white,ball,5.56
kkkkk,rosso,mug,4.2
2,verde,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [13]:
#replace does not change the original frame
frame.rename(columns ={"item":"object"}, inplace = True) # changes the 
                                                         #object
frame

Unnamed: 0,color,object,price
0,white,ball,5.56
1,rosso,mug,4.2
2,verde,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [21]:
#Discretization and Binning
results =[12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87]
len(results)
print()
bins =[0,25,50,75,100] #declare the bins 
cat = pd.cut(results,bins)  #partition the data into bins
cat #results are objects that contain levels array indicating names and
    #labels array containing list of values
    
#print(cat.levels)  #deprecated
print(cat.categories,"\n")

#print(cat.labels) #deprecated
print(cat.codes,"\n")

pd.value_counts(cat) #histogram


Index(['(0, 25]', '(25, 50]', '(50, 75]', '(75, 100]'], dtype='object') 

[0 1 2 2 1 3 3 0 0 2 2 1 3 0 1 3 3] 



(75, 100]    5
(50, 75]     4
(25, 50]     4
(0, 25]      4
dtype: int64

In [27]:
#another value for bins
bin_names=["unlikely","less_likely","likely","highly likely"]
        #bin levels are 1 less than values in bin
pd.cut(results, bins, labels= bin_names)

[unlikely, less_likely, likely, likely, less_likely, ..., highly likely, unlikely, less_likely, highly likely, highly likely]
Length: 17
Categories (4, object): [unlikely < less_likely < likely < highly likely]

In [31]:
cat = pd.cut(results, 5) #split the range into 5 levels
print(cat.categories)
pd.value_counts(cat)

Index(['(2.904, 22.2]', '(22.2, 41.4]', '(41.4, 60.6]', '(60.6, 79.8]',
       '(79.8, 99]'],
      dtype='object')


(79.8, 99]       5
(41.4, 60.6]     4
(22.2, 41.4]     3
(2.904, 22.2]    3
(60.6, 79.8]     2
dtype: int64

In [33]:
#qcut divides into quintiles
quintiles = pd.qcut(results, 5)
print(quintiles,"\n")
print(pd.value_counts(quintiles)) ## of values in each range should be 
                                    #similar

[[3, 24], (24, 46], (62.6, 87], (46, 62.6], (24, 46], ..., (62.6, 87], [3, 24], (46, 62.6], (87, 99], (62.6, 87]]
Length: 17
Categories (5, object): [[3, 24] < (24, 46] < (46, 62.6] < (62.6, 87] < (87, 99]] 

(62.6, 87]    4
[3, 24]       4
(87, 99]      3
(46, 62.6]    3
(24, 46]      3
dtype: int64


In [34]:
#Detecting and Filtering Outliers
randframe = pd.DataFrame(np.random.rand(1000,3))
randframe.describe() #statistics of the columns

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,0.495291,0.494692,0.506694
std,0.283156,0.293646,0.285321
min,0.000416,0.000182,0.000324
25%,0.259428,0.23533,0.260602
50%,0.50115,0.49107,0.513286
75%,0.735503,0.741351,0.751164
max,0.998478,0.99906,0.998569


In [36]:
#apply filtering to remove values more than 3 std deviatons away
(np.abs(randframe) > 3* randframe.std()).head() #show 5 entries 
            #such that entries in randomframe are more than 3 std dev away

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [39]:
randframe[np.abs(randframe) > 3* randframe.std()].head(10)

Unnamed: 0,0,1,2
0,,,
1,,,
2,,,
3,,,
4,,,
5,,0.959352,
6,,,
7,0.965613,,
8,,,
9,,,


In [47]:
randframe[(np.abs(randframe) > 3*(randframe.std())).any(1)].head()
#not sure whats supposed to happen here

Unnamed: 0,0,1,2
5,0.022365,0.959352,0.012346
7,0.965613,0.073339,0.078173
11,0.991071,0.806041,0.823682
12,0.04224,0.936733,0.517043
18,0.811393,0.926609,0.9928


In [53]:
#Permutation
#randome reordering of series and dataframe

nframe = pd.DataFrame(np.arange(25).reshape(5,5))
print(nframe,"\n")

new_order = np.random.permutation(5)#permutation of 5 element array
print(new_order, "\n")

print(nframe.take(new_order),"\n") #note that rows numbers have changed to match
                # the order specified by new_order
    
new_order = [3,4,2] # 0,1 are not considered
nframe.take(new_order)

    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24 

[3 2 0 4 1] 

    0   1   2   3   4
3  15  16  17  18  19
2  10  11  12  13  14
0   0   1   2   3   4
4  20  21  22  23  24
1   5   6   7   8   9 



Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14
