# Data Preprocessing

In [2]:
import pandas as pd
# Location of the raw Iris CSV, kept in a variable so it can be read again.
iris_filename = "irisdata.csv"

# First attempt with read_csv defaults: the file's first line is consumed as
# the column header (visible in the output below).
iris = pd.read_csv(filepath_or_buffer=iris_filename)
iris.head(n=5)

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [5]:
# Re-read the file, this time declaring that it has no header row and
# supplying the column names explicitly.
iris = pd.read_csv(
    iris_filename,
    sep=",",
    decimal=".",
    header=None,
    names=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "target",
    ],
)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


Masking

In [6]:
# Boolean Series: True for every row whose sepal length is greater than 6.0.
mask_feature = iris["sepal_length"].gt(6.0)
mask_feature

0      False
1      False
2      False
3      False
4      False
       ...  
145     True
146     True
147     True
148     True
149    False
Name: sepal_length, Length: 150, dtype: bool

Substitution
`.loc[]`: label-based indexer for selecting (or assigning to) data by row and column labels; the row selector can also be a boolean mask.

In [9]:
# Relabel every "Iris-virginica" row in place, then list the distinct labels.
mask_target = iris["target"].eq("Iris-virginica")
iris.loc[mask_target, "target"] = "New label"
iris["target"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'New label'], dtype=object)

Statistics about features

In [11]:
# Mean of every feature column, computed separately for each target label.
grouped_targets_mean = iris.groupby(by=["target"]).mean()
grouped_targets_mean

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
New label,6.588,2.974,5.552,2.026


In [15]:
# Pass agg a column -> functions mapping so that each variable gets its own
# set of summary statistics.
funcs = dict(
    sepal_length=["mean", "std"],
    sepal_width=["max", "min"],
    petal_length=["mean", "std"],
    petal_width=["max", "min"],
)
grouped_targets_f = iris.groupby(by=["target"]).agg(funcs)
grouped_targets_f

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_width,sepal_width,petal_length,petal_length,petal_width,petal_width
Unnamed: 0_level_1,mean,std,max,min,mean,std,max,min
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Iris-setosa,5.006,0.35249,4.4,2.3,1.464,0.173511,0.6,0.1
Iris-versicolor,5.936,0.516171,3.4,2.0,4.26,0.469911,1.8,1.0
New label,6.588,0.63588,3.8,2.2,5.552,0.551895,2.5,1.4


In [22]:
# Sort the observations by sepal length (ascending) and show the first five.
iris.sort_values("sepal_length", ascending=True).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
13,4.3,3.0,1.1,0.1,Iris-setosa
42,4.4,3.2,1.3,0.2,Iris-setosa
38,4.4,3.0,1.3,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
41,4.5,2.3,1.3,0.3,Iris-setosa


`apply()`: pandas method for performing any row-wise or column-wise operation programmatically.

In [24]:
import numpy as np
# Row-wise: count the non-zero entries of each observation.
iris.apply(np.count_nonzero, axis="columns").head()

0    5
1    5
2    5
3    5
4    5
dtype: int64

In [25]:
# Column-wise: count the non-zero entries of each variable.
iris.apply(np.count_nonzero, axis="index")

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
target          150
dtype: int64

In [26]:
# Length of the string representation of each cell.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated, so use whichever method this pandas version provides.
# The check is made on the class so a column named "map" cannot shadow it.
elementwise = iris.map if hasattr(pd.DataFrame, "map") else iris.applymap
elementwise(lambda cell: len(str(cell))).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,3,3,3,3,11
1,3,3,3,3,11
2,3,3,3,3,11
3,3,3,3,3,11
4,3,3,3,3,11


In [31]:
def square(x):
    """Return ``x`` raised to the power of two.

    Works on anything that supports exponentiation, so it can be applied to a
    scalar as well as to a whole pandas column or row.
    """
    return pow(x, 2)
# Square the four numeric measurements; apply() hands each column to square().
original_variables = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]
squared_iris = iris.loc[:, original_variables].apply(square)

In [None]:
import multiprocessing
def apply_df(args):
    """Run ``DataFrame.apply`` on one chunk described by a packed tuple.

    The three inputs travel in a single tuple because ``Pool.map`` passes
    exactly one argument to the function it calls.

    Parameters
    ----------
    args : tuple
        ``(df, func, kwargs)``: the frame to process, the function to apply,
        and a dict of keyword arguments forwarded to ``DataFrame.apply``.

    Returns
    -------
    Whatever ``df.apply(func, **kwargs)`` returns.
    """
    chunk, transform, apply_options = args
    return chunk.apply(transform, **apply_options)

def parallel_apply(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel by splitting its rows across processes.

    The frame is cut into ``workers`` consecutive row chunks, each chunk is
    processed with ``DataFrame.apply`` in a worker process (via ``apply_df``),
    and the partial results are concatenated in the original row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Function forwarded to ``DataFrame.apply``. It is sent to the worker
        processes, so it must be picklable.
    **kwargs
        ``workers`` (int, optional): number of processes and of row chunks;
        defaults to ``multiprocessing.cpu_count()`` when omitted. Every other
        keyword is forwarded unchanged to ``DataFrame.apply`` (e.g. ``axis=1``).

    Returns
    -------
    The ``pd.concat`` of the per-chunk ``apply`` results.

    Notes
    -----
    Each chunk is processed independently, so the result matches a plain
    ``df.apply`` only for operations that do not need to see all rows at once
    (row-wise or element-wise functions).

    NOTE(review): worker processes must be able to import ``apply_df`` and
    ``func``; with the "spawn" start method, functions defined interactively
    (e.g. in a notebook cell) may not be importable. Confirm on the target
    platform or move the helpers into a ``.py`` module.
    """
    workers = kwargs.pop('workers', None)
    if workers is None:
        workers = multiprocessing.cpu_count()

    # Split row *positions* and slice with iloc instead of handing the frame
    # to np.array_split, which routes through the deprecated
    # DataFrame.swapaxes. Chunk boundaries are the same as before.
    row_chunks = np.array_split(np.arange(len(df)), workers)
    tasks = [(df.iloc[rows], func, kwargs) for rows in row_chunks]

    # The context manager shuts the pool down even when a task raises, so
    # worker processes are not left behind.
    with multiprocessing.Pool(processes=workers) as pool:
        results = pool.map(apply_df, tasks)
    return pd.concat(results)
    
# Square the four measurements row by row (axis=1), spreading the rows over
# four worker processes.
squared_iris = parallel_apply(
    iris[original_variables],
    func=square,
    axis=1,
    workers=4,
)
squared_iris