<a href="https://colab.research.google.com/github/a-bily/udemy-ml-bootcamp/blob/main/Module_1_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Importing libraries

In [3]:
import pandas as pd
import numpy as np
import sklearn

2. Generating data - cat characteristics

In [4]:

# Toy dataset: five cats described by three text traits, two numeric
# measurements and the adoption outcome.
data = dict(
    size=['small', 'large', 'medium', 'large', 'medium'],
    color=['brown', 'black', 'white', 'black', 'brown'],
    gender=['female', 'male', 'male', 'female', 'female'],
    age=[1, 12, 6, 7, 11],
    weight=[5, 10, 8, 7, 10],
    adopted=['yes', 'no', 'yes', 'no', 'yes'],
)

# Keep the untouched original under its own name.
df_raw = pd.DataFrame(data)
df_raw

Unnamed: 0,size,color,gender,age,weight,adopted
0,small,brown,female,1,5,yes
1,large,black,male,12,10,no
2,medium,white,male,6,8,yes
3,large,black,female,7,7,no
4,medium,brown,female,11,10,yes


3. Creating a copy of the data

In [5]:
# Work on a copy so that df_raw keeps the original values.
df = df_raw.copy()
# Print the column dtypes and non-null counts of the copy.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   size     5 non-null      object
 1   color    5 non-null      object
 2   gender   5 non-null      object
 3   age      5 non-null      int64 
 4   weight   5 non-null      int64 
 5   adopted  5 non-null      object
dtypes: int64(2), object(4)
memory usage: 368.0+ bytes


4. Changing data type

In [6]:
# Mark the text columns as pandas categoricals.
categorical_columns = ('size', 'color', 'gender', 'adopted')
for name in categorical_columns:
  df[name] = df[name].astype('category')


In [7]:
# Store both numeric measurements as floats.
for name in ('age', 'weight'):
  df[name] = df[name].astype(float)

In [8]:
# Re-check the dtypes after the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   size     5 non-null      category
 1   color    5 non-null      category
 2   gender   5 non-null      category
 3   age      5 non-null      float64 
 4   weight   5 non-null      float64 
 5   adopted  5 non-null      category
dtypes: category(4), float64(2)
memory usage: 628.0 bytes


In [9]:
# Summary statistics; only the numeric columns (age, weight) are reported.
df.describe()

Unnamed: 0,age,weight
count,5.0,5.0
mean,7.4,8.0
std,4.393177,2.12132
min,1.0,5.0
25%,6.0,7.0
50%,7.0,8.0
75%,11.0,10.0
max,12.0,10.0


In [10]:
# Same summary, transposed so that each column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,5.0,7.4,4.393177,1.0,6.0,7.0,11.0,12.0
weight,5.0,8.0,2.12132,5.0,7.0,8.0,10.0,10.0


In [11]:
# Summary of the categorical columns: count, number of distinct values,
# most frequent value and how often it occurs.
df.describe(include='category').T

Unnamed: 0,count,unique,top,freq
size,5,3,medium,2
color,5,3,brown,2
gender,5,2,female,3
adopted,5,2,yes,3


In [12]:
# Show the frame after the dtype changes.
df

Unnamed: 0,size,color,gender,age,weight,adopted
0,small,brown,female,1.0,5.0,yes
1,large,black,male,12.0,10.0,no
2,medium,white,male,6.0,8.0,yes
3,large,black,female,7.0,7.0,no
4,medium,brown,female,11.0,10.0,yes


5. Label Encoder

In [13]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the column, then apply it.
# fit() returns the encoder itself, so construction and fitting are chained;
# l_e.fit_transform(df['adopted']) would do both steps in a single call.
l_e = LabelEncoder().fit(df['adopted'])
l_e.transform(df['adopted'])

array([1, 0, 1, 0, 1])

In [14]:
# Labels learned by the encoder; a label's position in this array is its integer code.
l_e.classes_

array(['no', 'yes'], dtype=object)

In [15]:
# Encode from the original string labels in df_raw rather than from df itself:
# re-running this cell on the already-encoded column would refit the encoder
# on 0/1, after which inverse_transform could no longer recover 'no'/'yes'.
# df is a row-for-row copy of df_raw, so the resulting codes are the same.
df['adopted'] = l_e.fit_transform(df_raw['adopted'])

In [16]:
# 'adopted' now holds the integer codes instead of the yes/no labels.
df

Unnamed: 0,size,color,gender,age,weight,adopted
0,small,brown,female,1.0,5.0,1
1,large,black,male,12.0,10.0,0
2,medium,white,male,6.0,8.0,1
3,large,black,female,7.0,7.0,0
4,medium,brown,female,11.0,10.0,1


In [17]:
# to reverse LabelEncoder transformation --> l_e.inverse_transform(df['adopted'])

6. OneHotEncoder

In [18]:
from sklearn.preprocessing import OneHotEncoder

# The dense-output switch is called `sparse_output` from scikit-learn 1.2 on;
# the old `sparse` keyword was deprecated then and removed in 1.4. Try the new
# name first and fall back, so the cell runs on either side of the rename.
try:
  encoder = OneHotEncoder(sparse_output=False)
except TypeError:
  encoder = OneHotEncoder(sparse=False)

# Each distinct value of the column becomes its own 0/1 indicator column.
encoder.fit(df[['age']])
encoder.transform(df[['age']])

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [19]:
# Distinct values found in each input column; their order matches the
# order of the indicator columns produced by transform().
encoder.categories_

[array([ 1.,  6.,  7., 11., 12.])]

7. Pandas get_dummies

In [20]:
# One-hot encode every categorical column; the numeric columns pass through unchanged.
pd.get_dummies(data=df)

Unnamed: 0,age,weight,adopted,size_large,size_medium,size_small,color_black,color_brown,color_white,gender_female,gender_male
0,1.0,5.0,1,0,0,1,0,1,0,1,0
1,12.0,10.0,0,1,0,0,1,0,0,0,1
2,6.0,8.0,1,0,1,0,0,0,1,0,1
3,7.0,7.0,0,1,0,0,1,0,0,1,0
4,11.0,10.0,1,0,1,0,0,1,0,1,0


In [21]:
# drop_first=True omits the first level of each categorical column;
# prefix_sep sets the separator used in the generated column names.
pd.get_dummies(data=df, drop_first=True, prefix_sep='-')

Unnamed: 0,age,weight,adopted,size-medium,size-small,color-brown,color-white,gender-male
0,1.0,5.0,1,0,1,1,0,0
1,12.0,10.0,0,0,0,0,0,1
2,6.0,8.0,1,1,0,0,1,1
3,7.0,7.0,0,0,0,0,0,0
4,11.0,10.0,1,1,0,1,0,0


In [22]:
# Restrict the encoding to 'gender'; the other categorical columns are left as they are.
pd.get_dummies(data=df, drop_first=True, columns=['gender'])

Unnamed: 0,size,color,age,weight,adopted,gender_male
0,small,brown,1.0,5.0,1,0
1,large,black,12.0,10.0,0,1
2,medium,white,6.0,8.0,1,1
3,large,black,7.0,7.0,0,0
4,medium,brown,11.0,10.0,1,0


7. Standardization

pandas `std()` - unbiased estimator (divides by n - 1, i.e. `ddof=1`)

---


numpy `std()` - biased estimator (divides by n, i.e. `ddof=0`)

In [27]:
# Look at the column and its summary statistics before standardizing it.
age = df['age']
age_mean, age_std = age.mean(), age.std()

print(f"{age}\n")
print(f"Średnia: {age_mean}")
print(f"Odchylenie standardowe: {age_std:.4f}")

0     1.0
1    12.0
2     6.0
3     7.0
4    11.0
Name: age, dtype: float64

Średnia: 7.4
Odchylenie standardowe: 4.3932


In [28]:
# Standardize by hand: subtract the mean, then divide by the standard deviation.
( df['age'] - df['age'].mean() ) / df['age'].std()

0   -1.456805
1    1.047078
2   -0.318676
3   -0.091050
4    0.819453
Name: age, dtype: float64

In [34]:
def standarize(x, ddof=None):
  """Return the z-scores of ``x``, i.e. ``(x - mean) / std``.

  Args:
    x: Object exposing ``mean()`` and ``std()`` and supporting arithmetic
      with their results, e.g. a pandas Series or a NumPy array.
    ddof: Delta degrees of freedom forwarded to ``x.std``. With the default
      ``None`` the call is plain ``x.std()``, so the library default of
      ``x`` applies (pandas: 1, NumPy: 0). Pass ``ddof=0`` on a Series to
      divide by the population standard deviation instead.

  Returns:
    An object of the same kind as ``x`` holding the standardized values.
    A constant input has zero standard deviation, so the result is NaN/inf.
  """
  # Only forward ddof when the caller asked for one, so existing calls keep
  # exactly the behaviour of x.std().
  spread = x.std() if ddof is None else x.std(ddof=ddof)
  return (x - x.mean()) / spread

# Apply the helper to the age column.
standarize(df['age'])

0   -1.456805
1    1.047078
2   -0.318676
3   -0.091050
4    0.819453
Name: age, dtype: float64

In [35]:
from sklearn.preprocessing import scale

# Library counterpart of the manual standardization.
# NOTE(review): the values differ slightly from the pandas-based results,
# presumably because scale() divides by the biased (population) standard
# deviation; confirm against the scikit-learn documentation.
scale(df['age'])

array([-1.62875712,  1.17066918, -0.35629062, -0.10179732,  0.91617588])

In [37]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the single-column frame, then transform that same column.
age_frame = df[['age']]
scaler = StandardScaler().fit(age_frame)
scaler.transform(age_frame)

array([[-1.62875712],
       [ 1.17066918],
       [-0.35629062],
       [-0.10179732],
       [ 0.91617588]])

In [39]:
# Standardize both numeric columns with a freshly fitted scaler and write
# the result back into the frame.
numeric_columns = ['age', 'weight']
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
df

Unnamed: 0,size,color,gender,age,weight,adopted
0,small,brown,female,-1.628757,-1.581139,1
1,large,black,male,1.170669,1.054093,0
2,medium,white,male,-0.356291,0.0,1
3,large,black,female,-0.101797,-0.527046,0
4,medium,brown,female,0.916176,1.054093,1


8. Data preparation

In [41]:
# Final feature table: one-hot encode the remaining categorical columns and
# drop the first level of each so the indicators are not redundant.
# dtype=int pins the indicators to 0/1 integers; without it the dtype depends
# on the pandas version (uint8 before 2.0, bool from 2.0 on).
df = pd.get_dummies(data=df, drop_first=True, dtype=int)
df

Unnamed: 0,age,weight,adopted,size_medium,size_small,color_brown,color_white,gender_male
0,-1.628757,-1.581139,1,0,1,1,0,0
1,1.170669,1.054093,0,0,0,0,0,1
2,-0.356291,0.0,1,1,0,0,1,1
3,-0.101797,-0.527046,0,0,0,0,0,0
4,0.916176,1.054093,1,1,0,1,0,0
