In [1]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

from __future__ import print_function

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global random number generator so that later steps drawing from it
# give the same result on every run.
np.random.seed(0)

# Read data from Titanic dataset.
# The CSV is downloaded over the network on every run; the URL contains the path
# segment 091d371 (presumably a commit hash pinning the file version - confirm).
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


Zmienne ciągłe

In [4]:
numerical_features = ['age', 'fare']

In [5]:
data[numerical_features].isna().sum()

age     263
fare      1
dtype: int64

In [6]:
# Preprocessing for the numeric columns: fill missing values with the column
# median first, then standardise with StandardScaler.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

Zmienne kategoryczne

In [7]:
categorical_features = ['embarked', 'sex', 'pclass']
data[categorical_features].isna().sum()

embarked    2
sex         0
pclass      0
dtype: int64

In [8]:
# Preprocessing for the categorical columns: fill missing values with the most
# frequent category, then one-hot encode. No category is dropped (drop=None);
# handle_unknown='ignore' asks the encoder not to raise on categories that were
# not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop=None, handle_unknown='ignore'))
])

In [9]:
# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' # all remaining columns are dropped
)

Budujemy ostateczny pipeline

In [10]:
# Final pipeline: column preprocessing followed by a logistic regression
# classifier left at its default settings.
clf = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf_lr', LogisticRegression())
])

In [11]:
clf.steps

[('prep',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='median')),
                                                   ('scaler', StandardScaler())]),
                                   ['age', 'fare']),
                                  ('cat',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('encoder',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   ['embarked', 'sex', 'pclass'])])),
 ('clf_lr', LogisticRegression())]

In [12]:
X = data.drop('survived', axis=1)
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [13]:
y = data['survived']

In [14]:
# 70/30 train/test split. No random_state is passed, so the shuffle presumably
# depends on NumPy's global random state seeded at the top of the notebook -
# NOTE(review): confirm before relying on the exact scores shown below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [15]:
X_train.shape

(916, 13)

In [16]:
X_test.shape

(393, 13)

In [17]:
clf.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['embarked', 'sex',
   

In [18]:
clf.score(X_test, y_test)

0.7913486005089059

In [19]:
clf.score(X_train, y_train)

0.7827510917030568

## Przypomnienie z OOP

In [20]:
class Rectangle:
    """Rectangle described by a width and a height."""

    def __init__(self, width, height):
        # The leading underscore marks both dimensions as internal state.
        self._width, self._height = width, height

    def get_width(self):
        """Return the stored width."""
        return self._width

    def get_height(self):
        """Return the stored height."""
        return self._height

    def calculate_area(self):
        """Return the width multiplied by the height."""
        area = self._width * self._height
        return area

    def calculate_perimeter(self):
        """Return twice the sum of the width and the height."""
        half_perimeter = self._width + self._height
        return 2 * half_perimeter

    def hello(self):
        """Print a greeting that identifies the shape."""
        print('Hello rectangle')

In [21]:
rect = Rectangle(width = 3, height = 2)

In [22]:
rect.calculate_area()

6

In [23]:
rect.get_width()

3

In [24]:
rect._width = 5 # never do this - create getters and setters instead

In [25]:
rect.calculate_area()

10

In [26]:
class Square:
    """Stand-alone square; the single side length is stored as ``_width``."""

    def __init__(self, width):
        self._width = width

    def get_width(self):
        """Return the side length."""
        return self._width

    def get_height(self):
        """Return the side length (the height of a square equals its width)."""
        return self._width

    def calculate_area(self):
        """Return the side length multiplied by itself."""
        side = self._width
        return side * side

    def calculate_perimeter(self):
        """Return twice the sum of two sides."""
        side = self._width
        return 2 * (side + side)

In [27]:
sq = Square(width=4)

In [28]:
sq.calculate_area()

16

In [29]:
class Square(Rectangle):
    """Square expressed as a Rectangle whose two dimensions are equal.

    Everything except the constructor and the greeting comes from Rectangle.
    """

    def __init__(self, width):
        # Hand the single side length to the parent as both dimensions.
        side = width
        super().__init__(height=side, width=side)

    def hello(self):
        """Print the square-specific greeting, overriding the parent's."""
        print('Hi square!')

In [30]:
sq2 = Square(width=5)

In [31]:
sq2.get_height()

5

In [32]:
sq2.calculate_area()

25

In [33]:
sq2.hello()

Hi square!


### Zadanie 0

Zaimplementuj MyStandardScaler

In [34]:
?StandardScaler

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

In [36]:
class MyStandardScaler(BaseEstimator, TransformerMixin):
    """Column-wise centring and scaling of a pandas DataFrame.

    Parameters
    ----------
    with_mean : bool, default=True
        If True, subtract each column's mean (learned in ``fit``).
    with_std : bool, default=True
        If True, divide each column by its standard deviation (learned in
        ``fit``). The deviation comes from ``DataFrame.std``, i.e. the pandas
        default sample estimate (ddof=1).

    Notes
    -----
    A column whose standard deviation is 0 (constant column) or NaN (for
    example a fit on a single row) is divided by 1 instead, so it is left
    unscaled rather than turned into NaN / inf values.
    """

    def __init__(self, with_mean=True, with_std=True):

        # scaler settings
        self.with_mean = with_mean
        self.with_std = with_std

        # per-column statistics, filled in by fit()
        self._means = None
        self._stds = None

    def fit(self, X, y=None):  # X is expected to be a DataFrame; y is ignored
        """Learn the per-column mean and/or standard deviation of ``X``.

        Returns
        -------
        self
        """
        # Forget statistics from any earlier fit so that a refit with different
        # settings cannot leave stale values behind.
        self._means = None
        self._stds = None

        if self.with_mean:
            self._means = X.mean(axis=0).values
        if self.with_std:
            stds = X.std(axis=0).values
            # Dividing by a zero or undefined deviation would produce NaN / inf;
            # use a neutral scale of 1 for such columns.
            self._stds = np.where(np.isnan(stds) | (stds == 0), 1.0, stds)
        return self

    def transform(self, X):
        """Return a centred and/or scaled copy of ``X``; ``X`` is not modified."""
        X_transformed = X.copy()

        if self.with_mean:
            X_transformed = X_transformed - self._means

        if self.with_std:
            X_transformed = X_transformed / self._stds

        return X_transformed

### Testy klasy

#### 1. with_mean=True, with_std=True

In [37]:
scaler = MyStandardScaler(with_mean=True, with_std=True)
X_scaled = scaler.fit_transform(X_train[['age', 'fare']]) # sprawdzamy, czy dobrze działa dziedzieczenie

In [38]:
X_scaled.mean()

age     2.247415e-18
fare    5.018426e-16
dtype: float64

In [39]:
X_scaled.std()

age     1.0
fare    1.0
dtype: float64

#### 2. with_mean=False, with_std=True

In [40]:
scaler = MyStandardScaler(with_mean=False, with_std=True)
X_scaled = scaler.fit_transform(X_train[['age', 'fare']]) # sprawdzamy, czy dobrze działa dziedzieczenie
X_scaled.mean()

age     2.169580
fare    0.686091
dtype: float64

In [41]:
X_scaled.std()

age     1.0
fare    1.0
dtype: float64

#### 3. with_mean=True, with_std=False

In [42]:
scaler = MyStandardScaler(with_mean=True, with_std=False)
X_scaled = scaler.fit_transform(X_train[['age', 'fare']]) # sprawdzamy, czy dobrze działa dziedzieczenie
X_scaled.mean()

age     3.279428e-15
fare    2.397694e-14
dtype: float64

In [43]:
X_scaled.std()

age     13.934240
fare    47.416524
dtype: float64

In [44]:
scaler.get_params() # efekt dziedziczenia po BaseEstimator

{'with_mean': True, 'with_std': False}

In [45]:
def divide(a, b):
    """Divide ``a`` by ``b`` using two sentinel results.

    Returns 0 whenever ``a`` equals 0 (this is checked first, so it also
    covers ``b == 0``), -1 when ``b`` equals 0, and ``a / b`` otherwise.
    """
    if a == 0:
        return 0
    return -1 if b == 0 else a / b

In [46]:
import unittest
from unittest import TestCase

In [47]:
class TestDivide(TestCase): # Klasa musi nazywać się Test<Cośtam>
    def setUp(self):
        self.a = 3
    
    def test_div_0(self):  # Metoda musi nazywać się test_<cośtam>
        b = 0
        result = divide(self.a, b)
        
        self.assertEqual(result, -1, "Wrong answer, should be -1")
        
    def test_div_normal(self):  # Metoda musi nazywać się test_<cośtam>
        b = 2
        result = divide(self.a, b)
        
        self.assertEqual(result, 1.5, "Wrong answer, should be 1.5")

In [48]:
class TestMyStandardScaler(TestCase):
    def setUp(self):
        self.dataset = pd.DataFrame({'value': [1, 1, 1, 1]})
    
    def test_scale_with_mean_without_std(self):
        scaler = MyStandardScaler(with_mean=True, with_std=False)
        X_scaled = scaler.fit_transform(self.dataset)

        self.assertEqual(X_scaled['value'].values.tolist(),
                        [0, 0, 0, 0],
                        "Wrong scalling")
        
    def test_scale_with_mean_with_std(self):
        dataset = pd.DataFrame({'value': [-1, -1, 1, 1]})
        scaler = MyStandardScaler(with_mean=True, with_std=True)
        X_scaled = scaler.fit_transform(dataset)

        self.assertEqual(X_scaled['value'].values.tolist(),
                        [0, 0, 0, 0], # dobrać wartości 
                        "Wrong scalling")

### Zadanie 1

Zaimplementuj transformer, który usuwa wybrane kolumny.

In [49]:
class DropColumns(BaseEstimator, TransformerMixin):
    """
    Transformer to drop specified columns.

    Parameters
    ----------
    column_names : list of column labels
        Columns removed by ``transform``. Every label must exist in the frame
        passed to ``transform``; otherwise pandas raises ``KeyError``.
    """

    def __init__(self, column_names):
        # Stored under the constructor argument's own name: BaseEstimator.get_params()
        # - and through it clone(), repr() and grid search - looks the value up
        # by that name.
        self.column_names = column_names

    @property
    def columns_to_drop(self):
        """Alias of ``column_names`` kept for code that used the old attribute."""
        return self.column_names

    @columns_to_drop.setter
    def columns_to_drop(self, value):
        self.column_names = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns; ``X`` is not modified."""
        return X.drop(columns=self.column_names)

In [50]:
class TestDropColumns(TestCase):
    def setUp(self):
        self.dataset = pd.DataFrame({
            'a': [1, 1, 1, 1],
            'b': [1, 1, 1, 1],
            'c': [1, 1, 1, 1],
            'd': [1, 1, 1, 1]
        })
    
    def test_dropping_existing_columns(self):
        dropper = DropColumns(column_names=['a', 'b'])
        X_dropped = dropper.fit_transform(self.dataset)
        self.assertEqual(list(X_dropped.columns),
                        ['c', 'd'],
                        'Issue with dropping existing columns')
        
    def test_dropping_nonexisting_columns(self):
        dropper = DropColumns(column_names=['a', 'b', 'zzzzzzz'])
        error = None
        try:
            X_dropped = dropper.fit_transform(self.dataset)
        except KeyError:
            error = 'key'
        self.assertEqual(error, 'key', 'Wrong error with nonexisting columns')

In [51]:
dropper = DropColumns(column_names=['pclass', 'fare', 'name'])
dropper.fit_transform(X_train).head()

Unnamed: 0,sex,age,sibsp,parch,ticket,cabin,embarked,boat,body,home.dest
501,female,13.0,0,1,250644,,S,14.0,,"England / Bennington, VT"
588,female,4.0,1,1,29103,,S,14.0,,"Cornwall / Akron, OH"
402,female,30.0,1,0,SC/PARIS 2148,,C,12.0,,"Barcelona, Spain / Havana, Cuba"
1193,male,,0,0,36209,,Q,,,
686,female,22.0,0,0,334914,,Q,13.0,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."


In [52]:
# fajnie, ale co gdy podamy kolumnę, która nie istnieje?
dropper = DropColumns(column_names=['pclass', 'fare', 'name', 'saffd'])
dropper.fit_transform(X_train).head()

KeyError: "['saffd'] not found in axis"

In [None]:
class DropColumnsFancy(BaseEstimator, TransformerMixin):
    """
    Transformer to drop specified columns, ignoring names that are not present.

    Parameters
    ----------
    column_names : list of column labels
        Columns removed by ``transform``. Labels that do not occur in the frame
        passed to ``transform`` are skipped silently.
    """

    def __init__(self, column_names):
        # Stored under the constructor argument's own name: BaseEstimator.get_params()
        # - and through it clone(), repr() and grid search - looks the value up
        # by that name.
        self.column_names = column_names

    @property
    def columns_to_drop(self):
        """Alias of ``column_names`` kept for code that used the old attribute."""
        return self.column_names

    @columns_to_drop.setter
    def columns_to_drop(self, value):
        self.column_names = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame without those configured columns that exist in ``X``."""
        requested = set(self.column_names)
        # Keep the frame's own column order instead of handing pandas an unordered set.
        present = [column for column in X.columns if column in requested]
        return X.drop(columns=present)

In [None]:
dropper = DropColumnsFancy(column_names=['pclass', 'fare', 'name', 'saffd'])
dropper.fit_transform(X_train).head()

In [None]:
class TestDropColumnsFancy(TestCase):
    def setUp(self):
        self.dataset = pd.DataFrame({
            'a': [1, 1, 1, 1],
            'b': [1, 1, 1, 1],
            'c': [1, 1, 1, 1],
            'd': [1, 1, 1, 1]
        })
    
    def test_dropping_existing_columns(self):
        dropper = DropColumnsFancy(column_names=['a', 'b'])
        X_dropped = dropper.fit_transform(self.dataset)
        self.assertEqual(list(X_dropped.columns),
                        ['c', 'd'],
                        'Issue with dropping existing columns')
        
    def test_dropping_nonexisting_columns(self):
        dropper = DropColumnsFancy(column_names=['a', 'b', 'zzzzzzz'])
        X_dropped = dropper.fit_transform(self.dataset)
        self.assertEqual(list(X_dropped.columns),
                        ['c', 'd'],
                        'Issue with dropping nonexisting columns')

### Zadanie 2

Zaimplementuj transformator, który wybiera z danych kolumny określonego typu.
* argument `column_types` - typ lub lista typów, które chcemy uwzględnić
* użyj metody pandasowej ramki danych `select_dtypes`

In [None]:
class ColumnsSelectorByType(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps only the columns whose dtype is among ``column_types``.

    ``column_types`` is handed unchanged to ``DataFrame.select_dtypes`` as its
    ``include`` argument.
    """

    def __init__(self, column_types):
        self.column_types = column_types

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the sub-frame of ``X`` made of the columns with the requested dtypes."""
        wanted_types = self.column_types
        selected = X.select_dtypes(include=wanted_types)
        return selected

In [None]:
col_selector = ColumnsSelectorByType(column_types=[np.number])
col_selector.fit_transform(X)

In [None]:
class TestColumnsSelectorByType(unittest.TestCase):
    def setUp(self):
        self.dataset = pd.DataFrame({
            'a': [1.0, 1.0, 1.0, 1.0],
            'b': [1, 1, 1, 1],
            'c': ['a', 'a', 'a', 'a'],
        })
    
    def test_select_int(self):
        col_selector = ColumnsSelectorByType([np.int64])
        
        X_selected = col_selector.fit_transform(self.dataset)
        self.assertEqual(list(X_selected.columns),
                         ['b'],
                         'Issue with selecting integer columns')
    
    def test_select_int_2(self):
        col_selector = ColumnsSelectorByType(['int'])
        
        X_selected = col_selector.fit_transform(self.dataset)
        self.assertEqual(list(X_selected.columns),
                         ['b'],
                         'Issue with selecting integer columns')
    
    def test_select_float(self):
        col_selector = ColumnsSelectorByType([float])
        
        X_selected = col_selector.fit_transform(self.dataset)
        self.assertEqual(list(X_selected.columns),
                         ['a'],
                         'Issue with selecting float columns')
        
    def test_select_string(self):
        col_selector = ColumnsSelectorByType([object])
        
        X_selected = col_selector.fit_transform(self.dataset)
        self.assertEqual(list(X_selected.columns),
                         ['c'],
                         'Issue with selecting string columns')

### Zadanie 3

Zaimplementuj transformer, który zamienia zmienne, w których ponad `threshold` procent obserwacji zawiera brak danych, na zmienne binarne z wartościami 1 tam, gdzie jest dana wartość, oraz 0 tam, gdzie występuje brak.

In [63]:

class MissingIndicatorForSparseFeatures(BaseEstimator, TransformerMixin):
    """Replace sparsely populated columns with 0/1 presence indicators.

    Columns whose share of missing values in the fitted data exceeds
    ``threshold`` are replaced by integers: 1 where a value is present,
    0 where it is missing. All other columns are passed through unchanged.

    Parameters
    ----------
    threshold : float
        Share of missing values above which a column is converted. A value
        greater than 1 is read as a percentage (10 means 10%); a value of at
        most 1 is read as a fraction (0.1 means 10%).
    """

    def __init__(self, threshold):
        # Keep the argument exactly as given: scikit-learn's clone() refuses
        # estimators whose constructor modifies a parameter. The percentage is
        # converted to a fraction in fit() instead.
        self.threshold = threshold

        # columns selected by fit()
        self._columns_to_transform = None

    def _threshold_as_fraction(self):
        """Return ``threshold`` as a fraction, treating values above 1 as percentages."""
        if self.threshold > 1:
            return self.threshold / 100
        return self.threshold

    def fit(self, X, y=None):
        """Remember which columns of ``X`` have too many missing values."""
        missing_share = X.isnull().mean(axis=0)
        is_sparse = (missing_share > self._threshold_as_fraction()).to_numpy()

        self._columns_to_transform = X.columns[is_sparse]

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the sparse columns turned into 0/1 indicators."""
        # Work on a copy: assigning into X itself would silently overwrite the
        # caller's DataFrame.
        X_transformed = X.copy()

        columns = self._columns_to_transform
        if len(columns) > 0:
            X_transformed[columns] = X_transformed[columns].notnull().astype(int)

        return X_transformed

In [65]:
X.isnull().mean()

pclass       0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [66]:
sparse_indicator = MissingIndicatorForSparseFeatures(threshold = 10)
sparse_indicator.fit_transform(X)

# home.dest and age (among other sparse columns) end up binarized

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,1,0,0,24160,211.3375,1,S,1,0,1
1,1,"Allison, Master. Hudson Trevor",male,1,1,2,113781,151.5500,1,S,1,0,1
2,1,"Allison, Miss. Helen Loraine",female,1,1,2,113781,151.5500,1,S,0,0,1
3,1,"Allison, Mr. Hudson Joshua Creighton",male,1,1,2,113781,151.5500,1,S,0,1,1
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,1,1,2,113781,151.5500,1,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Zabour, Miss. Hileni",female,1,1,0,2665,14.4542,0,C,0,1,0
1305,3,"Zabour, Miss. Thamine",female,0,1,0,2665,14.4542,0,C,0,0,0
1306,3,"Zakarian, Mr. Mapriededer",male,1,0,0,2656,7.2250,0,C,0,1,0
1307,3,"Zakarian, Mr. Ortin",male,1,0,0,2670,7.2250,0,C,0,0,0


In [None]:
class TestMissingIndicatorForSparseFeatures(unittest.TestCase):
    def setUp(self):
        self.dataset = pd.DataFrame({
            'a': [1, 1, 1, 1, None],
            'b': [1, 1, 1, None, None],
            'c': [1, 1, None, None, None]
        })

### Zadanie 4

Zaimplementuj transformator `ReduceRareValues`, który redukuje zbiór wartości zmiennych nominalnych poprzez zastępowanie wartości występujących w mniej niż `threshold` obserwacji wartością `replace_value`, domyślnie równą `"rare_value"`.

In [None]:

class ReduceRareValues(BaseEstimator, TransformerMixin):
    """Collapse infrequent values of nominal columns into one placeholder.

    For every column of the fitted frame, values that occur in fewer than
    ``threshold`` observations are replaced by ``replace_value``.

    Parameters
    ----------
    threshold : int
        Minimum number of observations a value needs in the fitted data to be
        kept as it is.
    replace_value : default='rare_value'
        Value written in place of every rare value.

    Notes
    -----
    Missing values are left untouched. A value that appears in ``transform``
    but was not seen in ``fit`` is not among the kept values and is therefore
    replaced as well.
    """

    def __init__(self, threshold, replace_value='rare_value'):

        self.threshold = threshold
        self.replace_value = replace_value

        # mapping column -> list of values frequent enough to keep; set by fit()
        self._classes_to_keep = None

    def fit(self, X, y=None):
        """Count the values of every column of ``X`` and remember the frequent ones."""
        classes_to_keep = {}
        for column in X.columns:
            counts = X[column].value_counts()  # missing values are not counted
            classes_to_keep[column] = counts[counts >= self.threshold].index.tolist()

        self._classes_to_keep = classes_to_keep
        return self

    def transform(self, X):
        """Return a copy of ``X`` with rare values replaced; ``X`` is not modified."""
        X_transformed = X.copy()

        for column, kept_values in self._classes_to_keep.items():
            values = X_transformed[column]
            is_rare = values.notnull() & ~values.isin(kept_values)
            # Only touch columns that actually contain rare values, so the
            # dtype of the other columns is preserved.
            if is_rare.any():
                X_transformed[column] = values.mask(is_rare, self.replace_value)

        return X_transformed

### Zadanie 5 

Z zaimplementowanych transformatorów skonstruuj pipeline do przetworzenia danych titanic od surowego zbioru do zbioru gotowego do modelowania i przetestuj model regresji logistycznej z domyślnymi parametrami. Pipeline ma przebiegać następująco:
1. Usuń kolumny: `body, boat, name, ticket, cabin, embarked, home.dest`
2. Podziel zbiór na zmienne numeryczne i kategoryczne - połącz oba po osobnym przetworzeniu. Użyj `FeatureUnion`.

3a. Zmienne numeryczne - uzupełnij braki danych średnią

3b. Zmienne kategoryczne:
    - zmienne z brakami w ponad 50% obserwacji zamień na zmienne binarne
    - uzupełnij braki danych wartością `missing_value`
    - zredukuj wartości występujące w co najwyżej 20 obserwacjach
    - zakoduj te zmienne kodowaniem one-hot, zwracając macierz gęstą

In [None]:
# Run the TestCase classes defined in this notebook from inside the kernel.
# argv=[''] supplies a bare argument list so unittest does not try to parse the
# kernel's own command-line arguments; exit=False keeps it from calling
# sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)