In [1]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

from __future__ import print_function

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Read data from Titanic dataset.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [None]:
#ZMIENNE CIĄGŁE NA POCZĄTEK

In [6]:
numerical_features = ['age', 'fare'] 
# zobaczmy czy mamy jakies braki w tych danych

data[numerical_features].isna().sum()

age     263
fare      1
dtype: int64

In [None]:
SimpleImputer(strategy='median')  # będziemy zastępowali braki wartością mediany

In [9]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
#stworzyliśmy pipeline

In [None]:
# Zmienne kategoryczne

In [10]:
categorical_features = ['embarked', 'sex', 'pclass']
data[categorical_features].isna().sum()

embarked    2
sex         0
pclass      0
dtype: int64

In [13]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop=None, handle_unknown='ignore'))
])

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)
# wszystkie pozostałe kolumny usuwanmy

In [None]:
#budujemy ostatecznie pilpeline

In [17]:
clf = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf_lr', LogisticRegression())
])

In [18]:
clf.steps

[('prep',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='median')),
                                                   ('scaler', StandardScaler())]),
                                   ['age', 'fare']),
                                  ('cat',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('encoder',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   ['embarked', 'sex', 'pclass'])])),
 ('clf_lr', LogisticRegression())]

In [19]:
X = data.drop('survived', axis=1)
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [20]:
y = data['survived']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [22]:
X_train.shape

(916, 13)

In [23]:
X_train.shape

(916, 13)

In [25]:
clf.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['embarked', 'sex',
   

In [26]:
clf.score(X_test, y_test)

0.7913486005089059

In [27]:
clf.score(X_train, y_train)

0.7827510917030568

# PRZYPOMNIENIE Z PROJEKTOWANIA OBIEKTOWEGO

### Zadanie 0

Zaimplementuj MyStandardScaler

In [None]:
?StandardScaler

In [None]:
# ZAIMPLEMENTOWAC STANDARDSCAER KTORY BEDZIE ZWRACAL DATA FRAME 

In [121]:
from sklearn.base import BaseEstimator, TransformerMixin

In [103]:
class MyStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True, with_std=True):
      
    #pola przechowujace ustawienia scalera
        self._with_mean = with_mean
        self._with_std = with_std
        pass
    
    #pola przechowujace statystyki kolumn
        self.means = None
        self.stds = None
        
    def fit(self, X, y=None): #X, Y są datafremami
        if self.with_mean:
            self._means = X.mean(axis=0).values
        if self.with_std:
            self._std = X.std(axis=0).values
        return self
    def transform(self, X):
        X_transformed = X.copy()
        
        if self.with_mean:
            X_transformed = X_transformed - self._means
            
        if self.with_std:
            X_transformed = X_transformed / self._stds
        #wykonywanie faktycznego skalowania w oparciu o "wyuczone" means i std
        return X_transformed 

In [104]:
scaler = MyStandardScaler()

In [105]:
scaler._with_means

AttributeError: 'MyStandardScaler' object has no attribute '_with_means'

In [101]:
scaler.fit(X)

MyStandardScaler(with_mean=None, with_std=None)

In [86]:
scaler.fit_transform(X)

AttributeError: 'MyStandardScaler' object has no attribute '_means'

In [36]:
df = [1, 2, 3, 4, 5]

In [38]:
df.fit(df)

AttributeError: 'list' object has no attribute 'fit'

In [122]:
scaler = MyStandardScaler

In [123]:
class MyStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True, with_std=True):
      
    #pola przechowujace ustawienia scalera
        self._with_mean = with_mean
        self._with_std = with_std
        pass
    
    #pola przechowujace statystyki kolumn
        self.means = None
        self.stds = None
        
    def fit(self, X, y=None): #X, Y są datafremami
        if self.with_mean:
            self._means = X.mean(axis=0).values
        if self.with_std:
            self._stds = X.std(axis=0).values
        return self
        
    def transform(self, X):
        X_transformed = X.copy()
        
        if self.with_mean:
            X_transformed = X_transformed - self._means
            
        if self.with_std:
            X_transformed = X_transformed / self._stds
        #wykonywanie faktycznego skalowania w oparciu o "wyuczone" means i std
        return X_transformed 

In [124]:
scaler = MyStandardScaler()

In [125]:
X = data.drop('survived', axis=1)
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [126]:
X_train, X_test = train_test_split(X, test_size=0.3)

In [127]:
X_train.shape

(916, 13)

In [128]:
X_test.shape

(393, 13)

ValueError: y should be a 1d array, got an array of shape (393, 13) instead.

piszemy funkcję dzielenia

In [133]:

def divide(a, b):
    if a==0:
        return 0 
    
    if b==0:
        return -1
    
    else:
        return a/ b
    

In [137]:
import unittest
from unittest import TestCase

In [144]:
class TestDivide(TestCase): # klasa musi nazywać się Test<Cośtam>
    def test_div_0(self): # metoda musi nazywac sie test<cośtam>
        a = 3
        b = 0
        result = divide(a, b)
        
        self.assertEqual(result, -1, "Wrong answer, should be -1")
    

In [139]:
unittest.main(argv=[' '], verbosity=2, exit=False)

test_div_0 (__main__.TestDivide) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.002s

OK


<unittest.main.TestProgram at 0x1dbef059610>

### Zadanie 1

Zaimplementuj transformer, który usuwa wybrane kolumny.

In [None]:
class DropColumns(BaseEstimator, TransformerMixin):
    """
    Transformer to drop specified columns.
    """
    
    def __init__(self):    
        ..
            
    def fit(self, X, y=None):
        ...
        return self

    def transform(self, X):
        ...

### Zadanie 2

Zaimplementuj transformator, który wybiera z danych kolumny określonego typu.
* argument `column_type` - typ lub lista typów, które chcemy uwzględnić
* użyj metody pandasowej ramki danych `select_dtypes`

In [None]:
class ColumnsSelectorByType(BaseEstimator, TransformerMixin):
    """
    Transformer to select columns of specified types.
    """
    
    def __init__(self):    
        ...
            
    def fit(self, X, y=None):
        ...
        return self

    def transform(self, X):
        ...

### Zadanie 3

Zaimplementuj transormer, który zamienia zmienne, w których ponad `treshold` procent obserwacji zawiera brak danych, na zmienne binarne z wartościami 1, tam gdzie jest dana wartość oraz 0 tam, gdzie występuje brak.

### Zadanie 4

Zaimplementuj transformator `ReduceRareValues`, który redukuje zbiór wartości zmiennych nominalnych poprzez zastępowanie wartości występujących w mniej niż `treshold` obserwacji wartością `replace_value`, domyślnie równą `"rare_value"`.

### Zadanie 5 

Z zaimplementowanych transformatorów, skonstruuj pipeline do przetworzenia danych titanic od surowego zbioru do zbioru gotowego do modelowania i przetestuje model regresji logistycznej z domyślnymi parametrami. Pipeline ma przebiegać następująco:
1. Usuń kolumny: `body, boat, name, ticket, cabin, embarked, home.dest`
2. Podziel zbiór na zmienne numeryczne i kategoryczne - połącz oba po osobnym przetworzeniu. Użyć FeatureUnion

3a. Zmienne numeryczne - uzupełnij braki danych średnią

3b. Zmienne kategoryczne:
    - zmienne z brakami w ponad 50% obserwacji zamiań na zmienne binarne
    - uzupełnij braki danych wartością `missing_value`
    - zredukuj wartosci wystepujące w co najwyżej 20 obserwacjach
    - zakoduj te zmienne kodowaniem one-hot, zwracając macierz gęstą