In [587]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import StandardScaler

In [588]:
df = sns.load_dataset("titanic")

In [589]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [590]:
df[df.isnull().sum()[df.isnull().sum()!=0].index.tolist()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   age          714 non-null    float64 
 1   embarked     889 non-null    object  
 2   deck         203 non-null    category
 3   embark_town  889 non-null    object  
dtypes: category(1), float64(1), object(2)
memory usage: 22.2+ KB


In [591]:
X = df.drop("survived",axis=1)
Y = df["survived"]

### Step 1 : train_test_split

In [592]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=1)

In [593]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
114,3,female,17.0,0,0,14.4583,C,Third,woman,False,,Cherbourg,no,True
874,2,female,28.0,1,0,24.0,C,Second,woman,False,,Cherbourg,yes,False
76,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
876,3,male,20.0,0,0,9.8458,S,Third,man,True,,Southampton,no,True
674,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True


In [594]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(623, 14)
(268, 14)
(623,)
(268,)


In [595]:
si = SimpleImputer()
X_train_age = si.fit_transform(X_train[["age"]])
#X_train_age = pd.DataFrame(data=X_train_age,columns=["age_imputed"])

In [596]:
X_train["age"] = X_train_age

In [597]:
X_test_age = si.transform(X_test[["age"]])

In [598]:
X_test["age"]=X_test_age

In [599]:
si_cat = SimpleImputer(strategy='most_frequent')
X_train_cat = si_cat.fit_transform(X_train[["embarked","deck","embark_town"]])
print(X_train_cat.shape)

(623, 3)


In [600]:
X_train[["embarked","deck","embark_town"]] = X_train_cat

In [601]:
X_test_cat = si_cat.transform(X_test[["embarked","deck","embark_town"]])

In [602]:
print(X_test_cat.shape)

(268, 3)


In [603]:
X_test[["embarked","deck","embark_town"]] = X_test_cat

In [604]:
X_train.isnull().sum()

pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [605]:
X_train.columns

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class',
       'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'],
      dtype='object')

#### One hot encoding

In [606]:
from sklearn.preprocessing import OneHotEncoder

In [607]:
cat_col = X.select_dtypes(exclude=['int','float']).columns.tolist()
cat_col

['sex',
 'embarked',
 'class',
 'who',
 'adult_male',
 'deck',
 'embark_town',
 'alive',
 'alone']

In [608]:
df[cat_col].head(2)

Unnamed: 0,sex,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,male,S,Third,man,True,,Southampton,no,False
1,female,C,First,woman,False,C,Cherbourg,yes,False


In [609]:
ohe = OneHotEncoder(sparse=False)

In [610]:
ohe_cat_col = ohe.fit_transform(X_train[cat_col])

In [611]:
ohe_cat_col.shape

(623, 27)

In [612]:
ohe.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S'], dtype=object),
 array(['First', 'Second', 'Third'], dtype=object),
 array(['child', 'man', 'woman'], dtype=object),
 array([False,  True]),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype=object),
 array(['Cherbourg', 'Queenstown', 'Southampton'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array([False,  True])]

In [613]:
categories=[m for i in range(len(ohe.categories_)) for m in list(ohe.categories_[i]) ]
categories

['female',
 'male',
 'C',
 'Q',
 'S',
 'First',
 'Second',
 'Third',
 'child',
 'man',
 'woman',
 False,
 True,
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'Cherbourg',
 'Queenstown',
 'Southampton',
 'no',
 'yes',
 False,
 True]

In [614]:
ohe_cat_col[:,:2]

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [615]:
df1 = pd.DataFrame(data=ohe_cat_col,columns=categories)

In [616]:
df1

Unnamed: 0,female,male,C,Q,S,First,Second,Third,child,man,...,E,F,G,Cherbourg,Queenstown,Southampton,no,yes,False,True
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
619,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
620,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
621,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [617]:
X_train1 = pd.get_dummies(data=X_train,columns=cat_col)

In [618]:
X_train1.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,...,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes,alone_False,alone_True
114,3,17.0,0,0,14.4583,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
874,2,28.0,1,0,24.0,1,0,1,0,0,...,0,0,0,1,0,0,0,1,1,0
76,3,30.082661,0,0,7.8958,0,1,0,0,1,...,0,0,0,0,0,1,1,0,0,1
876,3,20.0,0,0,9.8458,0,1,0,0,1,...,0,0,0,0,0,1,1,0,0,1
674,2,30.082661,0,0,0.0,0,1,0,0,1,...,0,0,0,0,0,1,1,0,0,1


In [619]:
X_test1 = pd.get_dummies(data=X_test,columns=cat_col)

In [620]:
print(X_train1.shape)
print(X_test1.shape)
print(Y_train.shape)
print(Y_test.shape)

(623, 32)
(268, 32)
(623,)
(268,)


In [621]:
from sklearn.tree import DecisionTreeClassifier

In [622]:
dt = DecisionTreeClassifier()

In [623]:
dt.fit(X_train1,Y_train)

In [624]:
predict = dt.predict(X_test1)

In [625]:
from sklearn.metrics import accuracy_score

In [626]:
accuracy = accuracy_score(Y_test,predict)
accuracy

1.0

### Pipeline

In [627]:
## Steps of pipeline

#Step #1 - Missing value imputation
#Step #2 - One hot encoding
#Step #3 - Scaling
#Step #4 - Feature selection
#Step #5 - Training

#### Step #1 : Train test split

In [665]:
X = df.drop("survived",axis=1)
Y = df["survived"]

In [666]:
cat_col = X.select_dtypes(exclude=['int','float']).columns.tolist()
cont_col = X.select_dtypes(include=['int','float']).columns.tolist()

In [667]:
from sklearn.model_selection import train_test_split

In [668]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=1)

In [669]:
X_train.shape

(623, 14)

#### Step #2 : Missing value imputation

In [670]:
list(X_train.columns).index('embark_town')

11

In [671]:
X_train.isnull().sum()[X_train.isnull().sum()!=0]

age            127
embarked         1
deck           488
embark_town      1
dtype: int64

In [672]:
s1 = ("impute_age",SimpleImputer(),[2])
s2 = ("impute_cat",SimpleImputer(strategy="most_frequent"),[6,10,11])

In [684]:
trf1 = ColumnTransformer(transformers=[s1,s2],remainder='passthrough')

In [685]:
trf1_data = trf1.fit_transform(X_train)

In [690]:
trf1_dataframe = pd.DataFrame(data=trf1_data,columns=trf1.get_feature_names_out())
trf1_dataframe.head()

Unnamed: 0,impute_age__age,impute_cat__embarked,impute_cat__deck,impute_cat__embark_town,remainder__pclass,remainder__sex,remainder__sibsp,remainder__parch,remainder__fare,remainder__class,remainder__who,remainder__adult_male,remainder__alive,remainder__alone
0,17.0,C,C,Cherbourg,3,female,0,0,14.4583,Third,woman,False,no,True
1,28.0,C,C,Cherbourg,2,female,1,0,24.0,Second,woman,False,yes,False
2,30.082661,S,C,Southampton,3,male,0,0,7.8958,Third,man,True,no,True
3,20.0,S,C,Southampton,3,male,0,0,9.8458,Third,man,True,no,True
4,30.082661,S,C,Southampton,2,male,0,0,0.0,Second,man,True,no,True


In [692]:
cat_col1

['impute_age__age',
 'impute_cat__embarked',
 'impute_cat__deck',
 'impute_cat__embark_town',
 'remainder__pclass',
 'remainder__sex',
 'remainder__sibsp',
 'remainder__parch',
 'remainder__fare',
 'remainder__class',
 'remainder__who',
 'remainder__adult_male',
 'remainder__alive',
 'remainder__alone']

In [637]:
#0 - Age 
#1 - embarked
#2 - deck
#3 - embark_town
#4 - pclass
#5 - sex
#6 - sibsp
#7 - parch
#8 - fare
#9 - class
#10 - who
#11 - adult_male
#12 - alive
#13 - alone

#### Step #3 : One hot encoding

In [638]:
trf2 = ColumnTransformer(transformers=[('ohe_cat',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,2,3,5,9,10,11,12,13])],remainder='passthrough')
#trf2 = ColumnTransformer(transformers=[('ohe_cat',OneHotEncoder(sparse=False,handle_unknown='ignore',),["sex","embarked","class"])],remainder='passthrough')

#### Step #4 : Scaling

In [639]:
m=[]
for i in [1,6,7,8,9,10,11,13]:
    m.append(X_train[X_train.columns[i]].nunique()) 

In [640]:
len(X_train.columns) - len([1,6,7,8,9,10,11,13])

6

In [641]:
6 + sum(m)

31

In [642]:
from sklearn.preprocessing import MinMaxScaler

In [643]:
trf3 = ColumnTransformer(transformers=[('Scaling',MinMaxScaler(),list(range(0,32)))])

#### Step #5 : Feature selection

In [644]:
trf4 = SelectKBest(score_func=chi2,k=10)

#### Step #6 : Train the model

In [645]:
trf5 = DecisionTreeClassifier()

#### Pipeline

In [646]:
pipe = Pipeline([('trf1',trf1),('trf2',trf2),('trf3',trf3),('trf4',trf4),('trf5',trf5)])

In [649]:
pipe.fit(X_train,Y_train)

## Exploring the pipeline

In [661]:
pipe.named_steps['trf1'].transformers_[1][1].statistics_

array(['S', 'C', 'Southampton'], dtype=object)

#### Prediction using pipeline

In [662]:
y_pred = pipe.predict(X_test)

In [663]:
accuracy_score(Y_test,y_pred)

1.0

### Cross validation using pipeline

In [664]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(pipe,X_train,Y_train,cv=5,scoring='accuracy')
cv_score.mean()

1.0

### Pipeline on Housing Data set

In [724]:
import pandas as pd
import numpy as np

In [742]:
df = pd.read_csv("C:\\Users\\yashs\\OneDrive\\Desktop\\Kaggle\\House Prices Regression\\house-prices-advanced-regression-techniques\\train.csv",index_col='Id')

In [743]:
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [798]:
df.columns
X = df.drop("SalePrice",axis=1)
Y = df["SalePrice"]

In [745]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=1)

#### Missing value imputation

In [746]:
#Step #1 - Separate categorical and continuous columns

In [747]:
def cat_cont_indices(df):
    cat_indices = dict()
    cont_indices = dict()
    cat_col = df.select_dtypes(exclude=['int','float']).columns.tolist()
    cont_col = df.select_dtypes(include=['int','float']).columns.tolist()
    for i in cat_col:
        m=list(df.columns).index(i)
        cat_indices.update({i:m})
    for i in cont_col:
        m=list(df.columns).index(i)
        cont_indices.update({i:m})
    return cat_indices,cont_indices

In [748]:
cat_indices,cont_indices=cat_cont_indices(df)

In [749]:
null_cont_indices = []
for i in cont_indices.values():
    if df.iloc[:,i].isnull().sum()!=0:
        null_cont_indices.append(i)

In [750]:
null_cat_indices = []
for i in cat_indices.values():
    if df.iloc[:,i].isnull().sum()!=0:
        null_cat_indices.append(i)

In [712]:
#Step #2 - Impute continous columns with median and categorical columns with most frequent value

In [719]:
s1 = ("Impute_cont",SimpleImputer(strategy='median'),null_cont_indices)
s2 = ("Impute_cat",SimpleImputer(strategy='most_frequent'),null_cat_indices)

In [720]:
trf1 = ColumnTransformer(transformers=[s1,s2],remainder='passthrough')

In [721]:
# Step #3 - Outliers detection

In [None]:
s3 = ("Outlier_detection")

In [754]:
class CustomerDetails():
    def __init__(self, customer_id,customer_name,customer_account_no,customer_account_type):
        self.id=customer_id
        self.name=customer_name
        self.account_no=customer_account_no
        self.account_type=customer_account_type
    def printdetails(self):
        print("The name of the customer is {}. Account number of the customer is {}. Customer ID is {} and Account type is {}".format(self.name,self.account_no,self.id,self.account_type))

In [755]:
Yash = CustomerDetails(69158365,"Yash Saxena",50120000404,"Savings")

In [756]:
Yash.printdetails()

The name of the customer is Yash Saxena. Account number of the customer is 50120000404. Customer ID is 69158365 and Account type is Savings


In [758]:
from sklearn.base import BaseEstimator,TransformerMixin

In [760]:
def DropFeatureSelector(df,variables):
    df = df.drop(variables,axis=1)
    return df

In [771]:
class DropFeatureSelector(BaseEstimator,TransformerMixin):
    def __init__(self,variables):
        self.variables=variables
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        X_dropped = X.drop(self.variables,axis=1)
        self.columns = X_dropped.columns
        return X_dropped

In [799]:
class OneHotEncoderCustom(BaseEstimator,TransformerMixin):
    def __init__(self,variables):
        self.variables = variables
        self.ohe = OneHotEncoder(drop='first',handle_unknown='ignore')
    def fit(self,X,y=None):
        X_ = X.loc[:,self.variables]
        self.ohe.fit(X_)
        return self
    def transform(self,X):
        X_ = X.loc[:,self.variables]
        X_transformed = pd.DataFrame(data=self.ohe.transform(X_).toarray(),columns=self.ohe.get_feature_names_out())
        
        X.drop(self.variables,axis=1,inplace=True)
        
        X[self.ohe.get_feature_names_out()] = X_transformed.values
        return X

In [800]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [801]:
ohe = OneHotEncoderCustom(variables=["LotShape","Utilities"])

In [802]:
df1=ohe.fit_transform(X)

In [803]:
df1.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
      