In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

In [None]:
data=pd.read_csv("T124OPPE2_Preprocessing_V1.csv")
data.shape

(4000, 11)

In [None]:
(data["Gender"]).value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Female,2366
Male,1627
Unknown,7


In [None]:
len(data[data['Age'] <= 0])

8

In [None]:
len(data[data['GlucoseLevel'] <= 0])

9

In [None]:
(data["LivesIn"]).value_counts()

Unnamed: 0_level_0,count
LivesIn,Unnamed: 1_level_1
City,2030
Village,1965
Unknown,5


In [None]:
data['BMI'].isnull().sum()

149

In [None]:
(data["SmokingStatus"]).value_counts()

Unnamed: 0_level_0,count
SmokingStatus,Unnamed: 1_level_1
never smoked,1502
Unknown,1204
formerly smoked,697
smokes,597


In [None]:
data=data.dropna()

In [None]:
data["BMI"].mean()

28.857958971695663

In [None]:
filtered_data = data[(data['LivesIn'] == 'City') &
                 (data['SmokingStatus']!="never smoked") &
                 (data['HeartAttack'] == "Yes")]
filtered_data.shape

(54, 11)

In [None]:
data["HeartAttack"].value_counts()

Unnamed: 0_level_0,count
HeartAttack,Unnamed: 1_level_1
No,3689
Yes,162


In [None]:
X=data.drop(columns="HeartAttack")
y=data["HeartAttack"]

In [None]:
y[y=="Yes"]=1
y[y=="No"]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y=="No"]=0


In [None]:
y.head()

Unnamed: 0,HeartAttack
0,0
1,0
2,0
3,1
4,0


In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=0,stratify=y)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((2695, 10), (1156, 10), (2695,), (1156,))

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import FunctionTransformer

reshape_to_2d = FunctionTransformer(lambda x: x.reshape(-1, 1), validate=True)

# Preprocessing for each column
# Gender: Impute with most frequent, then ordinal encode
gender_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(categories=[['Male', 'Female']]))  # Ensure proper order
])

# Age: Impute with mean, then standard scale
age_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

# HasTension: Ordinal encode
has_tension_pipeline = Pipeline(steps=[
    ('encode', OrdinalEncoder(categories=[[0, 1]]))  # Assuming binary values
])

# AnyHeartDisease: Ordinal encode
any_heart_disease_pipeline = Pipeline(steps=[
    ('encode', OrdinalEncoder(categories=[[0, 1]]))  # Assuming binary values
])

# NeverMarried: Ordinal encode
never_married_pipeline = Pipeline(steps=[
    ('encode', OrdinalEncoder(categories=[[0, 1]]))  # Assuming binary values
])

occupation_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(categories=[['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children']],
                             drop=None, sparse_output=False))
])


# LivesIn: Impute with most frequent, then ordinal encode
lives_in_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(categories=[['Urban', 'Rural']]))  # Ensure proper order
])

# GlucoseLevel: Impute with mean, then min-max scale
glucose_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

# BMI: Impute with mean, then standard scale
bmi_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

# SmokingStatus: Impute with most frequent, then one-hot encode
smoking_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(categories=[['formerly smoked', 'never smoked', 'smokes']],
                             drop=None, sparse_output=False))
])

# Combine all preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('gender', gender_pipeline, 'Gender'),
    ('age', age_pipeline, 'Age'),
    ('has_tension', has_tension_pipeline, 'HasTension'),
    ('any_heart_disease', any_heart_disease_pipeline, 'AnyHeartDisease'),
    ('never_married', never_married_pipeline, 'NeverMarried'),
    ('occupation', occupation_pipeline, 'Occupation'),
    ('lives_in', lives_in_pipeline, 'LivesIn'),
    ('glucose', glucose_pipeline, 'GlucoseLevel'),
    ('bmi', bmi_pipeline, 'BMI'),
    ('smoking', smoking_pipeline, 'SmokingStatus')
], remainder='drop')


# Apply the pipeline
processed_data = preprocessor.fit_transform(data)

# Create DataFrame with specified column order
columns = [
    'Gender', 'Age', 'HasTension', 'AnyHeartDisease', 'NeverMarried',
    'Occupation_Govt_job', 'Occupation_Never_worked', 'Occupation_Private', 'Occupation_Self-employed', 'Occupation_children',
    'LivesIn', 'GlucoseLevel', 'BMI',
    'SmokingStatus_formerly smoked', 'SmokingStatus_never smoked', 'SmokingStatus_smokes'
]

processed_df = pd.DataFrame(processed_data, columns=columns)
processed_df.shape

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.