In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../data/raw/Telco-Customer-Churn.csv") 
df.drop_duplicates(inplace=True)

In [3]:
for col in df.select_dtypes(include='number').columns:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [4]:
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [5]:
if 'age' in df.columns:
    df["is_adult"] = df["age"].apply(lambda x: 1 if x >= 18 else 0)


In [6]:
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year



In [7]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

In [8]:
label_enc = LabelEncoder()
for col in categorical_cols:
    if df[col].nunique() == 2:
        df[col] = label_enc.fit_transform(df[col])

In [9]:

df = pd.get_dummies(df, columns=[col for col in categorical_cols if df[col].nunique() > 2], drop_first=True)

In [10]:
scaler = StandardScaler()
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [11]:
target = 'target_column_name'  
if target in numeric_cols:
    numeric_cols.remove(target)

In [12]:
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df.to_csv("../data/processed/processed_data.csv", index=False)

print("✅ Feature engineering complete. Processed data saved to data/processed/")

✅ Feature engineering complete. Processed data saved to data/processed/
