In [1]:
import pandas as pd

# Load Data
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# file exists; consider a configurable, relative data directory.
df = pd.read_csv(r"C:\Users\HP\Desktop\churn_analysis\churn_analysis.csv")

# Drop unnecessary columns (reassign instead of mutating in place)
df = df.drop(columns=['customerID', 'IsSenior'])

# Convert 'TotalCharges' to numeric; values that cannot be parsed become NaN
# (errors='coerce') and those rows are then removed.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna(subset=['TotalCharges'])

# Clean 'No internet service' and 'No phone service' → 'No'
cols_to_clean = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
no_service_to_no = {'No internet service': 'No', 'No phone service': 'No'}

for col in cols_to_clean:
    if col in df.columns:  # skip columns absent from this dataset (avoids KeyError)
        df[col] = df[col].replace(no_service_to_no)

# Binary columns mapping
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies',
               'PaperlessBilling', 'Churn']
# One shared mapping covers both the Yes/No columns and 'gender'.
# Series.map turns any value that is not a key of this dict into NaN.
binary_codes = {'Yes': 1, 'No': 0, 'Male': 0, 'Female': 1}

for col in binary_cols:
    if col in df.columns:  # skip columns absent from this dataset (avoids KeyError)
        df[col] = df[col].map(binary_codes)

# One-hot encoding for multiclass categories; drop_first=True omits the first
# level of each column from the generated dummies.
onehot_cols = ['InternetService', 'Contract', 'PaymentMethod']

df = pd.get_dummies(df, columns=onehot_cols, drop_first=True)

# Final check
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 28 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7032 non-null   int64  
 1   SeniorCitizen                          7032 non-null   int64  
 2   Partner                                7032 non-null   int64  
 3   Dependents                             7032 non-null   int64  
 4   tenure                                 7032 non-null   int64  
 5   PhoneService                           7032 non-null   int64  
 6   MultipleLines                          7032 non-null   int64  
 7   OnlineSecurity                         7032 non-null   int64  
 8   OnlineBackup                           7032 non-null   int64  
 9   DeviceProtection                       7032 non-null   int64  
 10  TechSupport                            7032 non-null   int64  
 11  Stre

In [2]:
# Write the current frame to CSV, leaving the index out of the file.
# NOTE(review): this cell sits before the later cell that maps
# 'plan_change_recently' / 'plan_change_recently.1' to 1/0, so the saved file
# appears to still hold their un-encoded values — confirm this is intended.
encoded_csv_path = 'final_encoded_data.csv'
df.to_csv(encoded_csv_path, index=False)
print("✅ Feature-engineered data saved.")

✅ Feature-engineered data saved.


In [3]:
# Tally how many columns there are of each dtype.
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

int64      17
bool        7
float64     2
object      2
Name: count, dtype: int64


In [4]:
# List the columns that are still stored with object dtype.
object_columns = df.select_dtypes(include='object').columns
print(object_columns)


Index(['plan_change_recently', 'plan_change_recently.1'], dtype='object')


In [5]:
# Check whether the two plan-change columns hold the same value on every row.
columns_match = df['plan_change_recently'].eq(df['plan_change_recently.1']).all()
print(columns_match)


False


In [6]:
# Convert Yes/No to 1/0 for both columns.
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-encoded columns would replace all of their
# values with NaN. Columns that are already numeric are therefore skipped, which
# makes the cell safe to run more than once.
yes_no_codes = {'Yes': 1, 'No': 0}
for flag_col in ['plan_change_recently', 'plan_change_recently.1']:
    if not pd.api.types.is_numeric_dtype(df[flag_col]):
        df[flag_col] = df[flag_col].map(yes_no_codes)


In [7]:
# Preview the first rows of the two plan-change columns, then summarise the
# dtypes across the whole frame.
plan_change_preview = df[['plan_change_recently', 'plan_change_recently.1']].head()
print(plan_change_preview)
dtype_summary = df.dtypes.value_counts()
print(dtype_summary)


   plan_change_recently  plan_change_recently.1
0                     1                       1
1                     1                       0
2                     0                       1
3                     1                       0
4                     0                       0
int64      19
bool        7
float64     2
Name: count, dtype: int64
