In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [25]:
# Load the raw Telco churn export and parse 'TotalCharges' as numeric.
# errors='coerce' turns any value that cannot be parsed into NaN instead of
# raising, so those rows surface later as ordinary missing values.
df = pd.read_csv(
    './data/Telco Customer Churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
).assign(
    TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce')
)


In [26]:
def fill_missing_values(df, option: str = 'mean'):
    """Fill (or drop) missing values in a data frame, modifying it in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is mutated in place and nothing is returned.
    option : str, default 'mean'
        Strategy applied to every column that contains missing values:

        * 'drop'   - remove every row that has a missing value
        * 'mean', 'median', 'mode' - fill with that column statistic
        * 'linear', 'quadratic', 'cubic', 'spline' - interpolate with
          limit_direction='forward' (spline uses order=3); leading NaNs
          are therefore left untouched

    Raises
    ------
    ValueError
        If ``option`` is not one of the strategies listed above.
    """
    # Keyword arguments handed to Series.interpolate for each interpolation option.
    interpolation_args = {
        'linear': {'method': 'linear'},
        'quadratic': {'method': 'quadratic'},
        'cubic': {'method': 'cubic'},
        'spline': {'method': 'spline', 'order': 3},
    }
    supported_options = ('drop', 'mean', 'median', 'mode') + tuple(interpolation_args)
    # Reject typos up front instead of silently doing nothing and reporting success.
    if option not in supported_options:
        raise ValueError(
            f"Unknown option {option!r}; expected one of: {', '.join(supported_options)}"
        )

    missing_values = df.isna().sum()

    # Nothing to do when the frame is already complete.
    if not missing_values.any():
        print("There are no missing values in the dataset.\n")
        return

    print("There are missing values in the dataset.")
    # Report the count of missing values for each affected column.
    columns_with_missing_values = df.columns[missing_values > 0]
    print("Missing values per column:")
    print(missing_values[columns_with_missing_values])

    if option == 'drop':
        # dropna returns a new frame by default; inplace=True is required so the
        # caller's frame actually loses the incomplete rows.
        df.dropna(inplace=True)
    else:
        for column in columns_with_missing_values:
            # Assign the result back to the frame rather than calling an
            # inplace method on df[column]: the latter operates on a temporary
            # object and does not update the frame under Copy-on-Write.
            if option == 'mean':
                df[column] = df[column].fillna(df[column].mean())
            elif option == 'median':
                df[column] = df[column].fillna(df[column].median())
            elif option == 'mode':
                modes = df[column].mode()
                # mode() is empty for an all-NaN column; there is no value to fill with.
                if not modes.empty:
                    df[column] = df[column].fillna(modes.iloc[0])
            else:
                df[column] = df[column].interpolate(
                    limit_direction='forward', **interpolation_args[option]
                )

    # Only claim success when the affected columns really are complete now.
    remaining = int(df[columns_with_missing_values].isna().sum().sum())
    if remaining:
        print(f"{remaining} missing values could not be fixed with option '{option}'\n")
    else:
        print("Missing values per column are fixed\n")
       


In [27]:
# Work on a copy so the loaded frame `df` stays untouched, then impute the
# missing values in place with the column mean.
learn_df = df.copy()
fill_missing_values(learn_df, option='mean')

# Partition the columns by dtype. "SeniorCitizen" is numeric but is removed
# from the list of columns to scale, so it is passed through unchanged.
numeric_cols = list(learn_df.select_dtypes(include=np.number).columns)
numeric_cols.remove("SeniorCitizen")
categorical_cols = list(learn_df.select_dtypes(exclude=np.number).columns)

# Standardise the numeric columns.
# NOTE(review): the scaler is fitted on the full frame, before the later
# train/test split, so test rows contribute to the scaling statistics.
# Consider fitting on the training split only.
scaler = StandardScaler().fit(learn_df[numeric_cols])
learn_df[numeric_cols] = scaler.transform(learn_df[numeric_cols])

# Label-encode every non-numeric column, keeping one fitted encoder per
# column so the integer codes can be mapped back to the original labels.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    learn_df[col] = label_encoders[col].fit_transform(learn_df[col])

There are missing values in the dataset.
Missing values per column:
TotalCharges    11
dtype: int64
Missing values per column are fixed

['tenure', 'MonthlyCharges', 'TotalCharges']


In [30]:
# Ranking features
from functools import partial

from sklearn.feature_selection import SelectKBest,  mutual_info_classif

# One seed shared by the split and the scorer so a re-run reproduces the ranking.
RANDOM_STATE = 82

# 'Churn' is the label; every other column of learn_df is used as a feature
X = learn_df.drop(columns=['Churn']) # features
y = learn_df['Churn'] # label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
# a fixed random_state ensures that the split is always the same

# rank features using mutual_info_classif
# The estimator is stochastic, so without its own random_state the scores
# (and possibly the ordering) change on every run; pin it as well.
# NOTE(review): the label-encoded columns are scored with the default
# discrete_features='auto' -- consider passing an explicit mask for them.
selector = SelectKBest(partial(mutual_info_classif, random_state=RANDOM_STATE), k=5)
selector.fit(X_train, y_train)

# One row per feature, highest mutual information first.
scores = (
    pd.DataFrame(selector.scores_, index=X_train.columns, columns=['Score'])
    .sort_values(by='Score', ascending=False)
)
scores



Unnamed: 0,Score
Contract,0.093678
tenure,0.076211
TechSupport,0.057583
InternetService,0.0539
PaymentMethod,0.053673
OnlineSecurity,0.051807
TotalCharges,0.045478
MonthlyCharges,0.045396
OnlineBackup,0.042347
StreamingMovies,0.042199
