## Library Imports & Dataset Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

# Load the merged dataset that the preprocessing steps below operate on.
df = pd.read_csv('merged_dataset_VF03.csv')

## Encoding 

In [None]:
def find_yes_no_columns(df):
    """Return the names of the columns that contain both 'yes' and 'no'.

    Values are converted to strings and lower-cased before the check, so
    the match is case-insensitive. Names are returned in column order.
    """
    required_values = {'yes', 'no'}

    def _has_yes_and_no(series):
        # Distinct lower-cased string values present in the column.
        lowered_values = set(series.astype(str).str.lower())
        return required_values <= lowered_values

    return [col for col in df.columns if _has_yes_and_no(df[col])]

# Report which columns hold both 'yes' and 'no' values (case-insensitive).
yes_no_cols = find_yes_no_columns(df)
print(f"Columns with 'yes'/'no' values: {yes_no_cols}")

In [None]:
# Binary-encode every column whose values are all exactly 'Yes' or 'No'.
yes_no_mapping = {'Yes': 1, 'No': 0}
for column in df.columns:
    column_values = df[column]
    if not column_values.isin(list(yes_no_mapping)).all():
        continue
    df[column] = column_values.map(yes_no_mapping)

In [None]:
# Print the value distribution of every object/category column.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
separator = "\n" + "-"*40 + "\n"
for column in categorical_columns:
    print(f"Column: {column}")
    value_counts = df[column].value_counts()
    print("Unique values and frequencies:")
    print(value_counts)
    print(separator)


In [None]:
# Integer-encode the listed columns. The encoder is re-fit for each column,
# so after the loop it holds the classes of the last one ('Contract').
label_encoder = LabelEncoder()
for nominal_column in ('Gender', 'Contract'):
    df[nominal_column] = label_encoder.fit_transform(df[nominal_column])

In [None]:
def weighted_mean_target_encoding(df, feature, target):
    """Replace `feature` with a frequency-weighted mean of `target`.

    Each category is mapped to the mean of `target` within that category,
    multiplied by the category's row count and divided by the total of all
    category counts. The DataFrame is modified in place and also returned.
    """
    per_category_count = df[feature].value_counts()
    per_category_mean = df.groupby(feature)[target].mean()

    # Evaluated as (mean * count) / total, aligned on the category label.
    scaled_mean = per_category_mean * per_category_count
    encoding = scaled_mean / per_category_count.sum()

    df[feature] = df[feature].map(encoding)
    return df

# Target-encode the multi-category columns against the churn label.
for encoded_feature in ('Offer', 'Internet Type', 'Payment Method'):
    df = weighted_mean_target_encoding(df, encoded_feature, 'Churn Value')


df.head()


## Outliers Detection

In [None]:
# Flag values lying more than `threshold` population standard deviations
# (ddof=0) from their column mean, for every numeric column except the target.
X = df.select_dtypes(include=[np.number]).drop(columns=['Churn Value'])

threshold = 3
column_means = X.mean()
column_stds = X.std(ddof=0)
z_scores = ((X - column_means) / column_stds).abs()
outliers = z_scores > threshold
print(f"Outliers detected per column:\n{outliers.sum(axis=0)}")


## Outliers Handling

In [None]:
# Report the skewness of the columns selected for outlier handling.
columns_to_visualize = [
    'Total Refunds',
    'Total Extra Data Charges',
    'Total Revenue',
    'Total Long Distance Charges',
    'Avg Monthly GB Download',
]

df_selected = df.loc[:, columns_to_visualize]
skewness = df_selected.skew()
print("Skewness of selected columns:", skewness, sep="\n")


In [None]:
# Compress the right tail of the listed columns with log(1 + x).
# np.log1p is the dedicated form of np.log(x + 1): it computes the same
# transform but stays accurate for values close to zero.
columns_to_log_transform = [
    'Total Revenue',
    'Total Long Distance Charges',
    'Avg Monthly GB Download',
]

df[columns_to_log_transform] = np.log1p(df[columns_to_log_transform])

# Re-check skewness after the transform (same columns, same order).
print(df[columns_to_log_transform].skew())


In [None]:
# Box-Cox requires strictly positive input, so a column containing any
# value <= 0 is shifted up by 1 before the transform is fitted.
columns_to_boxcox = [
    'Total Refunds',
    'Total Extra Data Charges',
]

for column in columns_to_boxcox:
    column_values = df[column]
    if (column_values <= 0).any():
        column_values = column_values + 1
        df[column] = column_values
    # The fitted lambda is not needed afterwards and is discarded.
    transformed_values, _ = stats.boxcox(column_values)
    df[column] = transformed_values

print(df[columns_to_boxcox].skew())


## Feature Engineering

1. Tenure-Based Features

Tenure Group: Categorizing customers based on tenure:

New (0-6 months)

Short-term (7-12 months)

Medium-term (13-24 months)

Long-term (>24 months)

In [None]:
# Bucket tenure into ordered groups: New (0-6), Short-term (7-12),
# Medium-term (13-24) and Long-term (>24 months).
# include_lowest=True keeps customers with 0 months of tenure in 'New':
# pd.cut bins are right-closed, so 0 would otherwise fall outside (0, 6]
# and become NaN. The open-ended upper edge (np.inf) replaces the data
# maximum, which made the bin edges non-increasing (and pd.cut fail)
# whenever the largest tenure in the data was 24 months or less.
df['Tenure Group'] = pd.cut(
    df['Tenure in Months'],
    bins=[0, 6, 12, 24, np.inf],
    labels=['New', 'Short-term', 'Medium-term', 'Long-term'],
    include_lowest=True,
)

In [None]:
# Encode the tenure buckets by their ordinal position in the categorical
# (New=0, Short-term=1, Medium-term=2, Long-term=3).
# LabelEncoder sorts class labels alphabetically, which would produce
# Long-term=0, Medium-term=1, New=2, Short-term=3 and lose the ordering.
# Rows without a bucket (NaN) receive code -1.
# The cast to int64 keeps the column visible to the later
# select_dtypes(include=['int64', 'float64']) filter (cat.codes is int8).
df['Tenure Group'] = df['Tenure Group'].cat.codes.astype('int64')

In [None]:
# Extra data charges relative to total charges; the +1 in the denominator
# presumably guards against division by zero when 'Total Charges' is 0.
# NOTE(review): 'Total Extra Data Charges' has already been Box-Cox
# transformed earlier in this file, so the numerator is no longer the raw
# charge amount — confirm this is intended.
df['Total Extra Charges Ratio'] = df['Total Extra Data Charges'] / (df['Total Charges'] + 1)

In [None]:
# Sum of the four protection/support add-on columns.
# Assumes each column was already encoded as 0/1 by the Yes/No mapping
# step, making this a count of subscribed add-ons — TODO confirm.
df['Security Bundle'] = (df['Online Security'] + df['Online Backup'] + 
                         df['Device Protection Plan'] + df['Premium Tech Support'])

In [None]:
# Sum of the phone and streaming service columns.
# Assumes each column was already encoded as 0/1 by the Yes/No mapping
# step, making this a count of subscribed services — TODO confirm.
df['Multiple Services Score'] = (df['Phone Service'] + df['Multiple Lines'] + 
                                 df['Streaming TV'] + df['Streaming Movies'] + 
                                 df['Streaming Music'])

In [None]:
# Collect every pair of columns whose absolute pairwise correlation
# exceeds 0.85 (lower triangle only, so each pair appears once), then
# plot the full matrix as an annotated heatmap.
correlation_matrix = df.corr()
corr_columns = correlation_matrix.columns
correlated_features = {
    (corr_columns[i], corr_columns[j])
    for i in range(len(corr_columns))
    for j in range(i)
    if abs(correlation_matrix.iloc[i, j]) > 0.85
}

print("Correlated Features (correlation > 0.85):")
for pair in correlated_features:
    print(pair)

plt.figure(figsize=(30, 30))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Remove the 'Dependents' column from the working DataFrame in place.
df.drop(columns=['Dependents'], inplace=True)

## Data Scaling/Normalization

In [None]:
# Separate the int64/float64 columns into binary ones (exactly two distinct
# values) and the remaining ones, preserving column order.
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

binary_cols = []
continuous_numeric_cols = []
for col in numeric_cols:
    if df[col].nunique() == 2:
        binary_cols.append(col)
    else:
        continuous_numeric_cols.append(col)

print("Continuous Numerical Features:", continuous_numeric_cols)


In [None]:
# Z-score standardize the listed columns in place and summarize the result.
columns_to_standardize = [
    'Tenure in Months',
    'Offer',
    'Avg Monthly Long Distance Charges',
    'Internet Type',
    'Avg Monthly GB Download',
    'Payment Method',
    'Total Extra Data Charges',
    'Total Long Distance Charges',
    'Satisfaction Score',
    'CLTV',
    'Age',
    'Number of Dependents',
]

scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[columns_to_standardize])
df[columns_to_standardize] = standardized_values

print("Standardized Features:", df[columns_to_standardize].describe(), sep="\n")


## Feature Selection

In [None]:
# Recompute the correlation matrix and list each column pair (lower
# triangle only) whose absolute correlation exceeds 0.85, then plot it.
correlation_matrix = df.corr()
correlated_features = set()

for row_pos, later_column in enumerate(correlation_matrix.columns):
    earlier_columns = correlation_matrix.columns[:row_pos]
    for col_pos, earlier_column in enumerate(earlier_columns):
        coefficient = correlation_matrix.iloc[row_pos, col_pos]
        if abs(coefficient) > 0.85:
            correlated_features.add((later_column, earlier_column))

print("Correlated Features (correlation > 0.85):")
for pair in correlated_features:
    print(pair)

plt.figure(figsize=(30, 30))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title('Correlation Matrix')
plt.show()

(‘Total Revenue’, ‘Tenure in Months’)
Reason for Correlation: Customers who have stayed longer (higher Tenure in Months) tend to generate higher Total Revenue.

Which to Keep?
Drop Total Revenue, Keep Tenure in Months

Tenure in Months is a more fundamental feature for churn prediction.

Total Revenue is derived from tenure and charges, so it's redundant.
_______________________________

(‘Number of Dependents’, ‘Dependents’)
Reason for Correlation: Number of Dependents is a numerical count, while Dependents is usually a binary indicator (Yes/No).

Which to Keep?
Drop Dependents, Keep Number of Dependents

Number of Dependents has more information (it provides exact numbers rather than just Yes/No).

If Dependents is already binary (0 for No, 1 for Yes), it's less useful than an exact count.
_____________________________

(‘Total Revenue’, ‘Total Charges’)
Reason for Correlation:

Total Revenue is often the sum of Total Charges and other components.

If Total Revenue = Total Charges + Extra Fees, one is redundant.

Which to Keep?
Drop Total Revenue, Keep Total Charges

Total Charges reflects how much the customer has been billed, which can directly influence churn.

Total Revenue might include additional elements that don’t add much predictive value.




In [None]:
# Drop the redundant member of each highly correlated pair.
# errors='ignore' is needed because 'Dependents' has already been removed
# earlier in this file; without it a top-to-bottom run raises KeyError here.
df.drop(columns=['Dependents', 'Total Revenue'], inplace=True, errors='ignore')

In [None]:
# Rank features with recursive feature elimination wrapped around a
# logistic regression, keeping the 10 best-ranked features.
y = df["Churn Value"]
X = df.drop(columns=["Churn Value"])

n_features_to_select = 10
model = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
rfe.fit(X, y)

ranking_df = pd.DataFrame(
    {
        "Feature": X.columns,
        "Ranking": rfe.ranking_,
        "Selected": rfe.support_,
    }
)
ranking_df = ranking_df.sort_values(by="Ranking")

print("\nRFE Feature Ranking:")
print(ranking_df.to_string(index=False))

selected = ranking_df.loc[ranking_df["Selected"], "Feature"].tolist()
print(f"\nTop {len(selected)} Selected Features:")
for position, feature in enumerate(selected, start=1):
    print(f"{position}. {feature}")


In [None]:
# Remove these four columns from the working DataFrame in place.
df.drop(columns=['Gender','Total Charges', 'Monthly Charge', 'Total Refunds' ], inplace=True)

## PCA

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Rebuild the feature matrix from the current DataFrame: the X left over
# from the RFE step still contains the columns that were dropped from df
# afterwards ('Gender', 'Total Charges', 'Monthly Charge', 'Total Refunds').
X = df.drop(columns=['Churn Value'])

# Fit a full PCA (all components) to inspect how variance is distributed.
pca = PCA()
pca.fit(X)

explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Scree plot: variance ratio contributed by each individual component.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance) + 1), explained_variance, 'o-', markersize=4)
plt.title('Scree Plot: Variance Explained by Each Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
plt.show()

# Cumulative plot: total variance ratio captured by the first k components.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'o-', markersize=4)
plt.title('Cumulative Explained Variance vs. Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

For determining the optimal number of components, I'd recommend:

From the scree plot: The elbow appears to be around 5-6 components. After this point, each additional component adds much less new information.
From the cumulative variance plot: With about 10 components, you're capturing approximately 80% of the variance, and with 15 components you're at roughly 90-92%.

The optimal number depends on your specific goals:

If you need to be very strict about dimensionality reduction, 5-6 components would be reasonable as that's where the elbow appears most pronounced.
If you want to ensure you capture most of the variance, 10-15 components would be appropriate.

A common compromise is to select the number of components that explains 80-90% of the variance, which in your case would be around 10-12 components.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Kaiser's rule (keep components whose eigenvalue exceeds 1) is defined on
# the eigenvalues of the correlation matrix, i.e. on standardized features.
# Standardize every column first; on data with mixed scales the threshold
# of 1 has no meaning.
X_scaled = StandardScaler().fit_transform(X)

pca = PCA()
pca.fit(X_scaled)

eigenvalues = pca.explained_variance_

# Cast to a plain Python int so it can be passed as PCA's n_components.
n_components_kaiser = int(np.sum(eigenvalues > 1))
print(f"Number of components according to Kaiser's Rule: {n_components_kaiser}")

plt.figure(figsize=(10, 6))
plt.bar(range(1, len(eigenvalues) + 1), eigenvalues)
plt.axhline(y=1, color='r', linestyle='--', label='Eigenvalue = 1')
plt.xlabel('Component Number')
plt.ylabel('Eigenvalue')
plt.legend()
plt.title('Scree Plot with Kaiser\'s Rule')
plt.grid(True)
plt.show()

# Project the standardized data onto the retained components.
pca_final = PCA(n_components=n_components_kaiser)
X_pca = pca_final.fit_transform(X_scaled)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Reduce the feature matrix to its first 5 principal components.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)

# X_pca now contains the transformed data with 5 components
print(f"Original data shape: {X.shape}")
print(f"Reduced data shape: {X_pca.shape}")

# To see how much variance is explained by these 5 components
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {np.sum(pca.explained_variance_ratio_)}")

# To know which original features contribute most to each component,
# look at the components (loadings): the transposed matrix is indexed by
# feature name, one column per principal component.
feature_names = X.columns if hasattr(X, 'columns') else [f"Feature {i}" for i in range(X.shape[1])]
components_df = pd.DataFrame(pca.components_.T, index=feature_names)

In [None]:
# For each principal component, list the five features with the largest
# absolute loading, strongest first.
for pc_number, loadings in enumerate(pca.components_, start=1):
    strongest_first = np.argsort(np.abs(loadings))[::-1]
    print(f"\nTop 5 features in PC{pc_number}:")
    for feature_idx in strongest_first[:5]:
        print(f"{feature_names[feature_idx]}: {loadings[feature_idx]:.3f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline as imPipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset used for modelling and separate features from the target.
df = pd.read_csv('V2_ZscoreScaled.csv')
X = df.drop(['Churn Value'], axis=1)
y = df['Churn Value']

# Hold out 30% of the rows, then split that hold-out so the validation set
# receives 33.3% of it (about 10% of all rows) and the test set the rest
# (about 20% of all rows). X_val / y_val are not used further in this cell.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.333, random_state=42)

# Oversample the training split only; the test split is left untouched.
# Note: despite its name, X_train_original holds the SMOTE-resampled
# training features (in the original feature space, i.e. before PCA).
print("Before SMOTE:", Counter(y_train))
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_original, y_train_balanced = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train_balanced))

# Fit PCA on the resampled training data and apply the same projection
# to the test data.
pca = PCA(n_components=10)
pca.fit(X_train_original) 
X_train_pca = pca.transform(X_train_original)
X_test_pca = pca.transform(X_test)

def train_evaluate_svc(X_train, X_test, y_train, y_test, model_name):
    """Fit a degree-2 polynomial SVC and evaluate it on the test data.

    Prints the metrics and the classification report, shows the confusion
    matrix as a heatmap, and returns ``(metrics, fitted_pipeline)`` where
    ``metrics`` maps each metric name to its value rounded to 2 decimals.
    """
    classifier = SVC(
        kernel='poly',
        degree=2,
        C=0.5,
        gamma='scale',
        probability=True,
    )
    baseline_pipeline = imPipeline(steps=[('classifier', classifier)])
    baseline_pipeline.fit(X_train, y_train)

    predicted_labels = baseline_pipeline.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class.
    positive_proba = baseline_pipeline.predict_proba(X_test)[:, 1]

    # (metric name, scoring function, predictions the score is computed from)
    metric_specs = [
        ("Accuracy", accuracy_score, predicted_labels),
        ("ROC AUC", roc_auc_score, positive_proba),
        ("Log Loss", log_loss, positive_proba),
        ("F1 Score", f1_score, predicted_labels),
        ("Precision", precision_score, predicted_labels),
        ("Recall", recall_score, predicted_labels),
    ]
    metrics = {
        name: round(score_fn(y_test, predictions), 2)
        for name, score_fn, predictions in metric_specs
    }

    print(f"{model_name} Evaluation:")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    report = classification_report(y_test, predicted_labels)
    print("\nClassification Report:\n", report)
    print(f"{model_name} modeling complete.\n")

    class_labels = ["No Churn", "Churn"]
    conf_matrix = confusion_matrix(y_test, predicted_labels)
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

    return metrics, baseline_pipeline

# Train and evaluate the SVC on the full feature set and on the PCA projection.
original_metrics, original_model = train_evaluate_svc(X_train_original, X_test, y_train_balanced, y_test, "Original SVC Model")

pca_metrics, pca_model = train_evaluate_svc(X_train_pca, X_test_pca, y_train_balanced, y_test, "PCA-Based SVC Model")

# One row per metric, one column per model.
comparison_df = pd.DataFrame({
    "Original Model": original_metrics,
    "PCA Model": pca_metrics
})

print("Model Performance Comparison:")
print(comparison_df)

# DataFrame.plot creates its own figure, so a preceding plt.figure() call
# would leave an empty extra figure and its figsize would never apply to
# the chart. Passing figsize to plot() sizes the figure actually drawn.
comparison_df.plot(kind='bar', figsize=(12, 6))
plt.title('Performance Comparison: Original vs PCA-based SVC Model')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Model')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline as imPipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset used for modelling and separate features from the target.
df = pd.read_csv('V2_ZscoreScaled.csv')
X = df.drop(['Churn Value'], axis=1)
y = df['Churn Value']

# Hold out 30% of the rows, then split that hold-out so the validation set
# receives 33.3% of it (about 10% of all rows) and the test set the rest
# (about 20% of all rows). X_val / y_val are not used further in this cell.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.333, random_state=42)

# Oversample the training split only; the test split is left untouched.
# Note: despite its name, X_train_original holds the SMOTE-resampled
# training features (in the original feature space, i.e. before PCA).
print("Before SMOTE:", Counter(y_train))
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_original, y_train_balanced = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train_balanced))

# Fit PCA on the resampled training data and apply the same projection
# to the test data.
pca = PCA(n_components=10)
pca.fit(X_train_original) 
X_train_pca = pca.transform(X_train_original)
X_test_pca = pca.transform(X_test)

def train_evaluate_rf(X_train, X_test, y_train, y_test, model_name):
    """Fit a random forest classifier and evaluate it on the test data.

    Prints the metrics and the classification report, shows the confusion
    matrix as a heatmap, and returns ``(metrics, fitted_pipeline)`` where
    ``metrics`` maps each metric name to its value rounded to 2 decimals.
    """
    classifier = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
    )
    baseline_pipeline = imPipeline(steps=[('classifier', classifier)])
    baseline_pipeline.fit(X_train, y_train)

    predicted_labels = baseline_pipeline.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class.
    positive_proba = baseline_pipeline.predict_proba(X_test)[:, 1]

    # (metric name, scoring function, predictions the score is computed from)
    metric_specs = [
        ("Accuracy", accuracy_score, predicted_labels),
        ("ROC AUC", roc_auc_score, positive_proba),
        ("Log Loss", log_loss, positive_proba),
        ("F1 Score", f1_score, predicted_labels),
        ("Precision", precision_score, predicted_labels),
        ("Recall", recall_score, predicted_labels),
    ]
    metrics = {
        name: round(score_fn(y_test, predictions), 2)
        for name, score_fn, predictions in metric_specs
    }

    print(f"{model_name} Evaluation:")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    report = classification_report(y_test, predicted_labels)
    print("\nClassification Report:\n", report)
    print(f"{model_name} modeling complete.\n")

    class_labels = ["No Churn", "Churn"]
    conf_matrix = confusion_matrix(y_test, predicted_labels)
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

    return metrics, baseline_pipeline

# Train and evaluate the random forest on the full feature set and on the
# PCA projection.
original_metrics, original_model = train_evaluate_rf(X_train_original, X_test, y_train_balanced, y_test, "Original RF Model")

pca_metrics, pca_model = train_evaluate_rf(X_train_pca, X_test_pca, y_train_balanced, y_test, "PCA-Based RF Model")

# One row per metric, one column per model.
comparison_df = pd.DataFrame({
    "Original Model": original_metrics,
    "PCA Model": pca_metrics
})

print("Model Performance Comparison:")
print(comparison_df)

# DataFrame.plot creates its own figure, so a preceding plt.figure() call
# would leave an empty extra figure and its figsize would never apply to
# the chart. Passing figsize to plot() sizes the figure actually drawn.
comparison_df.plot(kind='bar', figsize=(12, 6))
plt.title('Performance Comparison: Original vs PCA-based Random Forest Model')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Model')
plt.tight_layout()
plt.show()