In [None]:
import zipfile

with zipfile.ZipFile('/content/ultimate-customer-churn-prediction-challenge.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

In [None]:
import pandas as pd

df = pd.read_csv('train.csv')

In [None]:
display(df.head())

Unnamed: 0,Customer_ID,Age,Gender,Location,Subscription_Type,Account_Age_Months,Monthly_Spending,Total_Usage_Hours,Support_Calls,Late_Payments,Streaming_Usage,Discount_Used,Satisfaction_Score,Last_Interaction_Type,Complaint_Tickets,Promo_Opted_In,Churn
0,1001,19,Male,Illinois,Basic,50,152.44,416,5,2,61,76,3,Neutral,0,1,1
1,1002,41,Male,California,Premium,14,113.34,36,5,1,17,90,5,Negative,3,0,0
2,1003,44,Female,Florida,Basic,2,168.39,207,3,1,85,12,6,Neutral,3,0,1
3,1004,21,Male,Florida,Basic,55,197.12,379,4,3,54,32,4,Positive,3,1,0
4,1005,65,Male,New York,Premium,12,84.46,475,5,4,82,62,1,Neutral,0,0,1


In [None]:
import pandas as pd

# Reload the training data to ensure a clean state
df = pd.read_csv('train.csv')

# Impute missing values with the mode before mapping
for col in ['Gender', 'Subscription_Type', 'Last_Interaction_Type']:
    # Check if the column has any non-NaN values to calculate the mode
    if not df[col].isnull().all():
        mode_value = df[col].mode()[0] # Get the first mode in case of ties
        df[col] = df[col].fillna(mode_value)
    else:
        # If the column is all NaN, decide on a different imputation strategy or handle as needed
        # For now, let's print a warning and leave them as NaN or fill with a placeholder
        print(f"Warning: Column '{col}' is entirely NaN. Mode imputation skipped.")
        # Optionally, fill with a placeholder like a new category or a specific value
        # df[col] = df[col].fillna('Missing')

# Map 'Gender', 'Subscription_Type', and 'Last_Interaction_Type' to numerical values
print("Unique values in Gender:", df['Gender'].unique())
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

print("Unique values in Subscription_Type:", df['Subscription_Type'].unique())
df['Subscription_Type'] = df['Subscription_Type'].map({'Basic': 0, 'Premium': 1})

print("Unique values in Last_Interaction_Type:", df['Last_Interaction_Type'].unique())
df['Last_Interaction_Type'] = df['Last_Interaction_Type'].map({'Neutral': 0, 'Negative': 1, 'Positive': 2})

# Keep 'Location' as a string column to be handled by OneHotEncoder later


# Split data into features (X) and target (y)
X = df.drop(['Churn', 'Customer_ID'], axis=1)
y = df['Churn']

display(X.head())
display(y.head())

Unique values in Gender: ['Male' 'Female']
Unique values in Subscription_Type: ['Basic' 'Premium' 'Enterprise']
Unique values in Last_Interaction_Type: ['Neutral' 'Negative' 'Positive']


Unnamed: 0,Age,Gender,Location,Subscription_Type,Account_Age_Months,Monthly_Spending,Total_Usage_Hours,Support_Calls,Late_Payments,Streaming_Usage,Discount_Used,Satisfaction_Score,Last_Interaction_Type,Complaint_Tickets,Promo_Opted_In
0,19,0,Illinois,0.0,50,152.44,416,5,2,61,76,3,0,0,1
1,41,0,California,1.0,14,113.34,36,5,1,17,90,5,1,3,0
2,44,1,Florida,0.0,2,168.39,207,3,1,85,12,6,0,3,0
3,21,0,Florida,0.0,55,197.12,379,4,3,54,32,4,2,3,1
4,65,0,New York,1.0,12,84.46,475,5,4,82,62,1,0,0,0


Unnamed: 0,Churn
0,1
1,0
2,1
3,0
4,1


In [None]:
display(X.isnull().sum())

Unnamed: 0,0
Age,0
Gender,0
Location,0
Subscription_Type,817
Account_Age_Months,0
Monthly_Spending,0
Total_Usage_Hours,0
Support_Calls,0
Late_Payments,0
Streaming_Usage,0


In [None]:
X['Subscription_Type'] = X['Subscription_Type'].fillna(X['Subscription_Type'].median())
display(X.isnull().sum())

Unnamed: 0,0
Age,0
Gender,0
Location,0
Subscription_Type,0
Account_Age_Months,0
Monthly_Spending,0
Total_Usage_Hours,0
Support_Calls,0
Late_Payments,0
Streaming_Usage,0


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (6400, 15)
X_test shape: (1600, 15)
y_train shape: (6400,)
y_test shape: (1600,)


In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Define the numerical and categorical features
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = ['Location'] # Only 'Location' is left as categorical

# Remove 'Customer_ID' from numerical features
if 'Customer_ID' in numerical_features:
    numerical_features.remove('Customer_ID')

# Create transformers for numerical and categorical features
numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # Handle unseen categories in test set

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Apply preprocessing to the training data
X_train_processed = preprocessor.fit_transform(X_train)

# Apply preprocessing to the testing data
X_test_processed = preprocessor.transform(X_test)

# Let's display the processed data shape to confirm
print("Shape of processed training data:", X_train_processed.shape)
print("Shape of processed testing data:", X_test_processed.shape)

# Apply SMOTE to the processed training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

print("Shape of original processed training data:", X_train_processed.shape)
print("Shape of balanced training data:", X_train_balanced.shape)

Shape of processed training data: (6400, 19)
Shape of processed testing data: (1600, 19)
Shape of original processed training data: (6400, 19)
Shape of balanced training data: (8818, 19)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Define the numerical and categorical features
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = ['Location'] # Only 'Location' is left as categorical

# Remove 'Customer_ID' from numerical features
if 'Customer_ID' in numerical_features:
    numerical_features.remove('Customer_ID')

# Create transformers for numerical and categorical features
numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # Handle unseen categories in test set

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Apply preprocessing to the training data
X_train_processed = preprocessor.fit_transform(X_train)

# Apply preprocessing to the testing data
X_test_processed = preprocessor.transform(X_test)

# Apply SMOTE to the processed training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)


# Create and train the AdaBoost classifier on the balanced data
# Added n_estimators and learning_rate parameters
adaboost_model_balanced = AdaBoostClassifier(n_estimators=700, learning_rate=0.5, random_state=42)
adaboost_model_balanced.fit(X_train_balanced, y_train_balanced)

# Make predictions on the processed test data
y_pred_adaboost_balanced = adaboost_model_balanced.predict(X_test_processed)

# Generate and display the classification report
report_adaboost_balanced = classification_report(y_test, y_pred_adaboost_balanced)
print("Classification report for AdaBoost model trained on balanced data:\n", report_adaboost_balanced)

Classification report for AdaBoost model trained on balanced data:
               precision    recall  f1-score   support

           0       0.68      0.97      0.80      1086
           1       0.33      0.03      0.05       514

    accuracy                           0.67      1600
   macro avg       0.51      0.50      0.43      1600
weighted avg       0.57      0.67      0.56      1600



We can use `GridSearchCV` to find the best hyperparameters for the AdaBoost model. We'll define a grid of hyperparameters to search over.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200, 500, 700],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}

# Create a GridSearchCV object
# We'll use the AdaBoostClassifier with the balanced training data
grid_search_adaboost = GridSearchCV(estimator=AdaBoostClassifier(random_state=42),
                                    param_grid=param_grid,
                                    scoring='f1', # Use f1-score as the scoring metric, which is suitable for imbalanced data
                                    cv=5,          # Use 5-fold cross-validation
                                    n_jobs=-1)     # Use all available cores

# Fit the GridSearchCV to the balanced training data
# Note: This might take some time depending on the size of the grid and data
print("Starting GridSearchCV...")
grid_search_adaboost.fit(X_train_balanced, y_train_balanced)
print("GridSearchCV finished.")

# Print the best parameters and the best score
print("Best parameters found: ", grid_search_adaboost.best_params_)
print("Best cross-validation f1-score: ", grid_search_adaboost.best_score_)

# Get the best model
best_adaboost_model = grid_search_adaboost.best_estimator_

# Evaluate the best model on the test data
y_pred_best_adaboost = best_adaboost_model.predict(X_test_processed)

# Generate and display the classification report for the best model
report_best_adaboost = classification_report(y_test, y_pred_best_adaboost)
print("Classification report for the best AdaBoost model on test data:\n", report_best_adaboost)

Starting GridSearchCV...
GridSearchCV finished.
Best parameters found:  {'learning_rate': 0.01, 'n_estimators': 100}
Best cross-validation f1-score:  0.6665667999896747
Classification report for the best AdaBoost model on test data:
               precision    recall  f1-score   support

           0       0.64      0.10      0.17      1086
           1       0.32      0.88      0.47       514

    accuracy                           0.35      1600
   macro avg       0.48      0.49      0.32      1600
weighted avg       0.54      0.35      0.26      1600



In [None]:
# Load the test data
test_df = pd.read_csv('test.csv')

# Preprocess the test data

# Map 'Gender', 'Subscription_Type', and 'Last_Interaction_Type' to numerical values
test_df['Gender'] = test_df['Gender'].map({'Male': 0, 'Female': 1})
test_df['Subscription_Type'] = test_df['Subscription_Type'].map({'Basic': 0, 'Premium': 1})
test_df['Last_Interaction_Type'] = test_df['Last_Interaction_Type'].map({'Neutral': 0, 'Negative': 1, 'Positive': 2})

# Handle missing values in 'Subscription_Type' using the median from the training data
# The median is calculated from the training data 'X'
test_df['Subscription_Type'] = test_df['Subscription_Type'].fillna(X['Subscription_Type'].median())

# Apply the same preprocessing (scaling and one-hot encoding for Location) to the test data
# Exclude 'Customer_ID' from the test data before transforming
test_df_processed = preprocessor.transform(test_df.drop('Customer_ID', axis=1))


# Make predictions on the processed test data using the AdaBoost model
test_preds_proba = adaboost_model_balanced.predict_proba(test_df_processed)[:, 1]

# Create the submission DataFrame
submission = pd.DataFrame({
    'Customer_ID': test_df['Customer_ID'],
    'Churn_Probability': test_preds_proba
})

# Save the submission file
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")

Submission file 'submission.csv' created successfully!


Next, you'll need to get your Kaggle API credentials. Follow these steps:

1. Go to your Kaggle account settings (`https://www.kaggle.com/<username>/account`).
2. Scroll down to the "API" section.
3. Click on "Create New API Token". This will download a `kaggle.json` file to your computer.

Now, you need to upload the `kaggle.json` file to your Colab environment and store the credentials securely.

1. In the Colab file browser (the folder icon on the left sidebar), click the "Upload to session storage" icon and upload the `kaggle.json` file you just downloaded.
2. In the Colab secrets manager (the key icon on the left sidebar), add two secrets:
    - Name: `KAGGLE_USERNAME`, Value: Your Kaggle username (from `kaggle.json`)
    - Name: `KAGGLE_KEY`, Value: Your Kaggle API key (from `kaggle.json`)

Finally, you can use the Kaggle API to submit your file. You'll need the competition name, which you can find in the URL of the Kaggle challenge page.

In [None]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi
from google.colab import userdata

# Set up Kaggle credentials from Colab secrets
os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')

# Authenticate with Kaggle
api = KaggleApi()
api.authenticate()

# Replace 'your-competition-name' with the actual competition name
competition_name = 'ultimate-customer-churn-prediction-challenge'
submission_file = 'submission.csv'
message = 'Submission from Colab notebook'

# Submit the file
#api.competition_submit(submission_file, message, competition_name)

print(f"Submitted {submission_file} to competition {competition_name} with message: '{message}'")

Submitted submission.csv to competition ultimate-customer-churn-prediction-challenge with message: 'Submission from Colab notebook'
