In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Preprocessing: load the churn CSV, drop identifier columns, impute and encode,
# then produce a standardized 80/20 train/test split.

# Load the dataset into a Pandas DataFrame
data = pd.read_csv('customer_churn_large_dataset.csv')

# Remove identifier columns that are not used as model features
data = data.drop(columns=['CustomerID', 'Name'])

# Initial exploration
print("Dataset shape:", data.shape)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
data.info()
print(data.head())

# Handle missing data: fill missing Monthly_Bill values with the column mean.
# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split below, so the mean also reflects test rows. Consider fitting
# it on the training split only if this column ever contains missing values.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns an (n_rows, 1) array; flatten it so a plain 1-D vector
# is assigned to the single column.
data['Monthly_Bill'] = imputer.fit_transform(data[['Monthly_Bill']].to_numpy()).ravel()

# Encode categorical variables (one-hot, dropping the first level of each)
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)

# Split the data into features and target variable
X = data.drop(columns='Churn')
y = data['Churn']

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (standardization): statistics are learned from the training
# split only and then applied to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)




Dataset shape: (100000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Age                         100000 non-null  int64  
 1   Gender                      100000 non-null  object 
 2   Location                    100000 non-null  object 
 3   Subscription_Length_Months  100000 non-null  int64  
 4   Monthly_Bill                100000 non-null  float64
 5   Total_Usage_GB              100000 non-null  int64  
 6   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 5.3+ MB
None
   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  \
0   63    Male  Los Angeles                          17         73.36   
1   62  Female     New York                           1         48.76   
2   24  Female  Los Angeles                           5 

In [3]:
# Feature engineering: derive two ratio features from the subscription-length
# and total-usage columns, then remove those two source columns from `data`.
#
# The guard makes the cell safe to re-run: once the source columns have been
# replaced by the derived ones, running it again leaves `data` untouched
# instead of raising a KeyError.
source_cols = ['Subscription_Length_Months', 'Total_Usage_GB']
if set(source_cols).issubset(data.columns):
    # A subscription length of 0 would turn both ratios into inf (or 0/0);
    # mask it to NaN so the result is an ordinary missing value instead.
    months = data['Subscription_Length_Months'].mask(lambda s: s == 0)

    data = data.assign(
        # NOTE(review): called "tenure" but computed as the reciprocal of the
        # subscription length, so it shrinks for longer subscriptions -
        # confirm this is the intended definition.
        CustomerTenure=1 / months,
        # Average data usage per subscribed month.
        Average_Usage_Per_Month=data['Total_Usage_GB'] / months,
    ).drop(columns=source_cols)

# `data` now carries 'CustomerTenure' and 'Average_Usage_Per_Month'
print(data.head())

   Age  Monthly_Bill  Churn  Gender_Male  Location_Houston  \
0   63         73.36      0         True             False   
1   62         48.76      0        False             False   
2   24         85.47      0        False             False   
3   36         97.94      1        False             False   
4   46         58.14      0        False             False   

   Location_Los Angeles  Location_Miami  Location_New York  CustomerTenure  \
0                  True           False              False        0.058824   
1                 False           False               True        1.000000   
2                  True           False              False        0.200000   
3                 False            True              False        0.333333   
4                 False            True              False        0.052632   

   Average_Usage_Per_Month  
0                13.882353  
1               172.000000  
2                92.000000  
3                99.000000  
4            

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
# OneHotEncoder is used below; importing it here keeps the cell runnable on its
# own instead of relying on an import made by an earlier cell.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Baseline model: logistic regression on one-hot encoded, standardized features.

# Load the dataset from the CSV (encoding and scaling are done in this cell)
data = pd.read_csv('customer_churn_large_dataset.csv')
# Remove identifier columns that are not used as model features
data = data.drop(columns=['CustomerID', 'Name'])

# Split the data into features (X) and the target variable (y)
X = data.drop(columns='Churn')
y = data['Churn']

# Encode categorical variables (one-hot, dropping the first level of each)
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_cols = ['Gender', 'Location']
X_encoded = encoder.fit_transform(X[categorical_cols])

# Replace the original categorical columns with the one-hot encoded versions.
# pd.concat(axis=1) aligns rows by index label, so the encoded frame is given
# X's index explicitly rather than relying on both sides sharing a default index.
encoded_df = pd.DataFrame(
    X_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index,
)
X = pd.concat([X.drop(columns=categorical_cols), encoded_df], axis=1)

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (standardization): fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model building: logistic regression
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Model evaluation; ROC AUC is computed from the predicted probability of the
# positive class (second predict_proba column), not from the hard labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# Print model evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")





Accuracy: 0.50
Precision: 0.50
Recall: 0.38
F1 Score: 0.43
ROC AUC: 0.50


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Random forest classifier: read the CSV, one-hot encode the categorical
# columns, split 80/20, standardize, fit, and report test-set metrics.

# Read the CSV and discard the identifier columns in one step
data = pd.read_csv('customer_churn_large_dataset.csv').drop(columns=['CustomerID', 'Name'])

# The target is the Churn column; every other column is a feature
y = data['Churn']
X = data.drop(columns=['Churn'])

# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['Gender', 'Location']
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)

# Swap the raw categorical columns for their encoded counterparts
remaining_part = X.drop(columns=categorical_cols)
encoded_part = pd.DataFrame(X_encoded, columns=encoded_names)
X = pd.concat([remaining_part, encoded_part], axis=1)

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize: scaling statistics come from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the random forest (fit() returns the estimator itself)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard labels for the threshold metrics, positive-class probability for ROC AUC
y_pred = rf_model.predict(X_test)
positive_proba = rf_model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out rows
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, positive_proba)

# Report the metrics to two decimals
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")




Accuracy: 0.49
Precision: 0.49
Recall: 0.47
F1 Score: 0.48
ROC AUC: 0.49


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Gradient boosting classifier: read the CSV, one-hot encode the categorical
# columns, split 80/20, standardize, fit, and report test-set metrics.

# Read the CSV, then keep everything except the identifier columns
raw_frame = pd.read_csv('customer_churn_large_dataset.csv')
data = raw_frame.drop(columns=['CustomerID', 'Name'])

# Features are all columns but Churn; Churn itself is the target
X = data.drop(columns='Churn')
y = data['Churn']

# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['Gender', 'Location']
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_cols])

# Rebuild X from the non-categorical columns plus the encoded indicator columns
encoded_frame = pd.DataFrame(
    X_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
)
X = pd.concat([X.drop(columns=categorical_cols), encoded_frame], axis=1)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the gradient boosting model (fit() returns the estimator itself)
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict hard labels for the test rows
y_pred = gb_model.predict(X_test)

# Threshold metrics use the hard labels; ROC AUC uses the predicted
# probability of the positive class (second predict_proba column)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
positive_proba = gb_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, positive_proba)

# Report the metrics to two decimals
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")




Accuracy: 0.50
Precision: 0.50
Recall: 0.45
F1 Score: 0.47
ROC AUC: 0.50


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# XGBoost classifier: read the CSV, one-hot encode the categorical columns,
# split 80/20, standardize, fit, and report test-set metrics.

# Read the CSV and discard the identifier columns
data = pd.read_csv('customer_churn_large_dataset.csv')
data = data.drop(columns=['CustomerID', 'Name'])

# Separate the Churn target from the feature columns
y = data['Churn']
X = data.drop(columns=['Churn'])

# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['Gender', 'Location']
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_cols])

# Replace the categorical columns with their encoded indicator columns
indicator_names = encoder.get_feature_names_out(categorical_cols)
indicator_frame = pd.DataFrame(X_encoded, columns=indicator_names)
X = pd.concat([X.drop(columns=categorical_cols), indicator_frame], axis=1)

# Keep 20% of the rows aside for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize: fit the scaler on the training rows, apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Hard labels for the threshold metrics, positive-class probability for ROC AUC
y_pred = xgb_model.predict(X_test)
positive_proba = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out rows
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, positive_proba)

# Report the metrics to two decimals
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")




  if is_sparse(data):


Accuracy: 0.50
Precision: 0.50
Recall: 0.49
F1 Score: 0.49
ROC AUC: 0.50


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils import set_random_seed

# Neural-network model: integer-encode the categorical columns, split 80/20,
# standardize, train a small dense network, and report test-set metrics.

# Seed the Python, NumPy and backend random generators so weight initialization
# and batch shuffling are repeatable, matching the fixed random_state used for
# the train/test split.
set_random_seed(42)

# Load the dataset from the CSV (encoding and scaling are done in this cell)
data = pd.read_csv('customer_churn_large_dataset.csv')
# Remove identifier columns that are not used as model features
data = data.drop(columns=['CustomerID', 'Name'])

# Split the data into features (X) and the target variable (y)
X = data.drop(columns='Churn')
y = data['Churn']

# Encode categorical variables as integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# category -> code mapping of every column stays available (a single shared
# encoder would only remember the last column it was fitted on).
# NOTE(review): integer codes impose an arbitrary order on the categories;
# one-hot encoding is the usual choice for nominal features - confirm this
# encoding is intended.
categorical_cols = ['Gender', 'Location']
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (standardization): fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model building: two hidden ReLU layers and a single sigmoid output unit
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; verbose=2 logs one summary line per epoch instead of a
# progress bar, which keeps the cell output readable over 100 epochs.
model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=2)

# Make predictions on the test set.
# model.predict returns one sigmoid output per row as an (n_rows, 1) array;
# threshold at 0.5 and flatten so y_pred is a 1-D label vector.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Model evaluation; ROC AUC is computed from the predicted probabilities
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Print model evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [5]:
# Persist the trained model to disk under a fixed file name.
# NOTE(review): the .h5 suffix presumably selects Keras' HDF5 format, which
# looks like the source of the save_model warning emitted by this cell -
# confirm before switching formats, since the file name may be relied on elsewhere.
model.save(filepath='customer_churn_nn_model.h5')


  saving_api.save_model(
