In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Data Aggregation
# Build a synthetic customer table. The legacy global NumPy seed is fixed so
# the same rows are generated on every run.
N_CUSTOMERS = 1000
np.random.seed(42)

# The dict values are evaluated top to bottom, so the random draws happen in
# the order age -> tenure -> churn. Keep that order: changing it changes the
# generated data even with the same seed.
data = {
    'customer_id': range(N_CUSTOMERS),
    # randint's upper bound is exclusive: ages fall in 18..79
    'age': np.random.randint(18, 80, N_CUSTOMERS),
    # tenure falls in 1..9
    'tenure': np.random.randint(1, 10, N_CUSTOMERS),
    # churn flag: 0 with probability 0.8, 1 with probability 0.2
    'churn': np.random.choice([0, 1], N_CUSTOMERS, p=[0.8, 0.2]),
}
df = pd.DataFrame(data)

# Mean age and mean tenure per churn class, with `churn` restored as a
# regular column instead of the index.
churn_agg = df.groupby('churn')[['age', 'tenure']].mean().reset_index()

print("Aggregated Data:")
print(churn_agg)

# 2. Data Splitting
# Features (X) are the two numeric columns; the target (y) is the churn flag.
feature_columns = ['age', 'tenure']
X = df[feature_columns]
y = df['churn']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition.
print("\nData Split Sizes:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# 3. Model Training
# Logistic Regression is the classifier; fit() returns the estimator itself,
# so construction and training on the training split are chained.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out test split.
# NOTE(review): the churn labels in this notebook are drawn independently of
# age/tenure with roughly 80% zeros, so an accuracy near 0.8 is what always
# predicting "no churn" would score -- compare against that baseline before
# reading anything into this number.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Accuracy: {accuracy:.4f}")

Aggregated Data:
   churn        age    tenure
0      0  49.929630  4.981481
1      1  49.547368  4.784211

Data Split Sizes:
Training set: 800 samples
Testing set: 200 samples

Model Performance:
Accuracy: 0.8250
