<a href="https://colab.research.google.com/github/aaron123908/Module2/blob/main/Coding_Exercise_ML_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

# --- Synthetic housing data ---------------------------------------------------
# Seed the legacy global NumPy RNG so every draw below is reproducible.
np.random.seed(42)
n_samples = 200

# Draw order matters for reproducibility: square footage first, then location.
square_footage_draws = np.random.randint(800, 4500, n_samples)
location_draws = np.random.choice(['Downtown', 'Suburb', 'Rural', 'Midtown'], n_samples)
data = {'square_footage': square_footage_draws, 'location': location_draws}

# Price model: footage * base rate * per-location multiplier, plus Gaussian noise.
location_multipliers = {'Downtown': 1.5, 'Midtown': 1.2, 'Suburb': 1.0, 'Rural': 0.8}
base_price_per_sqft = 150
noise = np.random.normal(0, 25000, n_samples)  # random variance, std dev 25,000

df = pd.DataFrame(data)
location_factor = df['location'].map(location_multipliers)
df['price'] = df['square_footage'] * base_price_per_sqft * location_factor + noise

# Feature matrix (two predictors) and regression target.
feature_columns = ['square_footage', 'location']
X = df[feature_columns]
y = df['price']

# --- Regression pipeline ------------------------------------------------------
# One-hot encode the location (the first category is dropped and acts as the
# baseline) and standardize the square footage.
location_encoder = OneHotEncoder(drop='first', sparse_output=False)
footage_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', location_encoder, ['location']),
        ('num', footage_scaler, ['square_footage']),
    ]
)

# Chain preprocessing and the linear model so both are fit on the same rows.
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Model R² Score: {test_r2:.3f}")
print(f"Mean Absolute Error: ${test_mae:,.2f}")

# Price a single unseen listing with the fitted pipeline.
new_house = pd.DataFrame({'square_footage': [2000], 'location': ['Downtown']})
predicted_price = model.predict(new_house)
print(f"\nPredicted price for a 2000 sq ft house in Downtown: ${predicted_price[0]:,.2f}")

# Read the feature names from the fitted ColumnTransformer instead of rebuilding
# the list by hand, so the labels always follow the same order as the coefficients.
# get_feature_names_out() prefixes each name with its transformer name
# ("cat__", "num__"); strip that prefix so the printed labels stay unchanged.
prefixed_names = model.named_steps['preprocessor'].get_feature_names_out()
feature_names = [name.split('__', 1)[-1] for name in prefixed_names]
coefficients = model.named_steps['regressor'].coef_

# NOTE: square_footage goes through StandardScaler, so its coefficient is the price
# change per one standard deviation of square footage, not per square foot. The
# location coefficients are offsets relative to the category dropped by the encoder.
print("\nModel Coefficients (Impact on Price):")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:,.2f}")

Model R² Score: 0.970
Mean Absolute Error: $33,169.61

Predicted price for a 2000 sq ft house in Downtown: $505,454.26

Model Coefficients (Impact on Price):
location_Midtown: -138,810.16
location_Rural: -288,862.39
location_Suburb: -216,614.52
square_footage: 176,617.96


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# --- Synthetic churn data -----------------------------------------------------
# Seed the legacy global NumPy RNG; the draws below must stay in this order.
np.random.seed(42)
n_samples = 500

data = dict(
    age=np.random.randint(18, 80, n_samples),
    monthly_usage_hours=np.random.uniform(5, 100, n_samples),
    purchase_amount=np.random.uniform(20, 500, n_samples),
    customer_service_calls=np.random.randint(0, 10, n_samples),
    region=np.random.choice(['North', 'South', 'West', 'East'], n_samples),
)

df = pd.DataFrame(data)

# Churn log-odds rise with age and service calls and fall with usage hours.
# purchase_amount and region do not enter the formula.
age_term = 0.02 * df['age']
usage_term = 0.05 * df['monthly_usage_hours']
calls_term = 0.4 * df['customer_service_calls']
logit = age_term - usage_term + calls_term - 2.0

# Sigmoid turns log-odds into a probability; one uniform draw per row decides the label.
prob = 1 / (1 + np.exp(-logit))
uniform_draws = np.random.rand(n_samples)
df['churn'] = (prob > uniform_draws).astype(int)

# Everything except the label is a predictor.
X = df.drop(columns='churn')
y = df['churn']

# --- Churn classification pipeline ---------------------------------------------
# Standardize the numeric columns and one-hot encode the region (the first
# category is dropped and acts as the baseline).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'monthly_usage_hours', 'purchase_amount', 'customer_service_calls']),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), ['region'])
    ])

# class_weight='balanced' re-weights the classes inversely to their frequency.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, class_weight='balanced'))
])

# Stratify on the label so the train and test sets keep the same churn rate.
# With only 100 test rows and a minority churn class, an unstratified split can
# leave the test set with a noticeably different class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = model.predict(X_test)
print("Model Performance Report:")
print(classification_report(y_test, y_pred))

# A single hypothetical customer to score with the fitted pipeline.
new_customer = pd.DataFrame({
    'age': [35],
    'monthly_usage_hours': [20],
    'purchase_amount': [150],
    'customer_service_calls': [5],
    'region': ['West']
})

# predict_proba returns one row per customer; column 1 is the churn (label 1) class.
# NOTE: this rebinds `prob`, which held the simulated per-row probabilities earlier
# in this cell.
prob = model.predict_proba(new_customer)[0][1]
print(f"Churn Probability for new customer: {prob:.2%}")

# Read the feature names from the fitted ColumnTransformer instead of repeating the
# column lists by hand, so the labels always follow the coefficient order.
# get_feature_names_out() prefixes each name with its transformer name
# ("num__", "cat__"); strip that prefix so the printed labels stay unchanged.
prefixed_names = model.named_steps['preprocessor'].get_feature_names_out()
feature_names = [prefixed.split('__', 1)[-1] for prefixed in prefixed_names]
coefficients = model.named_steps['classifier'].coef_[0]

# Numeric coefficients are per standard deviation because of the StandardScaler step.
print("\nFeature Impact (Coefficients):")
for name, coef in zip(feature_names, coefficients):
    print(f"{name:>25}: {coef:.3f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# --- Synthetic customer segments ----------------------------------------------
# Seed the legacy global NumPy RNG; the draws below must stay in this order.
np.random.seed(42)
n_samples = 300
# Three equally sized segments. Deriving the size from n_samples keeps the two in
# sync instead of hard-coding 100 in nine places.
n_per_segment = n_samples // 3

# Segment 1: high spend, frequent purchases, younger.
c1 = {
    'annual_spending': np.random.normal(1500, 200, n_per_segment),
    'purchase_frequency': np.random.normal(20, 3, n_per_segment),
    'age': np.random.normal(25, 5, n_per_segment)
}

# Segment 2: medium spend, moderate frequency, older.
c2 = {
    'annual_spending': np.random.normal(800, 150, n_per_segment),
    'purchase_frequency': np.random.normal(8, 2, n_per_segment),
    'age': np.random.normal(55, 8, n_per_segment)
}

# Segment 3: low spend, infrequent purchases, mid-range age.
c3 = {
    'annual_spending': np.random.normal(300, 100, n_per_segment),
    'purchase_frequency': np.random.normal(4, 1.5, n_per_segment),
    'age': np.random.normal(35, 10, n_per_segment)
}

# Stack the segments and shuffle the rows. sample() is called without random_state,
# so it draws from the global NumPy RNG seeded above and stays reproducible.
df = pd.concat([pd.DataFrame(c1), pd.DataFrame(c2), pd.DataFrame(c3)]).sample(frac=1).reset_index(drop=True)
# Size the region column from the frame itself so it can never mismatch the row
# count (e.g. when n_samples is not a multiple of three).
df['region'] = np.random.choice(['North', 'South', 'West', 'East'], len(df))

# Only the numeric behaviour columns are clustered; region is left out.
features = ['annual_spending', 'purchase_frequency', 'age']
X = df[features]

# Put all features on the same scale (zero mean, unit variance) before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: record the inertia (sum of squared distances of samples to their
# closest cluster centre) for K = 1..10.
inertia = []
K_range = range(1, 11)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

# Show the curve so the choice of K can be read off the plot; previously the
# inertia values were computed and then never displayed.
fig, ax = plt.subplots()
ax.plot(list(K_range), inertia, marker='o')
ax.set(
    title='Elbow Method for K-Means',
    xlabel='Number of clusters (K)',
    ylabel='Inertia',
)
ax.set_xticks(list(K_range))
plt.show()

# Number of clusters for the final model.
# NOTE(review): presumably chosen from the elbow curve / the three generated
# segments — confirm if the data changes.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Report the silhouette score using optimal_k rather than a hard-coded "3",
# so the message stays correct if the cluster count is changed.
sil_score = silhouette_score(X_scaled, df['cluster'])
print(f"Silhouette Score for K={optimal_k}: {sil_score:.3f}")

# Per-cluster feature means in the original (unscaled) units, rounded to 2 decimals.
cluster_summary = df.groupby('cluster')[features].mean().round(2)
print("\nCluster Characteristics (Averages):")
print(cluster_summary)

def _segment_strategy(avg_spend, avg_freq):
    """Return the recommendation text for a cluster's average spend and frequency.

    The rules are checked in order and the first match wins.
    """
    if avg_spend > 1200:
        return "VIP Segment. Focus on premium retention and early access."
    if avg_freq > 15:
        return "High Engagement. Offer subscription models to lock in value."
    if avg_spend < 500:
        return "Price Sensitive. Use discount-driven re-engagement."
    return "Steady Customers. Standard marketing mix."


print("\n--- Strategic Recommendations ---")
for cluster in range(optimal_k):
    # Pull this cluster's averages from the summary table.
    cluster_row = cluster_summary.loc[cluster]
    avg_spend = cluster_row['annual_spending']
    avg_freq = cluster_row['purchase_frequency']
    avg_age = cluster_row['age']

    strategy = f"Cluster {cluster} (Avg Age {avg_age}): " + _segment_strategy(avg_spend, avg_freq)
    print(strategy)