In [20]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate simulated data
num_records = 1000
data = {
    'CustomerID': range(1, num_records + 1),
    'Age': np.random.randint(18, 70, num_records),
    'Tenure': np.random.randint(1, 72, num_records),
    'ServiceTier': np.random.choice([1, 2, 3], num_records),
    'MonthlyCharge': np.random.uniform(30, 120, num_records),
    'TotalCharges': lambda x: x['MonthlyCharge'] * x['Tenure'],  # Placeholder for post-generation calculation
    'NumSupportTickets': np.random.poisson(2, num_records),
    'Churn': np.random.choice([0, 1], num_records, p=[0.7, 0.3]),
}

df = pd.DataFrame(data)
df['TotalCharges'] = df['MonthlyCharge'] * df['Tenure']  # Calculate TotalCharges

# Save to CSV
csv_file_path = 'customer_churn_data.csv'
df.to_csv(csv_file_path, index=False)

csv_file_path


'customer_churn_data.csv'

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the data
df = pd.read_csv('customer_churn_data.csv')

# Basic EDA
print(df.head())
print(df.describe())

   CustomerID  Age  Tenure  ServiceTier  MonthlyCharge  TotalCharges  \
0           1   56      15            2      55.203360    828.050395   
1           2   69      64            3     107.250751   6864.048058   
2           3   46      28            2      56.299641   1576.389941   
3           4   32      39            2     111.969315   4366.803267   
4           5   60      57            1      97.856571   5577.824565   

   NumSupportTickets  Churn  
0                  4      0  
1                  1      0  
2                  2      1  
3                  1      0  
4                  3      0  
        CustomerID         Age      Tenure  ServiceTier  MonthlyCharge  \
count  1000.000000  1000.00000  1000.00000  1000.000000    1000.000000   
mean    500.500000    43.81900    35.45900     1.982000      74.782397   
std     288.819436    14.99103    20.36819     0.803942      26.066775   
min       1.000000    18.00000     1.00000     1.000000      30.001047   
25%     250.75000

In [22]:
# Define features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Splitting dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for numerical features
numeric_features = ['Age', 'Tenure', 'MonthlyCharge', 'TotalCharges', 'NumSupportTickets']
numeric_transformer = StandardScaler()

# Preprocessing for categorical features
categorical_features = ['ServiceTier']
categorical_transformer = OneHotEncoder()

# Combining preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [23]:
# Creating a pipeline that first preprocesses the data and then trains the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

# Training the model
pipeline.fit(X_train, y_train)

In [24]:
# Predicting on the test set
y_pred = pipeline.predict(X_test)

# Evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.89      0.77       140
           1       0.16      0.05      0.08        60

    accuracy                           0.64       200
   macro avg       0.42      0.47      0.42       200
weighted avg       0.53      0.64      0.56       200

[[124  16]
 [ 57   3]]


In [11]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate simulated data
num_properties = 1000
locations = ['Downtown', 'Suburb', 'Rural']
data = {
    'PropertyID': range(1, num_properties + 1),
    'Location': np.random.choice(locations, num_properties),
    'Size_sqft': np.random.uniform(800, 5000, num_properties),
    'Bedrooms': np.random.randint(1, 6, num_properties),
    'Bathrooms': np.random.randint(1, 4, num_properties),
    'Age_years': np.random.randint(0, 100, num_properties),
    'Amenities': np.random.randint(0, 5, num_properties),
    'MarketValue': np.random.uniform(100, 1500, num_properties)  # Market value in thousands of dollars
}

df = pd.DataFrame(data)

# Save to CSV
csv_file_path = 'real_estate_data.csv'
df.to_csv(csv_file_path, index=False)

csv_file_path


'real_estate_data.csv'

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Separate features and target
X = df_real_estate.drop('MarketValue', axis=1)
y = df_real_estate['MarketValue']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: Encode categorical variables
categorical_features = ['Location']
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)
], remainder='passthrough')

# Define the XGBoost regressor model
xgb_model = XGBRegressor(n_estimators=50, random_state=42, verbosity=0)  # Configuration adjusted for quicker execution

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

NameError: name 'df_real_estate' is not defined

In [16]:
import pandas as pd
import numpy as np

np.random.seed(42)  # For reproducibility

# Generate simulated data for 1000 customers
num_customers = 1000
data = {
    'Age': np.random.randint(18, 70, num_customers),
    'AnnualIncome_k$': np.random.randint(20, 120, num_customers),
    'SpendingScore': np.random.randint(1, 100, num_customers),
    'TotalPurchases': np.random.randint(1, 50, num_customers),
    'DaysSinceLastPurchase': np.random.randint(1, 365, num_customers)
}
df_customers = pd.DataFrame(data)

In [17]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming our dataframe is titled "df_customers"
# Normalize the data
scaler = StandardScaler()
df_normalized = scaler.fit_transform(df_customers)

# Apply K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(df_normalized)
df_customers['Cluster'] = clusters

# Calculate cluster means and global means
cluster_means = df_customers.groupby('Cluster').mean()
global_means = df_customers.drop('Cluster', axis=1).mean()

# Calculate relative importance of cluster features relative to global means
relative_importance = cluster_means / global_means - 1

# Plotting the heatmap with reversed axes: Clusters on the x-axis and features on the y-axis
plt.figure(figsize=(12, 6))
sns.heatmap(relative_importance.T, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Importance by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Feature')
plt.yticks(rotation=0)  # Ensure feature names are horizontally aligned for better readability
plt.show()



AttributeError: 'NoneType' object has no attribute 'split'

In [18]:
df_normalized

array([[ 0.81295917,  1.68172736,  0.98524506, -0.39389617,  0.34068005],
       [ 1.68057836, -1.19935231,  1.12249928, -1.02278839, -0.55649256],
       [ 0.1455598 ,  0.4812775 ,  0.8136773 , -0.67340382, -1.17034751],
       ...,
       [ 1.2133988 ,  0.75566604, -1.17650881,  0.37474989, -0.07485253],
       [-0.5885795 , -1.33654658, -0.49023774,  0.58438063, -1.45366518],
       [ 0.74621924,  0.4812775 ,  0.16171978,  0.93376521,  0.68066125]])