In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [49]:
# Load the original datasets
social_profiles_df = pd.read_csv('customer_social_profiles.csv')
transactions_df = pd.read_csv('customer_transactions.csv')

# Correctly convert key columns to string type for comparison
social_profiles_df['customer_id_new'] = social_profiles_df['customer_id_new'].astype(str)
transactions_df['customer_id_legacy'] = transactions_df['customer_id_legacy'].astype(str)

# Check for common IDs
social_ids = set(social_profiles_df['customer_id_new'])
transaction_ids = set(transactions_df['customer_id_legacy'])
common_ids = social_ids.intersection(transaction_ids)

print("--- Data Integrity Check ---")
if not common_ids:
    print("✅ Confirmed: There are no common customer IDs between the two files.")
    print("We will proceed by creating a simulated dataset.\n")
else:
    print(f"Found {len(common_ids)} common IDs. Direct merge is possible.")

--- Data Integrity Check ---
✅ Confirmed: There are no common customer IDs between the two files.
We will proceed by creating a simulated dataset.



In [50]:
# We'll use the social profiles as our base
merged_df = social_profiles_df.copy()

# --- NEW: Define a function to create logical patterns ---
def assign_product(row):
    if row['social_media_platform'] == 'LinkedIn' and row['purchase_interest_score'] > 4.0:
        return 'Electronics'
    elif (row['social_media_platform'] == 'TikTok' or row['social_media_platform'] == 'Facebook') and row['engagement_score'] > 75:
        return 'Clothing'
    elif row['review_sentiment'] == 'Positive' and row['engagement_score'] > 70:
        return 'Home Goods'
    elif row['purchase_interest_score'] < 2.5:
        return 'Books'
    else:
        return 'Sports'

# Apply the function to create the target variable
merged_df['product'] = merged_df.apply(assign_product, axis=1)

# Engineer the 'engagement_level' feature
bins = [0, 50, 80, 101]
labels = ['Low', 'Medium', 'High']
merged_df['engagement_level'] = pd.cut(merged_df['engagement_score'], bins=bins, labels=labels, right=False)

# Rename the customer_id column
merged_df.rename(columns={'customer_id_new': 'customer_id'}, inplace=True)

print("✅ Simulated dataset with intentional patterns created successfully.")
display(merged_df.head())

✅ Simulated dataset with intentional patterns created successfully.


Unnamed: 0,customer_id,social_media_platform,engagement_score,purchase_interest_score,review_sentiment,product,engagement_level
0,A178,LinkedIn,74,4.9,Positive,Electronics,Medium
1,A190,Twitter,82,4.8,Neutral,Sports,High
2,A150,Facebook,96,1.6,Positive,Clothing,High
3,A162,Twitter,89,2.6,Positive,Home Goods,High
4,A197,Twitter,92,2.3,Neutral,Books,High


In [51]:
# Save the final merged and cleaned dataframe
merged_df.to_csv("merged_customer_data.csv", index=False)

print("Clean, merged dataset saved to 'merged_customer_data.csv'.")

Clean, merged dataset saved to 'merged_customer_data.csv'.


In [52]:
# Select features, including our new engineered feature
features = ['engagement_score', 'purchase_interest_score', 'review_sentiment', 'social_media_platform', 'engagement_level']
target = 'product'

X = merged_df[features]
y = merged_df[target]

# Encode categorical features
X = pd.get_dummies(X, columns=['review_sentiment', 'social_media_platform', 'engagement_level'], drop_first=True)

# Encode the target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

print("Data preprocessed and ready for modeling.")

Data preprocessed and ready for modeling.


In [53]:
# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced']
}

# Set up the grid search with 3-fold cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2
)

# Fit the grid search to the data
print("--- Starting Hyperparameter Tuning ---")
grid_search.fit(X_train, y_train)

print("\n--- Tuning Complete ---")
print("Best parameters found: ", grid_search.best_params_)

--- Starting Hyperparameter Tuning ---
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimators=50; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=2, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=2, n_estimators=50; total time=   0.0s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   0.1s
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=2, n_estimators=50

In [54]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate accuracy and F1-score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Tuned Model Accuracy: {accuracy:.2f}")
print(f"Tuned Model F1-Score: {f1:.2f}\n")

# Print the classification report
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

print("--- Tuned Model Classification Report ---")
print(classification_report(y_test_labels, y_pred_labels))

Tuned Model Accuracy: 0.77
Tuned Model F1-Score: 0.74

--- Tuned Model Classification Report ---
              precision    recall  f1-score   support

       Books       0.75      0.75      0.75         4
    Clothing       0.86      0.86      0.86         7
 Electronics       1.00      1.00      1.00         1
  Home Goods       0.67      0.29      0.40         7
      Sports       0.75      1.00      0.86        12

    accuracy                           0.77        31
   macro avg       0.80      0.78      0.77        31
weighted avg       0.76      0.77      0.74        31



In [None]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate accuracy and F1-score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Tuned Model Accuracy: {accuracy:.2f}")
print(f"Tuned Model F1-Score: {f1:.2f}\n")

# Print the classification report
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

print("--- Tuned Model Classification Report ---")
print(classification_report(y_test_labels, y_pred_labels))

Tuned Model Accuracy: 0.77
Tuned Model F1-Score: 0.74

--- Tuned Model Classification Report ---
              precision    recall  f1-score   support

       Books       0.75      0.75      0.75         4
    Clothing       0.86      0.86      0.86         7
 Electronics       1.00      1.00      1.00         1
  Home Goods       0.67      0.29      0.40         7
      Sports       0.75      1.00      0.86        12

    accuracy                           0.77        31
   macro avg       0.80      0.78      0.77        31
weighted avg       0.76      0.77      0.74        31

