In [17]:
# Milestone 1: Project Predictive Model
# Objective:
# Develop a machine learning model that predicts the likelihood of a customer making a purchase using the provided customer, product, and transaction datasets.

In [19]:
# Import Libraries
import pandas as pd

In [60]:
# Step 1: Load and Explore the Datasets
# Before we can build any predictive model, we must first understand the structure and contents of the data. This is done by loading the datasets and conducting an initial exploratory data analysis (EDA).

# Load datasets
df1 = pd.read_csv('/Users/atrabaja/Documents/FinMark_MachineLearning/customernew.csv')
df2 = pd.read_csv('/Users/atrabaja/Documents/FinMark_MachineLearning/productnew.csv')
df3 = pd.read_csv('/Users/atrabaja/Documents/FinMark_MachineLearning/transactionsnew.csv')

In [25]:
# Initial Exploration
# We'll inspect the first few rows and check for missing values to understand the dataset's completeness and structure.

# Preview datasets
print(df1.head())
print(df2.head())
print(df3.head())

# Check for missing values
print(df1.isnull().sum())
print(df2.isnull().sum())
print(df3.isnull().sum())

   Company_ID        Company_Name  Company_Profit    City
0           1    Tech Enterprises         80701.0   Pasig
1           2     Global Partners         80511.0  Taguig
2           3  Quantum Associates        110664.0   Pasig
3           4       Prime Network             NaN  Taguig
4           5      Elite Ventures         69427.0  Makati
   Product_ID            Product_Name Product_Price
0         1.0      FinPredictor Suite       140,000
1         2.0  MarketMinder Analytics       168,000
2         3.0    TrendWise Forecaster       100,800
3         4.0  CustomerScope Insights       123,200
4         5.0     SalesSync Optimizer        84,000
   Transaction_ID  Company_ID  Product_ID Quantity Transaction_Date  \
0             1.0        88.0         6.0        6       26-03-2024   
1             2.0        29.0        19.0       15       09-07-2024   
2             NaN        28.0        18.0        7       13-04-2024   
3             4.0        85.0        12.0  #DIV/0!      

In [34]:
# Step 2: Data Cleaning & Preprocessing
# Data cleaning ensures the data is in a suitable format for analysis and modeling. This involves handling missing values, fixing data types, and standardizing formats.

# Handling Missing Values
# Missing values can lead to incorrect model predictions or errors during model training. Depending on the nature and significance of the missing data, we can either fill them with statistical values (mean, median) or remove the affected rows.

# For numerical columns like Company_Profit:
# Replace missing values with the median profit. Median is preferred over mean in case of outliers.
df1['Company_Profit'] = df1['Company_Profit'].fillna(df1['Company_Profit'].median())

# For transactions missing Transaction_ID:
# Transactions without an ID are incomplete and likely erroneous, so we drop them.
df3.dropna(subset=['Transaction_ID'], inplace=True)

In [36]:
# Correcting Data Types
# Ensuring data types are consistent is crucial for accurate calculations and modeling. For example, Transaction_Date should be in datetime format, and product prices should be numerical.

# Convert Transaction_Date to datetime format:
# This will allow us to extract time-based features like purchase frequency over months.
df3['Transaction_Date'] = pd.to_datetime(df3['Transaction_Date'], format='%d-%m-%Y')

# Convert Product_Price to float:
# Remove commas from price strings and convert them to numeric values.
df2['Product_Price'] = df2['Product_Price'].str.replace(',', '').astype(float)

In [38]:
# Standardizing Formats Across Datasets
# Consistent column names and formats are essential when merging datasets or conducting analysis.

# Standardize the Company_ID field across datasets:
# This ensures we can merge datasets without errors.
df1.rename(columns={'Company_ID': 'CompanyID'}, inplace=True)
df3.rename(columns={'Company_ID': 'CompanyID'}, inplace=True)

In [40]:
# Feature Engineering
# Feature engineering is the process of creating new features from the existing data to improve the performance of machine learning models.

# Customer Purchase History Features
# Understanding a customer’s purchase behavior is key to predicting future purchases. We derive features like total spending, average spending, and the number of transactions per customer..

# Total Spending Per Company:
# This helps identify big spenders who might be more likely to purchase again.
customer_spending = df3.groupby('CompanyID')['Total_Cost'].sum().reset_index()
customer_spending.rename(columns={'Total_Cost': 'Total_Spending'}, inplace=True)

# Average Spending Per Transaction:
# Companies with high average spending per transaction may have higher purchasing power.
avg_spending = df3.groupby('CompanyID')['Total_Cost'].mean().reset_index()
avg_spending.rename(columns={'Total_Cost': 'Avg_Spending'}, inplace=True)

# Number of Transactions Per Company:
# More frequent transactions may indicate a loyal customer.
transaction_count = df3.groupby('CompanyID')['Transaction_ID'].count().reset_index()
transaction_count.rename(columns={'Transaction_ID': 'Transaction_Count'}, inplace=True)

# Merge All Features Into Customer Dataset:
df1 = df1.merge(customer_spending, on='CompanyID', how='left')
df1 = df1.merge(transaction_count, on='CompanyID', how='left')
df1 = df1.merge(avg_spending, on='CompanyID', how='left')

In [42]:
# Time-Based Features
# Time-related behaviors, like purchasing trends over certain months, can indicate future buying intentions.

# Extract Month and Year from Transaction_Date:
df3['Transaction_Year'] = df3['Transaction_Date'].dt.year
df3['Transaction_Month'] = df3['Transaction_Date'].dt.month


In [44]:
# Step 4: Building the Predictive Model
# Now that the data is prepared, we can build a machine learning model to predict customer purchases.

# Define the Target Variable
# To predict whether a customer will make a purchase, we need to create a binary target variable.

# Create Has_Purchased:
# If a customer has made any transactions, label them as 1 (purchased); otherwise, 0 (not purchased).
df1['Has_Purchased'] = df1['Transaction_Count'].apply(lambda x: 1 if x > 0 else 0)

In [46]:
# Select Features and Prepare Data for Modeling
# Select features that are relevant for predicting purchase behavior.

# Define Feature Set and Target Variable:
features = ['Company_Profit', 'Total_Spending', 'Avg_Spending', 'Transaction_Count']
target = 'Has_Purchased'

X = df1[features].fillna(0)  # Replace NaN with 0
y = df1[target]

# Split Data into Training and Testing Sets:
# We’ll use 80% of the data for training and 20% for testing.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# Train the Model
# We’ll use the Random Forest Classifier, a robust algorithm for classification problems.

# Train Random Forest Model:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make Predictions and Evaluate Model Accuracy:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 1.00


In [50]:
# Step 5: Model Evaluation
# Evaluating the model helps us understand its performance and areas for improvement.

# Generate Confusion Matrix and Classification Report:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion Matrix: Shows the true positives, true negatives, false positives, and false negatives.
# Precision, Recall, F1-Score: Provides deeper insight into model performance.

[[20]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20





In [56]:
#  Ensure Both Classes Are Represented in Test Set
# Ensure the data splitting process includes samples from both classes (0 and 1) in the test set using stratification.
from sklearn.model_selection import train_test_split

# Ensure stratified split to maintain class distribution in both train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Add labels Parameter in the Confusion Matrix
# Specify the class labels in the confusion matrix.
from sklearn.metrics import classification_report, confusion_matrix

# Explicitly define all possible class labels
labels = [0, 1]  # Assuming 0 = No Purchase, 1 = Purchase

print(confusion_matrix(y_test, y_pred, labels=labels))
print(classification_report(y_test, y_pred, labels=labels))

[[ 0  0]
 [ 0 20]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       0.50      0.50      0.50        20
weighted avg       1.00      1.00      1.00        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [58]:
# Check class distribution
print(y.value_counts())

Has_Purchased
1    100
Name: count, dtype: int64


In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Load and Explore the Datasets
df1 = pd.read_csv('/Users/atrabaja/Documents/FinMark_MachineLearning/customernew.csv', encoding='ISO-8859-1')
df2 = pd.read_csv('/Users/atrabaja/Documents/FinMark_MachineLearning/productnew.csv', encoding='ISO-8859-1')
df3 = pd.read_csv('/Users/atrabaja/Documents/FinMark_MachineLearning/transactionsnew.csv', encoding='ISO-8859-1')

# Step 2: Data Cleaning & Preprocessing

# Handle missing values in Company_Profit
df1['Company_Profit'] = df1['Company_Profit'].fillna(df1['Company_Profit'].median())

# Remove rows with missing Product_ID
df2.dropna(subset=['Product_ID'], inplace=True)

# Convert Product_Price to numeric
df2['Product_Price'] = df2['Product_Price'].str.replace(',', '').astype(float)

# Clean Quantity column in transactions
df3['Quantity'] = pd.to_numeric(df3['Quantity'], errors='coerce')
df3.dropna(subset=['Quantity'], inplace=True)

# Convert Transaction_Date to datetime format
df3['Transaction_Date'] = pd.to_datetime(df3['Transaction_Date'], format='%d-%m-%Y')

# Standardize Company_ID naming across datasets
df1.rename(columns={'Company_ID': 'CompanyID'}, inplace=True)
df3.rename(columns={'Company_ID': 'CompanyID'}, inplace=True)

# Step 3: Feature Engineering

# Total spending per company
customer_spending = df3.groupby('CompanyID')['Total_Cost'].sum().reset_index()
customer_spending.rename(columns={'Total_Cost': 'Total_Spending'}, inplace=True)

# Average spending per transaction
avg_spending = df3.groupby('CompanyID')['Total_Cost'].mean().reset_index()
avg_spending.rename(columns={'Total_Cost': 'Avg_Spending'}, inplace=True)

# Number of transactions per company
transaction_count = df3.groupby('CompanyID')['Transaction_ID'].count().reset_index()
transaction_count.rename(columns={'Transaction_ID': 'Transaction_Count'}, inplace=True)

# Merge features with customer data
df1 = df1.merge(customer_spending, on='CompanyID', how='left')
df1 = df1.merge(transaction_count, on='CompanyID', how='left')
df1 = df1.merge(avg_spending, on='CompanyID', how='left')

# Create target variable Has_Purchased
df1['Has_Purchased'] = df1['Transaction_Count'].apply(lambda x: 1 if x > 0 else 0)

# Step 4: Building the Predictive Model

# Select features and target
features = ['Company_Profit', 'Total_Spending', 'Avg_Spending', 'Transaction_Count']
target = 'Has_Purchased'

X = df1[features].fillna(0)
y = df1[target]

# Split data with stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Step 5: Model Evaluation

# Evaluate using confusion matrix and classification report
labels = [0, 1]  # Ensure both classes are represented
print(confusion_matrix(y_test, y_pred, labels=labels))
print(classification_report(y_test, y_pred, labels=labels))


[[ 0  0]
 [ 0 20]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       0.50      0.50      0.50        20
weighted avg       1.00      1.00      1.00        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
