<a href="https://colab.research.google.com/github/aleenamilburn/ScrumShank-Redemption/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the dataset
df = pd.read_csv('/content/MarchData.xlsx - cleaned.csv')

# Feature Engineering
# 1. Approval Delay: Difference between Requisition Submitted Date and Requisition Approved Date
df['Req Submitted'] = pd.to_datetime(df['Requisition.Submitted.Date'])
df['Req Approved'] = pd.to_datetime(df['Requisition.Approved.Date'])
df['Approval Delay'] = (df['Req Approved'] - df['Req Submitted']).dt.days

# 2. Cost Overrun: Line Total Change (assuming 'Line.Total.Change' is already in the dataset)
df['Cost Overrun'] = df['Line.Total.Change']

# 3. Fulfillment Delay: Difference between Requisition Approved Date and Ordered Date
df['Ordered Date'] = pd.to_datetime(df['Ordered.Date'])
df['Fulfillment Delay'] = (df['Ordered Date'] - df['Req Approved']).dt.days

# 4. SWAM Compliance Violations: Count of SWAM variables that are FALSE (0)
swam_columns = ['SWAM.Minority', 'SWAM.Woman', 'SWAM.Small', 'SWAM.Micro.Business']
df['SWAM Compliance Violations'] = df[swam_columns].apply(lambda row: sum(row == 0), axis=1)

# Define the target variable: Supplier Risk (assuming we have a target column 'Supplier Risk')
# If not, you can create a target variable based on some criteria, e.g., if Approval Delay > 10 days, Cost Overrun > 0, etc.
# For this example, let's assume 'Supplier Risk' is already in the dataset.
# If not, you can create it like this:
df['Supplier Risk'] = np.where((df['Approval Delay'] > 10) | (df['Cost Overrun'] > 0) | (df['Fulfillment Delay'] > 5) | (df['SWAM Compliance Violations'] > 0), 1, 0)

# Select features and target
features = ['Approval Delay', 'Cost Overrun', 'Fulfillment Delay', 'SWAM Compliance Violations']
X = df[features]
y = df['Supplier Risk']

# Encode categorical variables if any (not needed in this case as all features are numerical)
# label_encoder = LabelEncoder()
# for col in X.select_dtypes(include=['object']).columns:
#     X[col] = label_encoder.fit_transform(X[col])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Feature Importance
print("\nFeature Importances:")
for feature, importance in zip(features, clf.feature_importances_):
    print(f"{feature}: {importance:.4f}")


Accuracy: 0.9998666666666667

Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      1.00      1.00      7489

    accuracy                           1.00      7500
   macro avg       0.96      1.00      0.98      7500
weighted avg       1.00      1.00      1.00      7500


Feature Importances:
Approval Delay: 0.0000
Cost Overrun: 0.0000
Fulfillment Delay: 0.0000
SWAM Compliance Violations: 1.0000
