In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
import tensorflow as tf

# Import our input dataset
test_df = pd.read_csv('project_cleaned_input.csv')
test_df.head()

Unnamed: 0,project_id,service_line,team,project_manager,overbudget_yes,budget,building_region,nda,total_hours,emp_count,month_duration
0,c0a1725b-89f9-e811-a966-000d3a37839b,Commissioning,SB,51,0,21400.0,SF,False,114.0,38.0,9.199368
1,cffdaaf0-a896-e811-a957-000d3a378ca2,Commissioning,SB,51,0,31865.0,SF,False,162.75,108.0,8.640835
2,b3ed8234-2775-e611-80ea-5065f38b4491,Commissioning,SB,51,0,36610.0,SF,False,84.5,60.0,10.743547
3,e4344949-aee8-e611-80f9-5065f38b4491,Commissioning,SB,51,1,8170.0,SF,False,54.5,35.0,6.899526
4,dda37c80-7e6e-e411-8b85-6c3be5a6aa7c,Commissioning,SB,51,0,20185.0,SF,False,148.0,94.0,27.400973


In [3]:
test_df.dtypes

project_id          object
service_line        object
team                object
project_manager      int64
overbudget_yes       int64
budget             float64
building_region     object
nda                   bool
total_hours        float64
emp_count          float64
month_duration     float64
dtype: object

In [4]:

#df_ages['age_bins'] = pd.cut(x=df_ages['age'], bins=[20, 29, 39, 49])

test_df['budget_bins'] = pd.cut(x=test_df['budget'],bins=[4999,9999,14999,19999,24999,29999,34999,39999,44999,49999,59999,69999,79999,89999,99999,500000])
# dropna
dropna_df = test_df.dropna()

dropna_df=dropna_df.drop(columns=['nda'])

dropna_df.head()

Unnamed: 0,project_id,service_line,team,project_manager,overbudget_yes,budget,building_region,total_hours,emp_count,month_duration,budget_bins
0,c0a1725b-89f9-e811-a966-000d3a37839b,Commissioning,SB,51,0,21400.0,SF,114.0,38.0,9.199368,"(19999, 24999]"
1,cffdaaf0-a896-e811-a957-000d3a378ca2,Commissioning,SB,51,0,31865.0,SF,162.75,108.0,8.640835,"(29999, 34999]"
2,b3ed8234-2775-e611-80ea-5065f38b4491,Commissioning,SB,51,0,36610.0,SF,84.5,60.0,10.743547,"(34999, 39999]"
3,e4344949-aee8-e611-80f9-5065f38b4491,Commissioning,SB,51,1,8170.0,SF,54.5,35.0,6.899526,"(4999, 9999]"
4,dda37c80-7e6e-e411-8b85-6c3be5a6aa7c,Commissioning,SB,51,0,20185.0,SF,148.0,94.0,27.400973,"(19999, 24999]"


In [5]:
dropna_df.shape

(616, 11)

In [6]:
#test_list = ["project_manager","service_line", "team", "building_region","nda","budget_bins"]
test_list = ["project_manager","service_line", "team", "building_region","budget_bins"]

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(dropna_df[test_list]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(test_list)
encode_df.head()

Unnamed: 0,project_manager_21,project_manager_24,project_manager_25,project_manager_26,project_manager_27,project_manager_37,project_manager_39,project_manager_40,project_manager_42,project_manager_43,...,"budget_bins_(29999, 34999]","budget_bins_(34999, 39999]","budget_bins_(39999, 44999]","budget_bins_(44999, 49999]","budget_bins_(49999, 59999]","budget_bins_(59999, 69999]","budget_bins_(69999, 79999]","budget_bins_(79999, 89999]","budget_bins_(89999, 99999]","budget_bins_(99999, 500000]"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Merge one-hot encoded features and drop the originals
merge_df = dropna_df.merge(encode_df,left_index=True, right_index=True)
merge_df = merge_df.drop(test_list,1)

merge_df=merge_df.drop(columns=['project_id','budget'])


merge_df.head()

Unnamed: 0,overbudget_yes,total_hours,emp_count,month_duration,project_manager_21,project_manager_24,project_manager_25,project_manager_26,project_manager_27,project_manager_37,...,"budget_bins_(29999, 34999]","budget_bins_(34999, 39999]","budget_bins_(39999, 44999]","budget_bins_(44999, 49999]","budget_bins_(49999, 59999]","budget_bins_(59999, 69999]","budget_bins_(69999, 79999]","budget_bins_(79999, 89999]","budget_bins_(89999, 99999]","budget_bins_(99999, 500000]"
0,0,114.0,38.0,9.199368,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,162.75,108.0,8.640835,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,84.5,60.0,10.743547,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,54.5,35.0,6.899526,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,148.0,94.0,27.400973,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
merge_df.shape


(484, 84)

In [9]:
#ml model
# Remove loan status target from features data
y = merge_df.overbudget_yes.values
X = merge_df.drop(columns=["overbudget_yes"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# SVM

In [10]:
# Create the SVM model
svm = SVC(kernel='linear')

In [11]:
# Train the model
svm.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [12]:
# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.554


# Logistic regression

In [13]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train_scaled,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.479


# Random Forest

In [14]:
# Create a random forest classifier.
    # Typically, 128 estimators is the largest number of estimators we would want to use in a model
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.612


# Cross validation

In [15]:
from sklearn.model_selection import cross_validate

In [16]:
svm_cross = SVC(kernel='linear')

cv_results = cross_validate(svm_cross, X_train_scaled, y_train, cv=5)

In [17]:
sorted(cv_results.keys())

['fit_time', 'score_time', 'test_score']

In [18]:
cv_results['test_score']

array([0.50684932, 0.61643836, 0.56164384, 0.58333333, 0.51388889])

# Visualization

In [19]:
import pickle

In [20]:
Pkl_Filename = "Pickle_RL_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(rf_model, file)

In [21]:
# Load the Model back from file
with open(Pkl_Filename, 'rb') as file:  
    pickled_rf_model = pickle.load(file)

pickled_rf_model

RandomForestClassifier(n_estimators=128, random_state=78)