In [17]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
import tensorflow as tf

# Import our input dataset
test_df = pd.read_csv('project_cleaned_input.csv')
test_df.head()

Unnamed: 0,project_id,service_line,team,project_manager,overbudget_yes,budget,building_region,nda,total_hours,emp_count,month_duration
0,c0a1725b-89f9-e811-a966-000d3a37839b,Commissioning,SB,51,0,21400.0,SF,False,114.0,38.0,9.199368
1,cffdaaf0-a896-e811-a957-000d3a378ca2,Commissioning,SB,51,0,31865.0,SF,False,162.75,108.0,8.640835
2,b3ed8234-2775-e611-80ea-5065f38b4491,Commissioning,SB,51,0,36610.0,SF,False,84.5,60.0,10.743547
3,e4344949-aee8-e611-80f9-5065f38b4491,Commissioning,SB,51,1,8170.0,SF,False,54.5,35.0,6.899526
4,dda37c80-7e6e-e411-8b85-6c3be5a6aa7c,Commissioning,SB,51,0,20185.0,SF,False,148.0,94.0,27.400973


In [18]:
test_df.dtypes

project_id          object
service_line        object
team                object
project_manager      int64
overbudget_yes       int64
budget             float64
building_region     object
nda                   bool
total_hours        float64
emp_count          float64
month_duration     float64
dtype: object

In [19]:
# dropna
dropna_df = test_df.dropna()
dropna_df

Unnamed: 0,project_id,service_line,team,project_manager,overbudget_yes,budget,building_region,nda,total_hours,emp_count,month_duration
0,c0a1725b-89f9-e811-a966-000d3a37839b,Commissioning,SB,51,0,21400.0,SF,False,114.00,38.0,9.199368
1,cffdaaf0-a896-e811-a957-000d3a378ca2,Commissioning,SB,51,0,31865.0,SF,False,162.75,108.0,8.640835
2,b3ed8234-2775-e611-80ea-5065f38b4491,Commissioning,SB,51,0,36610.0,SF,False,84.50,60.0,10.743547
3,e4344949-aee8-e611-80f9-5065f38b4491,Commissioning,SB,51,1,8170.0,SF,False,54.50,35.0,6.899526
4,dda37c80-7e6e-e411-8b85-6c3be5a6aa7c,Commissioning,SB,51,0,20185.0,SF,False,148.00,94.0,27.400973
...,...,...,...,...,...,...,...,...,...,...,...
745,1a2d4656-632c-e711-8105-e0071b715b91,LEED,SB,58,1,34000.0,SF,True,248.25,138.0,16.493152
746,0476966e-6b62-e711-811c-e0071b7458a1,LEED,SB,67,0,42841.5,SF,True,276.75,162.0,18.300170
747,69f606ee-d3e5-e711-812d-e0071b7458a1,LEED,SB,44,1,32436.0,DV,True,233.50,174.0,23.589807
748,d072b6dd-5e01-e811-812f-e0071b7458a1,LEED,SB,84,0,39000.0,SF,True,198.00,108.0,16.887410


In [23]:
test_list = ["project_manager","service_line", "team", "building_region","nda"]

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(dropna_df[test_list]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(test_list)
encode_df.head()

Unnamed: 0,project_manager_9,project_manager_21,project_manager_24,project_manager_25,project_manager_26,project_manager_27,project_manager_33,project_manager_37,project_manager_39,project_manager_40,...,service_line_WELL,team_EN,team_OP,team_PM,team_SB,building_region_DV,building_region_SD,building_region_SF,nda_False,nda_True
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [28]:
# Merge one-hot encoded features and drop the originals
merge_df = dropna_df.merge(encode_df,left_index=True, right_index=True)
merge_df = merge_df.drop(test_list,1)

merge_df=merge_df.drop(columns=['project_id'])


merge_df.head()

Unnamed: 0,overbudget_yes,budget,total_hours,emp_count,month_duration,project_manager_9,project_manager_21,project_manager_24,project_manager_25,project_manager_26,...,service_line_WELL,team_EN,team_OP,team_PM,team_SB,building_region_DV,building_region_SD,building_region_SF,nda_False,nda_True
0,0,21400.0,114.0,38.0,9.199368,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0,31865.0,162.75,108.0,8.640835,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,0,36610.0,84.5,60.0,10.743547,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,1,8170.0,54.5,35.0,6.899526,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0,20185.0,148.0,94.0,27.400973,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [30]:
merge_df.shape


(724, 74)

In [31]:
# Remove loan status target from features data
y = merge_df.overbudget_yes.values
X = merge_df.drop(columns=["overbudget_yes"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# SVM

In [32]:
# Create the SVM model
svm = SVC(kernel='linear')

In [33]:
# Train the model
svm.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [34]:
# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.713


# Logistic regression

In [35]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train_scaled,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.486


# Random Forest

In [36]:
# Create a random forest classifier.
    # Typically, 128 estimators is the largest number of estimators we would want to use in a model
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.652


# Cross validation

In [37]:
from sklearn.model_selection import cross_validate

In [38]:
svm_cross = SVC(kernel='linear')

cv_results = cross_validate(svm_cross, X_train_scaled, y_train, cv=5)

In [39]:
sorted(cv_results.keys())

['fit_time', 'score_time', 'test_score']

In [40]:
cv_results['test_score']

array([0.71559633, 0.59633028, 0.60550459, 0.68518519, 0.66666667])