## Decision Trees

## Set up data

In [120]:
# Load essential libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

# Read csv file
df = pd.read_csv('../data/cleaned_wf_demo.csv', index_col=0)
print(df.head())

          company  seniority job_category  gender ethnicity       count  \
1  The Home Depot          2        empty    male  multiple    1.339059   
2  The Home Depot          2        Sales  female    native   30.866820   
3  The Home Depot          3     Engineer  female     white  229.514520   
4  The Home Depot          1    Scientist  female    native    1.413377   
5  The Home Depot          2        Admin    male    native    2.650784   

     inflow   outflow        salary  
1  0.000470  0.008150  9.310531e+04  
2  0.462686  0.081448  1.952310e+06  
3  2.762419  5.267294  2.229767e+07  
4  0.000212  0.009232  6.196394e+04  
5  0.016046  0.008234  1.946259e+05  


## Reorganize data for Decision Trees

In [121]:
# Convert categorical variables to dummy variables usind OneHotEncoder

# Dummies for job_category
# Get dummies and rename new dummyies' names as job category names
dummies = pd.get_dummies(df['job_category']).rename(columns=lambda x: 'job_category_' + str(x))
df = pd.concat([df, dummies], axis=1) #add back into the dataframe
df.drop(['job_category'], inplace=True, axis = 1) #remove job_category column

# Repeat for gender
dummies = pd.get_dummies(df['gender']).rename(columns=lambda x: 'gender_' + str(x))
df = pd.concat([df, dummies], axis=1) #add back into the dataframe
df.drop(['gender'], inplace=True, axis = 1) #remove gender column

# Repeat for ethnicity
dummies = pd.get_dummies(df['ethnicity']).rename(columns=lambda x: 'ethnicity_' + str(x))
df = pd.concat([df, dummies], axis=1) #add back into the dataframe
df.drop(['ethnicity'], inplace=True, axis = 1) #remove ethnicity column

print(df.columns)
print(df.isnull().sum())

Index(['company', 'seniority', 'count', 'inflow', 'outflow', 'salary',
       'job_category_Admin', 'job_category_Engineer', 'job_category_Finance',
       'job_category_Marketing', 'job_category_Operations',
       'job_category_Sales', 'job_category_Scientist', 'job_category_empty',
       'gender_female', 'gender_male', 'ethnicity_api', 'ethnicity_black',
       'ethnicity_hispanic', 'ethnicity_multiple', 'ethnicity_native',
       'ethnicity_white'],
      dtype='object')
company                       0
seniority                     0
count                         0
inflow                        0
outflow                       0
salary                     2821
job_category_Admin            0
job_category_Engineer         0
job_category_Finance          0
job_category_Marketing        0
job_category_Operations       0
job_category_Sales            0
job_category_Scientist        0
job_category_empty            0
gender_female                 0
gender_male                   0
ethnici

In [116]:
## SET X AND Y

# y: CONVERT FROM STRING LABELS TO INTEGERS 
labels=[]; 
y=[]
for label in df["company"]:
    if label not in labels:
        labels.append(label)
        print("index =",len(labels)-1,": label =",label)
    for i in range(0,len(labels)):
        if(label==labels[i]):
            y.append(i)
y = np.array(y)

# X
X = df.drop(['company'], axis = 1)

# Double check
print(X.shape,y.shape)

index = 0 : label = The Home Depot
index = 1 : label = Databricks
index = 2 : label = Bristol Myers Squibb Co.
index = 3 : label = Accenture Plc
index = 4 : label = Apple
index = 5 : label = The Goldman Sachs Group
(42336, 21) (42336,)


In [117]:
# Load sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Partion data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=4)

# Check 
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(33868, 21)
(33868,)
(8468, 21)
(8468,)


## Support Vector Classifier

In [118]:
# Initialize model

# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Fit model
fit1 = DecisionTreeClassifier(random_state=0)

fit1

In [119]:
# Fit to training data
from sklearn.utils.fixes import sklearn

fit1.fit(X_train, y_train) 

# USE ONE-HOT ENCODER TO CONVERT STRING TO FLOAT
# Predict on X_train
y_train_pred = fit1.predict(X_train)

ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# TEST ACCURACY
# Training set
print("Training set")
print("Accuracy: ", accuracy_score(y_train, y_train_pred) * 100) #accuracy score
print("Number of mislabeled points out of a total 4275 points: ", (y_train != y_train_pred).sum()) #mislabeled points

# Test set
y_test_pred = fit1.predict(X_test)
print("Test set")
print("Accuracy: ", accuracy_score(y_test, y_test_pred)*100) #accuracy score
print("Number of mislabeled points out of a total 1069 points: ", (y_test != y_test_pred).sum()) #mislabeled points

## Visualize data

In [None]:
model_accuracies = pd.DataFrame({'Set':['Training set','Test set'], 'Accuracy (%)': [accuracy_score(y_train, y_train_pred) * 100, accuracy_score(y_test, y_test_pred)*100]})
sns.barplot(data=model_accuracies, x="Set", y="Accuracy (%)").set(title = 'Accuracy of model for training vs test sets' )

# Save plot
plt.savefig("../501-project-website/images/DT_tweets_accuracy.png")

## Confusion matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix

plot_confusion_matrix(fit1 , X_test ,  y_test ,  cmap="Blues")

# Plot confusion matrix
plt.title("Confusion matrix for Decision Tree model")

# Save confusion matrix plot
plt.savefig("../501-project-website/images/DT_tweets_confusion_matrix.png")