## Imports

In [32]:
import pandas as pd
import seaborn as sns
import numpy as np
import shap
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from statsmodels.graphics.regressionplots import plot_partregress

# Client Views

## Load and Clean Data

In [33]:
df = pd.read_csv('client.csv')
#df.head()

In [34]:
#df.shape

In [35]:
cv = df.drop(columns=['profileviewtimestamp', 'candidateId',
                      'profileid', 'firstname', 'lastname', 'school',
                      'company', 'position'])
#cv.head()

## Replace Null Values

In [36]:
cv.isnull().sum()

profilepicture        0
pitchvideo            0
searchstatus          0
gender             2372
race               2407
graduationyear      637
degree               12
monthsatcompany       0
profileviews          0
dtype: int64

In [37]:
cv.dropna(subset=['degree'], inplace=True)
cv['gender'].fillna("Unknown", inplace=True)
cv['race'].fillna("Unknown", inplace=True)
cv.dropna(subset=['graduationyear'], inplace=True)

In [38]:
cv.isnull().sum()

profilepicture     0
pitchvideo         0
searchstatus       0
gender             0
race               0
graduationyear     0
degree             0
monthsatcompany    0
profileviews       0
dtype: int64

In [39]:
#cv.shape

## Declare Independent Variables and Target Variable

In [40]:
X = cv.drop(columns=['profileviews'])
y = cv['profileviews']
#X.head()

## Encode Categorical Variables for the Model

In [41]:
X = pd.get_dummies(X, columns=['gender', 'race', 'degree'])
X = X.drop(columns=['gender_Unknown', 'race_Unknown', 'degree_Other'])
#X.head()

## Check Distribution and Remove Outliers

In [42]:
#sns.displot(cv.graduationyear, rug=True)

In [43]:
#sns.displot(cv.monthsatcompany, rug=True)

In [44]:
cv = cv.loc[cv['graduationyear'] > 1982]
cv = cv.loc[cv['monthsatcompany'] < 250]

## Choose Model Based on Accuracy Score

In [45]:
mse_gb = -cross_val_score(GradientBoostingRegressor(), X, y, cv=100, scoring='neg_root_mean_squared_error')
mse_rf = -cross_val_score(RandomForestRegressor(n_estimators=1000), X, y, cv=100, scoring = 'neg_root_mean_squared_error')

print('Average MSE for GradientBoostingRegressor is {0:.3f}'.format(np.mean(mse_gb)))
print('Average MSE for RandomForestRegressor is {0:.3f}'.format(np.mean(mse_rf)))

Average MSE for GradientBoostingRegressor is 1.182
Average MSE for RandomForestRegressor is 1.308


## Check for Variable Correlations with Scatter Plot

In [46]:
#sns.regplot(X['graduationyear'], y, lowess=True)
#plt.ylabel('Profile Views')
#plt.xlabel('Graduation Year')
#plt.title('Graduation Year vs. Profile Views')
#plt.show()

## Build Model

In [47]:
rf = RandomForestRegressor(n_estimators=1000).fit(X,y)

In [48]:
gb = GradientBoostingRegressor().fit(X,y)

## Check Which Features are Most Important

In [49]:
importance = shap.TreeExplainer(gb)
shap_values = importance.shap_values(X)

In [50]:
#shap.summary_plot(shap_values, X, plot_type='bar')

## Plot Partial Dependence of Numerical Variables

In [51]:
#gy_values = np.linspace(np.min(X['graduationyear']), np.max(X['graduationyear']))

#pdp_gy = []
#for n in gy_values:
#    X_gy = X.copy()
#    X_gy['graduationyear'] = n
#    pdp_gy.append(np.mean(gb.predict(X_gy)))
    
#plt.plot(gy_values, pdp_gy)
#plt.ylabel('Predicted Profile Views')
#plt.xlabel('Graduation Year')
#plt.title('Partial Dependence Plot for Graduation Year vs Profile Views')
#plt.show()

In [52]:
#mc_values = np.linspace(np.min(X['monthsatcompany']), np.max(X['monthsatcompany']))

#pdp_mc = []
#for n in mc_values:
#    X_mc = X.copy()
#    X_mc['monthsatcompany'] = n
#    pdp_mc.append(np.mean(gb.predict(X_mc)))
    
#plt.plot(mc_values, pdp_mc)
#plt.ylabel('Predicted Profile Views')
#plt.xlabel('Months at Highlighted Company')
#plt.title('Partial Dependence Plot for Months at Highlighted Company vs Profile Views')
#plt.show()

## Plot Partial Dependence of Categorical Variables

In [53]:
#ss_values = np.array(X['searchstatus'])
#pdp_ss = []
#for n in ss_values:
#    X_ss = X.copy()
#    X_ss['searchstatus'] = n
#    pdp_ss.append(np.mean(gb.predict(X_ss)))
    
#plt.bar(ss_values, pdp_ss, width=0.5)
#plt.ylabel('Predicted Profile Views')
#plt.xlabel('Search Status')
#plt.title('Partial Dependence Plot for Search Status vs Profile Views')
#plt.show()


In [54]:
#pp_values = np.array(X['profilepicture'])
#pdp_pp = []
#for n in pp_values:
#    X_pp = X.copy()
#    X_pp['profilepicture'] = n
#    pdp_pp.append(np.mean(gb.predict(X_pp)))
    
#plt.bar(pp_values, pdp_pp, width=0.5)
#plt.ylabel('Predicted Profile Views')
#plt.xlabel('Profile Picture')
#plt.title('Partial Dependence Plot for Profile Picture vs Profile Views')
#plt.show()


In [55]:
#pv_values = np.array(X['pitchvideo'])
#pdp_pv = []
#for n in pv_values:
#    X_pv = X.copy()
#    X_pv['profilepicture'] = n
#    pdp_pv.append(np.mean(gb.predict(X_pv)))
    
#plt.bar(pv_values, pdp_pv, width=0.5)
#plt.ylabel('Predicted Profile Views')
#plt.xlabel('Pitch Video')
#plt.title('Partial Dependence Plot for Pitch Video vs Profile Views')
#plt.show()