In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [24]:
# Only these columns are parsed from the CSV.
cred_score_columns = [
    'full_name', 'slug', 'job_title', 'brand_name',
    'is_c_level', 'is_current',
    'growth_score', 'sudden_growth_score', 'sudden_growth_delta',
    'acceleration_score', 'success_score',
    'is_physical_products_brand', 'is_brickandmortar', 'generic_reseller',
    'store_is_subdomain',
    'cred_score_weighted_mean_mean', 'cred_score_weighted_mean_max',
    'num_brand_count', 'position_weight', 'experience_w_brand_ratio',
    'cred_score_weighted_median',
]

# Because `chunksize` is given, this is a reader that yields the file in
# pieces of up to 500 rows, not a DataFrame.
df_chunk = pd.read_csv(
    'df_mean_cred_score.csv',
    usecols=cred_score_columns,
    chunksize=500,
)

In [25]:
%%time
# The CSV was opened with `chunksize`, so `df_chunk` yields one DataFrame per
# chunk. Collect them all (any per-chunk filtering would go inside the
# comprehension) and stitch them into a single frame.
chunk_list = [chunk for chunk in df_chunk]

df = pd.concat(chunk_list)

# The individual pieces are no longer needed once `df` exists.
del chunk_list

CPU times: user 21.5 s, sys: 368 ms, total: 21.9 s
Wall time: 23.7 s


In [26]:
# The loyalty column: 1.0 when num_brand_count is below 2, 0.0 when it is 2 or
# more. Rows with a missing num_brand_count match neither comparison and are
# therefore left as NaN.
brand_counts = df['num_brand_count']
df['num_brand_association'] = (
    brand_counts.lt(2).astype(float).where(brand_counts.notna())
)

In [9]:
%%time
# Baseline model: an ordinary-least-squares fit with statsmodels, used to read
# off the R^2 of a plain linear regression on this dataset.
import statsmodels.api as sm

# experience_w_brand_ratio is left out on purpose: it looks like an internal
# factor that only exists to calculate the score.
predictors = [
    'is_current', 'is_c_level', 'is_physical_products_brand',
    'is_brickandmortar', 'generic_reseller', 'store_is_subdomain',
    'num_brand_association', 'cred_score_weighted_median',
]
X = df[predictors]
y = df['growth_score']

# OLS does not add an intercept by itself, hence the explicit constant column.
X_withconstant = sm.add_constant(X)

# Instantiate and fit in one go; `lm_cs_1` ends up holding the fitted results.
lm_cs_1 = sm.OLS(y.astype(float), X_withconstant.astype(float)).fit()

lm_cs_1.summary()

  return getattr(obj, method)(*args, **kwds)


CPU times: user 1.35 s, sys: 1.21 s, total: 2.57 s
Wall time: 2.36 s


In [10]:
# Display the regression summary table of the fitted OLS results object.
lm_cs_1.summary()

0,1,2,3
Dep. Variable:,growth_score,R-squared:,0.225
Model:,OLS,Adj. R-squared:,0.225
Method:,Least Squares,F-statistic:,54970.0
Date:,"Tue, 09 Jun 2020",Prob (F-statistic):,0.0
Time:,20:50:39,Log-Likelihood:,-6915400.0
No. Observations:,1511854,AIC:,13830000.0
Df Residuals:,1511845,BIC:,13830000.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,28.6825,0.083,347.324,0.000,28.521,28.844
is_current,-8.1794,0.060,-135.701,0.000,-8.298,-8.061
is_c_level,-7.7381,0.050,-153.466,0.000,-7.837,-7.639
is_physical_products_brand,8.3676,0.043,195.876,0.000,8.284,8.451
is_brickandmortar,0.8500,0.039,21.754,0.000,0.773,0.927
generic_reseller,-8.2630,0.331,-24.944,0.000,-8.912,-7.614
store_is_subdomain,-12.5872,0.075,-166.803,0.000,-12.735,-12.439
num_brand_association,-11.5548,0.067,-172.418,0.000,-11.686,-11.423
cred_score_weighted_median,0.2010,0.000,531.272,0.000,0.200,0.202

0,1,2,3
Omnibus:,121783.58,Durbin-Watson:,0.547
Prob(Omnibus):,0.0,Jarque-Bera (JB):,156300.828
Skew:,0.726,Prob(JB):,0.0
Kurtosis:,3.611,Cond. No.,925.0


In [14]:
%%time
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
#from tempfile import mkdtemp
#cachedir = mkdtemp()

# Hold out 40% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

CPU times: user 1.73 s, sys: 163 ms, total: 1.9 s
Wall time: 1.94 s


In [15]:
%%time
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into
# training. Note that X_train / X_test are rebound to the scaled arrays.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Random forest considering 4 features per split, with trees capped at depth
# 100. random_state pins the bootstrap sampling and per-split feature
# sub-sampling so the RMSE, R^2 and feature importances reported in the
# following cells are reproducible across kernel restarts (every other
# stochastic step in this notebook is already seeded).
ran_reg_2 = RandomForestRegressor(max_features=4, max_depth=100, random_state=0)
ran_reg_2.fit(X_train, y_train)



CPU times: user 1min 15s, sys: 407 ms, total: 1min 15s
Wall time: 1min 23s


In [16]:
# Predict on the test features with the fitted random forest, then display
# the root-mean-squared error of those predictions against y_test.
pred_2 = ran_reg_2.predict(X_test)
np.sqrt(mean_squared_error(y_test, pred_2))

20.267776372975405

In [17]:
# Display the R^2 of the random-forest predictions on the test split.
r2_score(y_test, pred_2)

0.4209100222766128

In [12]:
def get_cv_scores(model, X=None, y=None, cv=5, scoring='r2'):
    """Cross-validate ``model`` and print the mean and spread of its scores.

    Parameters
    ----------
    model
        Estimator handed to ``cross_val_score`` unchanged.
    X, y
        Features and target to cross-validate on. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are looked up at call time,
        which is what this helper always did. Pass them explicitly to avoid
        depending on whichever split/scaler cell happened to run last.
    cv
        Forwarded to ``cross_val_score`` as ``cv`` (default 5).
    scoring
        Forwarded to ``cross_val_score`` as ``scoring`` (default ``'r2'``).

    Returns
    -------
    None
        The results are only printed.
    """
    # Fall back to the globals only when the caller did not supply data.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    scores = cross_val_score(model,
                             X,
                             y,
                             cv=cv,
                             scoring=scoring)

    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

In [18]:
# Random-forest feature importances expressed as percentages, labelled with
# the column names of X and listed from least to most important.
feature_importance_rf = 100 * ran_reg_2.feature_importances_
rel_imp = pd.Series(data=feature_importance_rf, index=X.columns).sort_values()
print(rel_imp)

generic_reseller               0.024729
is_brickandmortar              0.242635
num_brand_association          0.935436
is_c_level                     1.285323
is_current                     1.580444
is_physical_products_brand     2.655380
store_is_subdomain             2.993113
cred_score_weighted_median    90.282942
dtype: float64


In [19]:
# Gradient-boosting regressor on the same data.
# Start again from the unscaled X / y: a 60/40 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

# Standardise: learn mean/variance on the training split, apply to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

regrboost_1 = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=.01,
    max_depth=10,
    random_state=1,
)
regrboost_1.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=10,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             n_iter_no_change=None, presort='auto', random_state=1,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [20]:
# Predict on the test features with the fitted gradient-boosting model, then
# display the root-mean-squared error of those predictions against y_test.
pred_2_boost = regrboost_1.predict(X_test)
np.sqrt(mean_squared_error(y_test, pred_2_boost))

19.04707974890293

In [21]:
# Display the R^2 of the gradient-boosting predictions on the test split.
r2_score(y_test, pred_2_boost)

0.488564769420484

In [27]:
# Rebuild the feature matrix and target from `df` so the next model starts
# from unscaled data.
X = df.loc[:, [
    'is_current',
    'is_c_level',
    'is_physical_products_brand',
    'is_brickandmortar',
    'generic_reseller',
    'store_is_subdomain',
    'num_brand_association',
    'cred_score_weighted_median',
]]
y = df.loc[:, 'growth_score']

In [28]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state=101 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)

In [29]:
# Scaling: min-max scale the features. The per-feature minimum and maximum
# are learned from the training split only and then applied to both splits.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
#Shape
# Sanity check: (rows, columns) of the train and test feature arrays.
X_train.shape , X_test.shape

((1058297, 8), (453557, 8))

# Creating an artificial neural network model for regression

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

ModuleNotFoundError: No module named 'tensorflow'