In [26]:
import os
import sys

# Put the parent of the current working directory on the import path.
# presumably this is the repository root that holds the `datazoo` package
# imported further down — TODO confirm.
module_path = os.path.abspath('..')
# Guard against appending the same entry again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

In [27]:
import nltk

# Fetch the NLTK 'stopwords' corpus into the local nltk_data directory.
# The call's return value is displayed as this cell's output.
# presumably the datazoo featurization code needs these stopwords — TODO confirm.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenyanhui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
import pandas as pd
import joblib

from datazoo.model import ZooRandomForest
from datazoo.featurize import ZooFeaturizer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prepare Data

In [29]:
# Column-type labels in id order: a label's position in this tuple IS its
# integer class id, so append new labels at the end and never reorder.
_label_names = (
    'numeric',
    'categorical',
    'datetime',
    'sentence',
    'url',
    'embedded-number',
    'list',
    'not-generalizable',
    'context-specific',
)
dict_label = {label: class_id for class_id, label in enumerate(_label_names)}

# Load the training table and shuffle its rows with a fixed seed so that the
# row order is reproducible across runs.
x_train = (
    pd.read_csv('../data/ml/data_train.csv')
    .sample(frac=1, random_state=100)
    .reset_index(drop=True)
)
x_test = pd.read_csv('../data/ml/data_test.csv')
print(len(x_train))

# Targets: one-column frames holding the integer id of each `y_act` label.
# `dict_label.__getitem__` keeps the lookup strict: a label missing from
# `dict_label` raises KeyError instead of silently turning into NaN.
y_train = x_train['y_act'].map(dict_label.__getitem__).to_frame()
y_test = x_test['y_act'].map(dict_label.__getitem__).to_frame()

7936


# Train and Test Model

In [30]:
# Create the project's random-forest model wrapper (constructed with no arguments).
model = ZooRandomForest()

In [31]:
# Train on the shuffled training table and its integer-encoded targets.
# In the recorded run the logs show the vectorizer being fitted here, followed by
# a 5-fold search over 16 hyper-parameter candidates.
# NOTE(review): x_train still contains the `y_act` target column — confirm that
# ZooRandomForest's feature extraction ignores it.
model.fit(x_train, y_train)

[W 220321 21:52:39 base:44] vectorizer not fitted yet
[I 220321 21:52:39 base:28] vectorizer not fitted. Doing fit and transform
[I 220321 21:52:40 base:36] total length of from feature extraction: 7936


Fitting 5 folds for each of 16 candidates, totalling 80 fits


[I 220321 21:53:31 random_forest:28] best params: {'bootstrap': True, 'max_depth': 250, 'n_estimators': 500}


In [32]:
# Evaluate the fitted model on the held-out test split; the returned score is
# displayed as the cell output.
# NOTE(review): `training=True` is passed while scoring the *test* set — confirm
# what this flag means in ZooRandomForest.score.
model.score(x_test, y_test, training=True)

[I 220321 21:53:31 base:25] vectorizer already fitted. Doing transform
[I 220321 21:53:31 base:36] total length of from feature extraction: 1985


0.9355163727959698

In [33]:
# Persist the fitted model to disk; joblib.dump returns the list of files written,
# which is displayed as the cell output.
joblib.dump(model, '../artifacts/model_rf.joblib')

['../artifacts/model_rf.joblib']

# Test Model with Featurizer on Raw Input Data Files

In [34]:
# Build a featurizer around the model's own vectorizer.
# presumably this makes raw files get transformed with the same vectorizer that
# was fitted during model.fit — TODO confirm against ZooFeaturizer.
fe = ZooFeaturizer(model.vectorizer)

In [35]:
# Persist the featurizer next to the model so both can be reloaded together.
joblib.dump(fe, '../artifacts/featurizer.joblib')

['../artifacts/featurizer.joblib']

In [36]:
# Raw (un-featurized) dataset whose column types will be predicted.
# Alternative input paths that can be swapped in to classify another dataset:
#   '../data/datasets/accident.csv'
#   '../data/datasets/bbc.csv'
raw_data_path = '../data/datasets/boxing.csv'
raw_data = pd.read_csv(raw_data_path)
raw_data

Unnamed: 0,Judge,Official,Round,Winner
0,G._Hamada,Yes,1,de_la_Hoya
1,G._Hamada,Yes,2,Trinidad
2,G._Hamada,Yes,3,de_la_Hoya
3,G._Hamada,Yes,4,Trinidad
4,G._Hamada,Yes,5,de_la_Hoya
...,...,...,...,...
127,van_de_Wiele,No,8,de_la_Hoya
128,van_de_Wiele,No,9,Trinidad
129,van_de_Wiele,No,10,Trinidad
130,van_de_Wiele,No,11,Trinidad


In [37]:
# Convert the raw table into the feature frame that is passed to model.predict below.
# In the recorded run this yields one feature row per column of `raw_data`
# (4 rows for the 4 columns).
data_featurized = fe.featurize(raw_data)
data_featurized

[I 220321 21:54:35 base:25] vectorizer already fitted. Doing transform
[I 220321 21:54:35 base:36] total length of from feature extraction: 4


Unnamed: 0,total_vals,num_nans,%_nans,num_of_dist_val,%_dist_val,mean,std_dev,min_val,max_val,has_delimiters,...,1443,1444,1445,1446,1447,1448,1449,1450,1451,1452
0,132,0,8.333333,11,8.333333,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
1,132,0,1.515152,2,1.515152,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
2,132,0,9.090909,12,9.090909,6.5,3.452053,1.0,12.0,False,...,0,0,0,0,0,0,0,0,0,0
3,132,0,1.515152,2,1.515152,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Predict a class for every feature row and convert the result to a plain Python
# list (integer class ids in the recorded run).
y_rf = model.predict(data_featurized).tolist()
y_rf



[1, 1, 0, 1]

In [39]:
# Inverse of `dict_label` (defined in the data-preparation cell): integer class
# id -> human-readable label. It is derived rather than typed out a second time
# so that the two mappings cannot drift apart when a label is added or renamed.
dict_int_label = {class_id: label for label, class_id in dict_label.items()}

# `zip` silently stops at the shorter input, which would hide a mismatch between
# the raw columns and the predictions. Check that there is exactly one prediction
# per raw column before pairing them up.
assert len(raw_data.columns) == len(y_rf), (
    f'expected one prediction per column, got {len(y_rf)} predictions '
    f'for {len(raw_data.columns)} columns'
)

# Report the predicted type of every column of the raw dataset.
for name, int_label in zip(raw_data.columns, y_rf):
    print(f'Column:  {name}\nType: {dict_int_label[int_label]}\n')

Column:  Judge
Type: categorical

Column:  Official
Type: categorical

Column:  Round
Type: numeric

Column:  Winner
Type: categorical

[CV] END ......bootstrap=True, max_depth=10, n_estimators=25; total time=   1.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=   1.7s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=25; total time=   1.4s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=50; total time=   2.0s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=100; total time=   3.2s
[CV] END .....bootstrap=True, max_depth=100, n_estimators=25; total time=   1.4s
[CV] END .....bootstrap=True, max_depth=100, n_estimators=50; total time=   2.1s
[CV] END ....bootstrap=True, max_depth=100, n_estimators=100; total time=   3.5s
[CV] END ....bootstrap=True, max_depth=100, n_estimators=500; total time=  13.6s
[CV] END ......bootstrap=True, max_depth=10, n_estimators=50; total time=   1.4s
[CV] END .....bootstrap=True, max_depth=10, n_estimato



[CV] END ......bootstrap=True, max_depth=10, n_estimators=25; total time=   1.1s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=   1.6s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=25; total time=   1.3s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=50; total time=   2.1s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=100; total time=   3.3s
[CV] END .....bootstrap=True, max_depth=100, n_estimators=25; total time=   1.5s
[CV] END .....bootstrap=True, max_depth=100, n_estimators=50; total time=   2.1s
[CV] END ....bootstrap=True, max_depth=100, n_estimators=100; total time=   3.4s
[CV] END ....bootstrap=True, max_depth=100, n_estimators=500; total time=  14.0s
[CV] END ......bootstrap=True, max_depth=10, n_estimators=25; total time=   1.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=   1.6s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=25; total time=   1.4s
[CV] END ......bootstrap=Tru



[CV] END ......bootstrap=True, max_depth=10, n_estimators=50; total time=   1.3s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=500; total time=   4.8s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=100; total time=   3.4s
[CV] END .....bootstrap=True, max_depth=100, n_estimators=25; total time=   1.6s
[CV] END .....bootstrap=True, max_depth=100, n_estimators=50; total time=   2.2s
[CV] END ....bootstrap=True, max_depth=100, n_estimators=100; total time=   3.5s
[CV] END ....bootstrap=True, max_depth=100, n_estimators=500; total time=  13.8s




[CV] END ......bootstrap=True, max_depth=10, n_estimators=25; total time=   1.1s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=   1.6s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=25; total time=   1.4s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=50; total time=   2.1s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=500; total time=  13.3s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=25; total time=   1.5s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=50; total time=   2.1s
[CV] END ....bootstrap=True, max_depth=250, n_estimators=100; total time=   3.3s
[CV] END ....bootstrap=True, max_depth=250, n_estimators=500; total time=  11.9s
[CV] END ......bootstrap=True, max_depth=10, n_estimators=50; total time=   1.4s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=500; total time=   4.8s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=500; total time=  13.4s
[CV] END .....bootstrap=True



[CV] END ......bootstrap=True, max_depth=10, n_estimators=25; total time=   1.1s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=   1.8s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=25; total time=   1.5s
[CV] END ......bootstrap=True, max_depth=50, n_estimators=50; total time=   2.1s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=500; total time=  13.3s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=25; total time=   1.4s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=50; total time=   2.2s
[CV] END ....bootstrap=True, max_depth=250, n_estimators=100; total time=   3.4s
[CV] END ....bootstrap=True, max_depth=250, n_estimators=500; total time=  12.0s




[CV] END ......bootstrap=True, max_depth=10, n_estimators=50; total time=   1.4s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=500; total time=   5.0s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=500; total time=  13.4s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=25; total time=   1.5s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=50; total time=   2.3s
[CV] END ....bootstrap=True, max_depth=250, n_estimators=100; total time=   3.6s
[CV] END ....bootstrap=True, max_depth=250, n_estimators=500; total time=  11.7s
[CV] END ......bootstrap=True, max_depth=10, n_estimators=50; total time=   1.4s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=500; total time=   4.9s
[CV] END .....bootstrap=True, max_depth=50, n_estimators=500; total time=  13.5s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=25; total time=   1.5s
[CV] END .....bootstrap=True, max_depth=250, n_estimators=50; total time=   2.4s
[CV] END ....bootstrap=True,



In [45]:
# Show the first 25 feature names. In the recorded output these are the named
# descriptive-statistic features; the later columns carry integer-like names.
# presumably those are the vectorizer's features — TODO confirm.
data_featurized.columns[:25]

Index(['total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val',
       'mean', 'std_dev', 'min_val', 'max_val', 'has_delimiters', 'has_url',
       'has_email', 'has_date', 'mean_word_count', 'stdev_word_count',
       'mean_stopword_total', 'stdev_stopword_total', 'mean_char_count',
       'stdev_char_count', 'mean_whitespace_count', 'stdev_whitespace_count',
       'mean_delim_count', 'stdev_delim_count', 'is_list', 'is_long_sentence'],
      dtype='object')

In [50]:
# Take the feature row at position 1 and keep only its first 25 entries.
# In the recorded run this row corresponds to the 'Official' column of raw_data.
df2 = data_featurized.iloc[1][:25]
df2 

total_vals                     132
num_nans                         0
%_nans                    1.515152
num_of_dist_val                  2
%_dist_val                1.515152
mean                           0.0
std_dev                        0.0
min_val                        0.0
max_val                        0.0
has_delimiters               False
has_url                      False
has_email                    False
has_date                     False
mean_word_count                1.0
stdev_word_count               0.0
mean_stopword_total            0.0
stdev_stopword_total           0.0
mean_char_count                2.6
stdev_char_count          0.489898
mean_whitespace_count          0.0
stdev_whitespace_count         0.0
mean_delim_count               0.0
stdev_delim_count              0.0
is_list                      False
is_long_sentence             False
Name: 1, dtype: object

In [51]:
# Show the same feature row as a plain {feature name: value} dictionary.
df2.to_dict()

{'total_vals': 132,
 'num_nans': 0,
 '%_nans': 1.5151515151515151,
 'num_of_dist_val': 2,
 '%_dist_val': 1.5151515151515151,
 'mean': 0.0,
 'std_dev': 0.0,
 'min_val': 0.0,
 'max_val': 0.0,
 'has_delimiters': False,
 'has_url': False,
 'has_email': False,
 'has_date': False,
 'mean_word_count': 1.0,
 'stdev_word_count': 0.0,
 'mean_stopword_total': 0.0,
 'stdev_stopword_total': 0.0,
 'mean_char_count': 2.6,
 'stdev_char_count': 0.4898979485566356,
 'mean_whitespace_count': 0.0,
 'stdev_whitespace_count': 0.0,
 'mean_delim_count': 0.0,
 'stdev_delim_count': 0.0,
 'is_list': False,
 'is_long_sentence': False}