In [1]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably so the local `datazoo` package imported later resolves -- TODO confirm).
# The membership guard keeps re-running this cell from adding duplicates.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import nltk

# Make sure the NLTK 'stopwords' package is available locally. The call is
# the cell's last expression, so its return value (True in the recorded run)
# is displayed. Presumably required by the datazoo featurizer -- TODO confirm.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenyanhui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd

from datazoo.model import ZooRandomForest
from datazoo.featurize import ZooFeaturizer

%load_ext autoreload
%autoreload 2

# Prepare Data

In [4]:
# Integer class id for every label that may appear in the `y_act` column.
dict_label = {
    'numeric': 0,
    'categorical': 1,
    'datetime': 2,
    'sentence': 3,
    'url': 4,
    'embedded-number': 5,
    'list': 6,
    'not-generalizable': 7,
    'context-specific': 8
}


def encode_labels(frame):
    """Return a one-column DataFrame ('y_act') of integer class ids.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing a `y_act` column of label names.

    Returns
    -------
    pandas.DataFrame
        Same index as `frame`, single column `y_act` holding the ids
        looked up in `dict_label`.

    Raises
    ------
    KeyError
        If any `y_act` value (including a missing value) has no entry in
        `dict_label`. `Series.map` alone would turn such values into NaN,
        so they are rejected explicitly.
    """
    labels = frame['y_act']
    unknown = labels[~labels.isin(list(dict_label))].unique().tolist()
    if unknown:
        raise KeyError(f'y_act labels missing from dict_label: {unknown}')
    return labels.map(dict_label).to_frame()


# Pre-split train / test tables.
x_train = pd.read_csv('../data/ml/data_train.csv')
x_test = pd.read_csv('../data/ml/data_test.csv')

# Shuffle the training rows; the fixed seed keeps the order reproducible.
x_train = x_train.sample(frac=1, random_state=100).reset_index(drop=True)
print(len(x_train))

# NOTE(review): `y_act` is not dropped from x_train / x_test here, so the
# label column is still present in the frames handed to the model -- confirm
# that the featurizer ignores it.
y_train = encode_labels(x_train)
y_test = encode_labels(x_test)

7936


# Train and Test Model

In [5]:
# Create the classifier from datazoo.model with no constructor arguments.
model = ZooRandomForest()

In [6]:
# Fit on the shuffled training table and its integer labels. Per the recorded
# logs, this first fits the vectorizer, then runs a 5-fold search over 16
# candidate settings and logs the best parameters found.
model.fit(x_train, y_train)

[W 220320 22:49:54 base:46] vectorizer not fitted yet
[I 220320 22:49:54 base:30] vectorizer not fitted. Doing fit and transform
[I 220320 22:49:55 base:38] total length of from feature extraction: 7936


Fitting 5 folds for each of 16 candidates, totalling 80 fits


[I 220320 22:50:50 random_forest:28] best params: {'bootstrap': True, 'max_depth': 100, 'n_estimators': 500}


In [7]:
# Score the fitted model on the test split; the value is displayed as the
# cell output.
# NOTE(review): `training=True` is passed while scoring *test* data -- confirm
# what this flag controls in ZooRandomForest.score.
model.score(x_test, y_test, training=True)

[I 220320 22:50:50 base:27] vectorizer already fitted. Doing transform
[I 220320 22:50:50 base:38] total length of from feature extraction: 1985


0.9329974811083124

# Test the Model on a Raw Dataset Using the Featurizer

In [8]:
# Raw dataset whose columns will be classified. The original cell also listed
# '../data/datasets/accident.csv' and '../data/datasets/bbc.csv' as
# alternatives; change the path below to try one of them.
raw_data_path = '../data/datasets/boxing.csv'
raw_data = pd.read_csv(raw_data_path)
raw_data

Unnamed: 0,Judge,Official,Round,Winner
0,G._Hamada,Yes,1,de_la_Hoya
1,G._Hamada,Yes,2,Trinidad
2,G._Hamada,Yes,3,de_la_Hoya
3,G._Hamada,Yes,4,Trinidad
4,G._Hamada,Yes,5,de_la_Hoya
...,...,...,...,...
127,van_de_Wiele,No,8,de_la_Hoya
128,van_de_Wiele,No,9,Trinidad
129,van_de_Wiele,No,10,Trinidad
130,van_de_Wiele,No,11,Trinidad


In [9]:
# Build the featurizer around the vectorizer held by the trained model, so
# the vectorizer fitted during training is the one used on raw data (the
# recorded log of the following cell reports "vectorizer already fitted").
fe = ZooFeaturizer(model.vectorizer)

In [10]:
# Convert the raw table into model features and display the result. In the
# recorded run the output has 4 rows for the 4 columns of boxing.csv, i.e. it
# appears to produce one feature row per raw column -- TODO confirm.
data_featurized = fe.featurize(raw_data)
data_featurized

[I 220320 22:50:51 base:27] vectorizer already fitted. Doing transform
[I 220320 22:50:51 base:38] total length of from feature extraction: 4


Unnamed: 0,total_vals,num_nans,%_nans,num_of_dist_val,%_dist_val,mean,std_dev,min_val,max_val,has_delimiters,...,1443,1444,1445,1446,1447,1448,1449,1450,1451,1452
0,132,0,8.333333,11,8.333333,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
1,132,0,1.515152,2,1.515152,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
2,132,0,9.090909,12,9.090909,6.5,3.452053,1.0,12.0,False,...,0,0,0,0,0,0,0,0,0,0
3,132,0,1.515152,2,1.515152,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Predict an integer class id for each featurized row, then convert the
# result to a plain list so it displays compactly.
predicted_ids = model.predict(data_featurized)
y_rf = predicted_ids.tolist()
y_rf



[1, 1, 0, 1]

In [12]:
# Reverse lookup (integer class id -> label name). Derived from `dict_label`
# instead of being typed out a second time, so the two mappings cannot drift.
dict_int_label = {class_id: label for label, class_id in dict_label.items()}

# zip() stops silently at the shorter input; a count mismatch would pair
# columns with the wrong predictions, so fail loudly instead.
if len(y_rf) != len(raw_data.columns):
    raise ValueError(
        f'got {len(y_rf)} predictions for {len(raw_data.columns)} columns'
    )

# Report the predicted type of every raw column, in column order.
for name, int_label in zip(raw_data.columns, y_rf):
    print(f'Column:  {name}\nType: {dict_int_label[int_label]}\n')

Column:  Judge
Type: categorical

Column:  Official
Type: categorical

Column:  Round
Type: numeric

Column:  Winner
Type: categorical

