In [25]:
#Copyright 2020 Vraj Shah, Arun Kumar
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.


import csv
import pandas as pd
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn import metrics

import pickle
import math
import re
import enchant
import os
import glob
import numpy as np
np.random.seed(512)

In [26]:
xtrain = pd.read_csv('../../Benchmark-Labeled-Data/data_train.csv')
xtest = pd.read_csv('../../Benchmark-Labeled-Data/data_test.csv')


xtrain = xtrain.sample(frac=1,random_state=100).reset_index(drop=True)
print(len(xtrain))

y_train = xtrain.loc[:,['y_act']]
y_test = xtest.loc[:,['y_act']]

7936


In [27]:
dict_label = {
    'numeric': 0,
    'categorical': 1,
    'datetime': 2,
    'sentence': 3,
    'url': 4,
    'embedded-number': 5,
    'list': 6,
    'not-generalizable': 7,
    'context-specific': 8
}

y_train['y_act'] = [dict_label[i] for i in y_train['y_act']]
y_test['y_act'] = [dict_label[i] for i in y_test['y_act']]
y_train


Unnamed: 0,y_act
0,0
1,0
2,0
3,1
4,0
...,...
7931,0
7932,1
7933,8
7934,7


In [28]:
useStats = 1
useAttributeName = 1
useSample1 = 0
useSample2 = 0
## Using descriptive stats and attribute name

In [29]:
def ProcessStats(data,y):

    data1 = data[['total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val', 'max_val','has_delimiters', 'has_url', 'has_email', 'has_date', 'mean_word_count',
       'std_dev_word_count', 'mean_stopword_total', 'stdev_stopword_total',
       'mean_char_count', 'stdev_char_count', 'mean_whitespace_count',
       'stdev_whitespace_count', 'mean_delim_count', 'stdev_delim_count',
       'is_list', 'is_long_sentence']]
    data1 = data1.reset_index(drop=True)
    data1 = data1.fillna(0)

    y.y_act = y.y_act.astype(float)
    
    return data1


vectorizerName = CountVectorizer(ngram_range=(2, 2), analyzer='char')
vectorizerSample = CountVectorizer(ngram_range=(2, 2), analyzer='char')

def FeatureExtraction(data,data1,flag):

    arr = data['Attribute_name'].values
    arr = [str(x) for x in arr]
    
    arr1 = data['sample_1'].values
    arr1 = [str(x) for x in arr1]
    arr2 = data['sample_2'].values
    arr2 = [str(x) for x in arr2]
    arr3 = data['sample_3'].values
    arr3 = [str(x) for x in arr3]    
    print(len(arr1),len(arr2))
    if flag:
        X = vectorizerName.fit_transform(arr)
        X1 = vectorizerSample.fit_transform(arr1)
        X2 = vectorizerSample.transform(arr2)   
        
    else:
        X = vectorizerName.transform(arr)
        X1 = vectorizerSample.transform(arr1)
        X2 = vectorizerSample.transform(arr2)        
        
#     print(f"> Length of vectorized feature_names: {len(vectorizer.get_feature_names())}")

    attr_df = pd.DataFrame(X.toarray())
    sample1_df = pd.DataFrame(X1.toarray())
    sample2_df = pd.DataFrame(X2.toarray())
    print(len(data1),len(attr_df),len(sample1_df),len(sample2_df))

    if useSample1: data2 = sample1_df
    if useSample2: data2 = sample2_df    
    
    data2 = pd.concat([data1, attr_df], axis=1, sort=False)
    print(len(data2))
    return data2

In [52]:
xtrain1 = ProcessStats(xtrain,y_train)
xtest1 = ProcessStats(xtest,y_test)


X_train = FeatureExtraction(xtrain,xtrain1,1)
X_test = FeatureExtraction(xtest,xtest1,0)


X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
X_train_new = X_train_new.values
y_train_new = y_train_new.values


k = 5
kf = KFold(n_splits=k,random_state = 100, shuffle=True)
avg_train_acc,avg_test_acc = 0,0

n_estimators_grid = [5,25,50,75,100,500]
max_depth_grid = [5,10,25,50,100,250]
criterion_grid = ['gini', 'entropy', 'log_loss']
# criterion_grid = ['log_loss']

# n_estimators_grid = [25,50,75,100]
# max_depth_grid = [50,100]

avgsc_lst,avgsc_train_lst,avgsc_hld_lst = [],[],[]
avgsc,avgsc_train,avgsc_hld = 0,0,0

best_param_count = {'n_estimator': {}, 'max_depth': {}}
i=0
for train_index, test_index in kf.split(X_train_new):
#     if i==1: break
    i=i+1
    X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
    y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
    y_test_cur = [str(val) for val in y_test_cur]
    X_train_train, X_val,y_train_train,y_val = train_test_split(X_train_cur,y_train_cur, test_size=0.25,random_state=100)

    bestPerformingModel = RandomForestClassifier(n_estimators=10,max_depth=5,random_state=100)
    bestscore = 0
    print('='*10)
    for ne in n_estimators_grid:
        for md in max_depth_grid:
            for cr in criterion_grid:
                clf = RandomForestClassifier(n_estimators=ne,max_depth=md, criterion=cr, random_state=100)
                clf.fit(X_train_train, y_train_train.ravel())
                sc = clf.score(X_val, y_val)
                print(f"[n_estimator: {ne}, max_depth: {md}, accuracy: {sc}]")
                if bestscore < sc:
                    bestne = ne
                    bestmd = md
                    bestcr = cr
                    bestscore = sc
                    bestPerformingModel = clf

    if str(bestne) in best_param_count['n_estimator']:
        best_param_count['n_estimator'][str(bestne)] += 1
    else:
        best_param_count['n_estimator'][str(bestne)] = 1

    if str(bestmd) in best_param_count['max_depth']:
        best_param_count['max_depth'][str(bestmd)] += 1
    else:
        best_param_count['max_depth'][str(bestmd)] = 1

    bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
    bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
    bscr_hld = bestPerformingModel.score(X_test.to_numpy(), y_test)

    avgsc_train_lst.append(bscr_train)
    avgsc_lst.append(bscr)
    avgsc_hld_lst.append(bscr_hld)

    avgsc_train = avgsc_train + bscr_train    
    avgsc = avgsc + bscr
    avgsc_hld = avgsc_hld + bscr_hld

    print()
    print(f"> Best n_estimator: {bestne} || Best max_depth: {bestmd}")
    print(f"> Best training score: {bscr_train}")
    print(f"> Best test score: {bscr}")
    print(f"> Best held score: {bscr_hld}")
print('='*10)

print(avgsc_train_lst)
print(avgsc_lst)
print(avgsc_hld_lst)

print(avgsc_train/k)
print(avgsc/k)
print(avgsc_hld/k)

y_pred = bestPerformingModel.predict(X_test.to_numpy())
bscr_hld = bestPerformingModel.score(X_test.to_numpy(), y_test)
print(bscr_hld)

7936 7936
7936 7936 7936 7936
7936
1985 1985
1985 1985 1985 1985
1985
[n_estimator: 5, max_depth: 5, accuracy: 0.630119722747322]
[n_estimator: 5, max_depth: 10, accuracy: 0.7334593572778828]
[n_estimator: 5, max_depth: 25, accuracy: 0.8664146187775678]
[n_estimator: 5, max_depth: 50, accuracy: 0.8827977315689981]
[n_estimator: 5, max_depth: 100, accuracy: 0.8878386893509767]
[n_estimator: 5, max_depth: 250, accuracy: 0.8878386893509767]
[n_estimator: 25, max_depth: 5, accuracy: 0.6956521739130435]
[n_estimator: 25, max_depth: 10, accuracy: 0.8065532451165721]
[n_estimator: 25, max_depth: 25, accuracy: 0.8947700063011972]
[n_estimator: 25, max_depth: 50, accuracy: 0.9199747952110902]
[n_estimator: 25, max_depth: 100, accuracy: 0.9193446754883428]
[n_estimator: 25, max_depth: 250, accuracy: 0.9193446754883428]
[n_estimator: 50, max_depth: 5, accuracy: 0.6994328922495274]
[n_estimator: 50, max_depth: 10, accuracy: 0.7908002520478891]
[n_estimator: 50, max_depth: 25, accuracy: 0.898550724

  score = y_true == y_pred



> Best n_estimator: 500 || Best max_depth: 100
> Best training score: 0.9809388783868935
> Best test score: 0.0
> Best held score: 0.9239294710327456
[n_estimator: 5, max_depth: 5, accuracy: 0.7437027707808564]
[n_estimator: 5, max_depth: 10, accuracy: 0.7865239294710328]
[n_estimator: 5, max_depth: 25, accuracy: 0.8690176322418136]
[n_estimator: 5, max_depth: 50, accuracy: 0.8998740554156172]
[n_estimator: 5, max_depth: 100, accuracy: 0.8929471032745592]
[n_estimator: 5, max_depth: 250, accuracy: 0.8929471032745592]
[n_estimator: 25, max_depth: 5, accuracy: 0.7285894206549118]
[n_estimator: 25, max_depth: 10, accuracy: 0.7865239294710328]
[n_estimator: 25, max_depth: 25, accuracy: 0.9030226700251889]
[n_estimator: 25, max_depth: 50, accuracy: 0.9219143576826196]
[n_estimator: 25, max_depth: 100, accuracy: 0.9193954659949622]
[n_estimator: 25, max_depth: 250, accuracy: 0.9193954659949622]
[n_estimator: 50, max_depth: 5, accuracy: 0.7059193954659949]
[n_estimator: 50, max_depth: 10, ac

  score = y_true == y_pred



> Best n_estimator: 50 || Best max_depth: 50
> Best training score: 0.9812568908489526
> Best test score: 0.0
> Best held score: 0.9153652392947104
[n_estimator: 5, max_depth: 5, accuracy: 0.6914357682619647]
[n_estimator: 5, max_depth: 10, accuracy: 0.7934508816120907]
[n_estimator: 5, max_depth: 25, accuracy: 0.8683879093198993]
[n_estimator: 5, max_depth: 50, accuracy: 0.8998740554156172]
[n_estimator: 5, max_depth: 100, accuracy: 0.9011335012594458]
[n_estimator: 5, max_depth: 250, accuracy: 0.9011335012594458]
[n_estimator: 25, max_depth: 5, accuracy: 0.7134760705289672]
[n_estimator: 25, max_depth: 10, accuracy: 0.8230478589420654]
[n_estimator: 25, max_depth: 25, accuracy: 0.9086901763224181]
[n_estimator: 25, max_depth: 50, accuracy: 0.924433249370277]
[n_estimator: 25, max_depth: 100, accuracy: 0.9256926952141058]
[n_estimator: 25, max_depth: 250, accuracy: 0.9256926952141058]
[n_estimator: 50, max_depth: 5, accuracy: 0.6574307304785895]
[n_estimator: 50, max_depth: 10, accur

  score = y_true == y_pred



> Best n_estimator: 500 || Best max_depth: 100
> Best training score: 0.9823594266813671
> Best test score: 0.0
> Best held score: 0.9229219143576827
[n_estimator: 5, max_depth: 5, accuracy: 0.6530226700251889]
[n_estimator: 5, max_depth: 10, accuracy: 0.760705289672544]
[n_estimator: 5, max_depth: 25, accuracy: 0.8772040302267002]
[n_estimator: 5, max_depth: 50, accuracy: 0.889168765743073]
[n_estimator: 5, max_depth: 100, accuracy: 0.8967254408060453]
[n_estimator: 5, max_depth: 250, accuracy: 0.8967254408060453]
[n_estimator: 25, max_depth: 5, accuracy: 0.7185138539042821]
[n_estimator: 25, max_depth: 10, accuracy: 0.8293450881612091]
[n_estimator: 25, max_depth: 25, accuracy: 0.9080604534005038]
[n_estimator: 25, max_depth: 50, accuracy: 0.9238035264483627]
[n_estimator: 25, max_depth: 100, accuracy: 0.9263224181360201]
[n_estimator: 25, max_depth: 250, accuracy: 0.9263224181360201]
[n_estimator: 50, max_depth: 5, accuracy: 0.7229219143576826]
[n_estimator: 50, max_depth: 10, accu

  score = y_true == y_pred



> Best n_estimator: 75 || Best max_depth: 100
> Best training score: 0.9828319420381162
> Best test score: 0.0
> Best held score: 0.929471032745592
[n_estimator: 5, max_depth: 5, accuracy: 0.6045340050377834]
[n_estimator: 5, max_depth: 10, accuracy: 0.7374055415617129]
[n_estimator: 5, max_depth: 25, accuracy: 0.8803526448362721]
[n_estimator: 5, max_depth: 50, accuracy: 0.8803526448362721]
[n_estimator: 5, max_depth: 100, accuracy: 0.889168765743073]
[n_estimator: 5, max_depth: 250, accuracy: 0.889168765743073]
[n_estimator: 25, max_depth: 5, accuracy: 0.6769521410579346]
[n_estimator: 25, max_depth: 10, accuracy: 0.8129722921914357]
[n_estimator: 25, max_depth: 25, accuracy: 0.9118387909319899]
[n_estimator: 25, max_depth: 50, accuracy: 0.9200251889168766]
[n_estimator: 25, max_depth: 100, accuracy: 0.9231738035264484]
[n_estimator: 25, max_depth: 250, accuracy: 0.9231738035264484]
[n_estimator: 50, max_depth: 5, accuracy: 0.716624685138539]
[n_estimator: 50, max_depth: 10, accurac

  score = y_true == y_pred



> Best n_estimator: 100 || Best max_depth: 50
> Best training score: 0.9828319420381162
> Best test score: 0.0
> Best held score: 0.926448362720403
[0.9809388783868935, 0.9812568908489526, 0.9823594266813671, 0.9828319420381162, 0.9828319420381162]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.9239294710327456, 0.9153652392947104, 0.9229219143576827, 0.929471032745592, 0.926448362720403]
0.9820438159986891
0.0
0.9236272040302268
0.926448362720403


In [53]:
print(bestPerformingModel.score(X_test.to_numpy(), y_test))

print(best_param_count)
print(bestne)
print(bestmd)
print(bestcr)
print(bestscore)

0.926448362720403
{'n_estimator': {'500': 2, '50': 1, '75': 1, '100': 1}, 'max_depth': {'100': 3, '50': 2}}
100
50
log_loss
0.9326196473551638


In [56]:
from sklearn.metrics import f1_score, precision_score, recall_score


def PrintMetrics(y_true, y_pred):
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')

    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    f1 = f1_score(y_true, y_pred, average=None)

    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 score: {f1}')


PrintMetrics(y_test, bestPerformingModel.predict(X_test.to_numpy()))

Accuracy: 0.926448362720403
[[694   2   0   0   0   0   0   1  10]
 [ 13 432   0   4   0   0   0   2   6]
 [  1   1 137   0   0   2   0   0   0]
 [  0   3   0  82   0   0   0   5   2]
 [  0   1   0   0  30   0   1   0   0]
 [  0   7   1   0   0  91   0   0   0]
 [  1   3   0   3   0   3  44   0   3]
 [  4  10   2   2   0   0   0 194   3]
 [ 27  16   0   2   0   0   0   5 135]]
Precision: [0.93783784 0.90947368 0.97857143 0.88172043 1.         0.94791667
 0.97777778 0.93719807 0.8490566 ]
Recall: [0.98161245 0.9452954  0.97163121 0.89130435 0.9375     0.91919192
 0.77192982 0.90232558 0.72972973]
F1 score: [0.95922598 0.92703863 0.97508897 0.88648649 0.96774194 0.93333333
 0.8627451  0.91943128 0.78488372]
