In [1]:
import numpy as np
import pandas as pd
from gen_features import FeatureGenerator
from model_loop import ModelLoop
from sklearn.model_selection import train_test_split



In [2]:
# test-train first; change to take a dataframe
# save those to to file
# call FeatureGenerator on training data; call FeatureGenerator.train() --> saves sparse matrices in attribute
# call FeatureGenerator on test data; call Feature Generator.transform(new_datafile)
# f.x --> training sparse matrix
# f.new_x --> testing sparse matrix

In [3]:
data_fp = '../data_cleaning/articles1.csv'
data = pd.read_csv(data_fp)

In [4]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
data = data.sample(n=1000)

In [6]:
X, y = train_test_split(data)

In [7]:
X.to_csv('data/X_full.csv')

In [8]:
y.to_csv('data/y_full.csv')

In [9]:
config = {
    'datafile': 'data/X_full.csv',
    'test_datafile': 'data/y_full.csv',
    'text_label': 'content',
    'y_label': 'label',
    'fts_to_try': ['TfidfVectorize'],
}

In [10]:
args = {k: v for k, v in config.items() if k != 'test_datafile'}

In [11]:
gen = FeatureGenerator(**args)

Creating feature:  TfidfVectorize
Parameters:  {'ngram_range': (1, 2)}
3000 features generated for 750 examples


In [12]:
gen.transform(config['test_datafile'])

3000 features generated for 250 examples


In [13]:
models = ['NB', 'RF', 'ET', 'LR', 'SVM']
iterations = 5
output_dir = 'output/'
ks = [0.05, 0.10]

In [14]:
loop = ModelLoop(gen.X_train, gen.X_test, gen.y_train, gen.y_test, models, iterations, output_dir,
                 ks = ks, method='csc')

In [15]:
loop.run()

Running NB.
Running RF.
Running ET.
Running LR.
Running SVM.


In [17]:
pd.read_csv('output/simple_report.csv', quotechar='"', skipinitialspace = True)

Unnamed: 0,model_id,model_type,iteration,auc,k,precision,recall,accuracy,params
0,0-0,NB,0,0.782813,0.05,0.769231,0.111111,0.668,{}
1,0-0,NB,0,0.782813,0.1,0.769231,0.111111,0.668,{}
2,1-0,RF,0,0.938715,0.05,1.0,0.144444,0.692,"{'max_depth': 60, 'max_features': 'sqrt', 'min..."
3,1-0,RF,0,0.938715,0.1,1.0,0.144444,0.692,"{'max_depth': 60, 'max_features': 'sqrt', 'min..."
4,2-0,ET,0,0.950486,0.05,1.0,0.144444,0.692,"{'criterion': 'entropy', 'max_depth': 10, 'max..."
5,2-0,ET,0,0.950486,0.1,1.0,0.144444,0.692,"{'criterion': 'entropy', 'max_depth': 10, 'max..."
6,3-0,LR,0,0.5,0.05,0.615385,0.088889,0.652,"{'C': 0.1, 'penalty': 'l1', 'random_state': 1}"
7,3-0,LR,0,0.5,0.1,0.615385,0.088889,0.652,"{'C': 0.1, 'penalty': 'l1', 'random_state': 1}"
8,4-0,SVM,0,0.945208,0.05,1.0,0.144444,0.692,"{'C': 1, 'kernel': 'linear', 'random_state': 1}"
9,4-0,SVM,0,0.945208,0.1,1.0,0.144444,0.692,"{'C': 1, 'kernel': 'linear', 'random_state': 1}"
