# Trying a pipeline-based approach

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [2]:
story = []
for i in train_data['full_text']:
    story.append(i.lower().replace("\n", "").strip())

In [3]:
story = np.array(story).reshape(-1, 1)
story.shape

(3911, 1)

In [4]:
cohesion = np.array(train_data['cohesion']).astype(float)
syntax = np.array(train_data['syntax']).astype(float)
vocabulary = np.array(train_data['vocabulary']).astype(float)
phraseology = np.array(train_data['phraseology']).astype(float)
grammar = np.array(train_data['grammar']).astype(float)
conventions = np.array(train_data['conventions']).astype(float)
conventions.shape

(3911,)

In [5]:
test_data = pd.read_csv("test.csv")
test_data

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [6]:
story_pred = []
for i in test_data['full_text']:
    story_pred.append(i.lower().replace("\n", "").strip())
story_pred = np.array(story_pred).reshape(-1, 1)

In [7]:
from atom import ATOMRegressor

atom1 = ATOMRegressor(story, cohesion, test_size = 0.33, random_state = 42, verbose = 2)
atom2 = ATOMRegressor(story, syntax, test_size = 0.33, random_state = 42, verbose = 2)
atom3 = ATOMRegressor(story, vocabulary, test_size = 0.33, random_state = 42, verbose = 2)
atom4 = ATOMRegressor(story, phraseology, test_size = 0.33, random_state = 42, verbose = 2)
atom5 = ATOMRegressor(story, grammar, test_size = 0.33, random_state = 42, verbose = 2)
atom6 = ATOMRegressor(story, conventions, test_size = 0.33, random_state = 42, verbose = 2)

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 4 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 17 (0.3%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 23 (0.4%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 3 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
---------------------------------

In [8]:
atom1.dataset

Unnamed: 0,corpus,target
0,i agreed that the greater of us lies and not i...,3.5
1,the most enjoyable educational activity would ...,3.0
2,people always argue when it comes to does posi...,3.0
3,i dont like becuase the student forget all inf...,3.0
4,"a first impression is the key to succes, becau...",3.0
...,...,...
3906,why does most people want multiple advice when...,3.5
3907,make sure you have your seat belts. that is th...,2.5
3908,"people can check out books, shop and play game...",3.0
3909,doing projects at school is really fun when it...,4.0


In [9]:
atom1.tokenize()
atom2.tokenize()
atom3.tokenize()
atom4.tokenize()
atom5.tokenize()
atom6.tokenize()

Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...


In [10]:
atom1.vectorize(strategy = 'bow')
atom2.vectorize(strategy = 'bow')
atom3.vectorize(strategy = 'bow')
atom4.vectorize(strategy = 'bow')
atom5.vectorize(strategy = 'bow')
atom6.vectorize(strategy = 'bow')

Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...


In [11]:
atom1.available_models()

Unnamed: 0,acronym,model,estimator,module,needs_scaling,accepts_sparse,has_validation,supports_engines
0,AdaB,AdaBoost,AdaBoostRegressor,sklearnensemble,False,True,False,sklearn
1,ARD,AutomaticRelevanceDetermination,ARDRegression,sklearnlinear_model,True,False,False,sklearn
2,Bag,Bagging,BaggingRegressor,sklearnensemble,False,True,False,sklearn
3,BR,BayesianRidge,BayesianRidge,sklearnlinear_model,True,False,False,sklearn
4,CatB,CatBoost,CatBoostRegressor,catboostcatboost,True,True,True,catboost
5,Tree,DecisionTree,DecisionTreeRegressor,sklearntree,False,True,False,sklearn
6,Dummy,Dummy,DummyRegressor,sklearndummy,False,False,False,sklearn
7,EN,ElasticNet,ElasticNet,sklearnlinear_model,True,True,False,"sklearn, sklearnex, cuml"
8,ETree,ExtraTree,ExtraTreeRegressor,sklearntree,False,True,False,sklearn
9,ET,ExtraTrees,ExtraTreesRegressor,sklearnensemble,False,True,False,sklearn


In [12]:
atom1.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom2.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom3.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom4.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom5.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom6.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')



Models: Tree, Bag, ET, RF, AdaB, GBM
Metric: neg_mean_squared_error


Results for DecisionTree:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.6767
Time elapsed: 19.075s
-------------------------------------------------
Total time: 19.075s


Results for Bagging:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0638
Test evaluation --> neg_mean_squared_error: -0.3808
Time elapsed: 30.389s
-------------------------------------------------
Total time: 30.389s


Results for ExtraTrees:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.3304
Time elapsed: 04m:12s
-------------------------------------------------
Total time: 04m:12s


Results for RandomForest:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_err

In [13]:
atom1.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.6357,-0.2182,-3.0,-0.6767,-0.5393,-0.8226
Bag,-0.4994,-0.1745,-1.95,-0.3808,0.1338,-0.6171
ET,-0.4652,-0.163,-1.95,-0.3304,0.2485,-0.5748
RF,-0.4701,-0.1646,-1.93,-0.3367,0.2343,-0.5802
AdaB,-0.4848,-0.1694,-2.1254,-0.3534,0.1961,-0.5945
GBM,-0.4626,-0.1629,-1.9249,-0.3256,0.2594,-0.5706


In [14]:
atom2.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.5926,-0.2076,-3.0,-0.6064,-0.4638,-0.7787
Bag,-0.4675,-0.1671,-2.2,-0.3423,0.1737,-0.5851
ET,-0.4393,-0.1578,-2.125,-0.3028,0.269,-0.5503
RF,-0.4445,-0.1594,-2.07,-0.3115,0.2482,-0.5581
AdaB,-0.4613,-0.1694,-2.1317,-0.3396,0.1801,-0.5828
GBM,-0.4409,-0.159,-2.0434,-0.3051,0.2636,-0.5523


In [15]:
atom3.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.5209,-0.1685,-2.5,-0.5035,-0.4335,-0.7096
Bag,-0.4137,-0.1341,-2.25,-0.2832,0.1936,-0.5322
ET,-0.39,-0.127,-2.0,-0.2568,0.2689,-0.5067
RF,-0.398,-0.1295,-2.155,-0.2656,0.2438,-0.5153
AdaB,-0.4046,-0.1321,-2.1172,-0.2753,0.216,-0.5247
GBM,-0.3877,-0.1268,-1.9163,-0.2521,0.2821,-0.5021


In [16]:
atom4.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.6182,-0.209,-2.5,-0.6254,-0.4652,-0.7908
Bag,-0.4786,-0.1645,-2.15,-0.3545,0.1693,-0.5954
ET,-0.4419,-0.1527,-1.8,-0.3048,0.286,-0.5521
RF,-0.4522,-0.156,-1.84,-0.3181,0.2547,-0.564
AdaB,-0.4658,-0.1613,-1.9466,-0.3421,0.1984,-0.5849
GBM,-0.4412,-0.153,-1.9329,-0.3032,0.2897,-0.5506


In [17]:
atom5.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.6903,-0.2415,-3.0,-0.7913,-0.6198,-0.8895
Bag,-0.5261,-0.1868,-2.15,-0.419,0.1423,-0.6473
ET,-0.4887,-0.173,-1.985,-0.3568,0.2695,-0.5974
RF,-0.5044,-0.1784,-2.015,-0.378,0.2263,-0.6148
AdaB,-0.5279,-0.1903,-2.0703,-0.4179,0.1446,-0.6464
GBM,-0.4866,-0.1741,-2.089,-0.3566,0.2701,-0.5971


In [18]:
atom6.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.6822,-0.2385,-2.5,-0.7581,-0.6329,-0.8707
Bag,-0.5202,-0.1832,-2.15,-0.4173,0.1013,-0.646
ET,-0.4833,-0.1727,-1.88,-0.3602,0.2242,-0.6002
RF,-0.4905,-0.175,-1.94,-0.375,0.1923,-0.6124
AdaB,-0.5095,-0.1838,-2.0764,-0.4014,0.1353,-0.6336
GBM,-0.4771,-0.1698,-2.037,-0.3514,0.243,-0.5928
