# Trying a pipeline-based approach

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [2]:
story = []
for i in train_data['full_text']:
    story.append(i.lower().replace("\n", "").strip())

In [3]:
story = np.array(story).reshape(-1, 1)
story.shape

(3911, 1)

In [4]:
cohesion = np.array(train_data['cohesion']).astype(float)
syntax = np.array(train_data['syntax']).astype(float)
vocabulary = np.array(train_data['vocabulary']).astype(float)
phraseology = np.array(train_data['phraseology']).astype(float)
grammar = np.array(train_data['grammar']).astype(float)
conventions = np.array(train_data['conventions']).astype(float)
conventions.shape

(3911,)

In [5]:
test_data = pd.read_csv("test.csv")
test_data

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [6]:
story_pred = []
for i in test_data['full_text']:
    story_pred.append(i.lower().replace("\n", "").strip())
story_pred = np.array(story_pred).reshape(-1, 1)

In [7]:
from atom import ATOMRegressor

atom1 = ATOMRegressor(story, cohesion, test_size = 0.33, random_state = 42, verbose = 2)
atom2 = ATOMRegressor(story, syntax, test_size = 0.33, random_state = 42, verbose = 2)
atom3 = ATOMRegressor(story, vocabulary, test_size = 0.33, random_state = 42, verbose = 2)
atom4 = ATOMRegressor(story, phraseology, test_size = 0.33, random_state = 42, verbose = 2)
atom5 = ATOMRegressor(story, grammar, test_size = 0.33, random_state = 42, verbose = 2)
atom6 = ATOMRegressor(story, conventions, test_size = 0.33, random_state = 42, verbose = 2)

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 4 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 17 (0.3%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 23 (0.4%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 3 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
---------------------------------

In [8]:
atom1.dataset

Unnamed: 0,corpus,target
0,i agreed that the greater of us lies and not i...,3.5
1,the most enjoyable educational activity would ...,3.0
2,people always argue when it comes to does posi...,3.0
3,i dont like becuase the student forget all inf...,3.0
4,"a first impression is the key to succes, becau...",3.0
...,...,...
3906,why does most people want multiple advice when...,3.5
3907,make sure you have your seat belts. that is th...,2.5
3908,"people can check out books, shop and play game...",3.0
3909,doing projects at school is really fun when it...,4.0


In [9]:
atom1.textclean()
atom2.textclean()
atom3.textclean()
atom4.textclean()
atom5.textclean()
atom6.textclean()

Fitting TextCleaner...
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Fitting TextCleaner...
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Fitting TextCleaner...
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tag

In [10]:
atom1.tokenize()
atom2.tokenize()
atom3.tokenize()
atom4.tokenize()
atom5.tokenize()
atom6.tokenize()

Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...
Fitting Tokenizer...
Tokenizing the corpus...


In [11]:
atom1.vectorize(strategy = 'bow')
atom2.vectorize(strategy = 'bow')
atom3.vectorize(strategy = 'bow')
atom4.vectorize(strategy = 'bow')
atom5.vectorize(strategy = 'bow')
atom6.vectorize(strategy = 'bow')

Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...


In [12]:
atom1.available_models()

Unnamed: 0,acronym,model,estimator,module,needs_scaling,accepts_sparse,has_validation,supports_engines
0,AdaB,AdaBoost,AdaBoostRegressor,sklearnensemble,False,True,False,sklearn
1,ARD,AutomaticRelevanceDetermination,ARDRegression,sklearnlinear_model,True,False,False,sklearn
2,Bag,Bagging,BaggingRegressor,sklearnensemble,False,True,False,sklearn
3,BR,BayesianRidge,BayesianRidge,sklearnlinear_model,True,False,False,sklearn
4,CatB,CatBoost,CatBoostRegressor,catboostcatboost,True,True,True,catboost
5,Tree,DecisionTree,DecisionTreeRegressor,sklearntree,False,True,False,sklearn
6,Dummy,Dummy,DummyRegressor,sklearndummy,False,False,False,sklearn
7,EN,ElasticNet,ElasticNet,sklearnlinear_model,True,True,False,"sklearn, sklearnex, cuml"
8,ETree,ExtraTree,ExtraTreeRegressor,sklearntree,False,True,False,sklearn
9,ET,ExtraTrees,ExtraTreesRegressor,sklearnensemble,False,True,False,sklearn


In [13]:
atom1.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom2.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom3.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom4.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom5.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom6.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')



Models: Tree, Bag, ET, RF, AdaB, GBM
Metric: neg_mean_squared_error


Results for DecisionTree:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.6886
Time elapsed: 29.636s
-------------------------------------------------
Total time: 29.636s


Results for Bagging:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0654
Test evaluation --> neg_mean_squared_error: -0.3627
Time elapsed: 44.494s
-------------------------------------------------
Total time: 44.494s


Results for ExtraTrees:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.3336
Time elapsed: 05m:23s
-------------------------------------------------
Total time: 05m:23s


Results for RandomForest:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_err

In [14]:
atom1.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.636,-0.218,-3.0,-0.6886,-0.5662,-0.8298
Bag,-0.4848,-0.1698,-1.95,-0.3627,0.1749,-0.6023
ET,-0.4672,-0.1641,-1.89,-0.3336,0.2412,-0.5776
RF,-0.4715,-0.1654,-1.93,-0.3384,0.2302,-0.5817
AdaB,-0.4875,-0.1699,-2.0726,-0.3586,0.1844,-0.5988
GBM,-0.4679,-0.1645,-1.8261,-0.3329,0.2427,-0.577


In [15]:
atom2.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.6078,-0.214,-3.0,-0.6287,-0.5175,-0.7929
Bag,-0.4667,-0.1665,-1.95,-0.3428,0.1726,-0.5855
ET,-0.4368,-0.1574,-2.18,-0.3042,0.2656,-0.5516
RF,-0.4453,-0.16,-2.135,-0.3142,0.2415,-0.5605
AdaB,-0.4645,-0.1707,-2.0445,-0.3363,0.1881,-0.5799
GBM,-0.4413,-0.1594,-2.1355,-0.309,0.2542,-0.5559


In [16]:
atom3.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.5264,-0.1697,-2.5,-0.5101,-0.4523,-0.7142
Bag,-0.4094,-0.133,-2.35,-0.2792,0.2049,-0.5284
ET,-0.3897,-0.127,-2.075,-0.2556,0.2723,-0.5056
RF,-0.3977,-0.1294,-2.1,-0.2637,0.2491,-0.5136
AdaB,-0.405,-0.1322,-2.0041,-0.2763,0.2133,-0.5256
GBM,-0.3888,-0.1274,-2.0607,-0.2544,0.2756,-0.5044


In [17]:
atom4.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.6147,-0.2073,-3.0,-0.6376,-0.4938,-0.7985
Bag,-0.4723,-0.1619,-2.15,-0.3542,0.17,-0.5952
ET,-0.4428,-0.1529,-1.76,-0.3049,0.2857,-0.5522
RF,-0.4536,-0.1563,-1.89,-0.3203,0.2495,-0.566
AdaB,-0.4716,-0.1641,-1.9511,-0.3424,0.1978,-0.5852
GBM,-0.4414,-0.153,-1.8958,-0.3041,0.2874,-0.5515


In [18]:
atom5.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.674,-0.2341,-3.0,-0.7428,-0.5206,-0.8619
Bag,-0.5183,-0.1834,-2.2,-0.4101,0.1605,-0.6404
ET,-0.4923,-0.1743,-2.09,-0.3641,0.2546,-0.6034
RF,-0.504,-0.1782,-2.21,-0.3777,0.2267,-0.6146
AdaB,-0.5328,-0.1948,-2.1613,-0.4182,0.144,-0.6467
GBM,-0.4858,-0.1742,-2.0356,-0.3572,0.2688,-0.5976


In [19]:
atom6.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,r2,neg_root_mean_squared_error
Tree,-0.6671,-0.2325,-3.0,-0.7502,-0.6158,-0.8661
Bag,-0.5107,-0.1818,-2.05,-0.4069,0.1236,-0.6379
ET,-0.4818,-0.1717,-1.98,-0.3611,0.2223,-0.6009
RF,-0.4938,-0.1759,-2.025,-0.3765,0.1891,-0.6136
AdaB,-0.5121,-0.1856,-2.0609,-0.3984,0.1418,-0.6312
GBM,-0.485,-0.1721,-2.126,-0.362,0.2202,-0.6017
