# Trying a pipeline-based approach

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/train.csv")
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [2]:
story = []
for i in train_data['full_text']:
    story.append(i.lower().replace("\n", "").strip())

In [3]:
story = np.array(story).reshape(-1, 1)
story.shape

(3911, 1)

In [4]:
cohesion = np.array(train_data['cohesion']).astype(float)
syntax = np.array(train_data['syntax']).astype(float)
vocabulary = np.array(train_data['vocabulary']).astype(float)
phraseology = np.array(train_data['phraseology']).astype(float)
grammar = np.array(train_data['grammar']).astype(float)
conventions = np.array(train_data['conventions']).astype(float)
conventions.shape

(3911,)

In [5]:
test_data = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/test.csv")
test_data

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [6]:
story_pred = []
for i in test_data['full_text']:
    story_pred.append(i.lower().replace("\n", "").strip())
story_pred = np.array(story_pred).reshape(-1, 1)

In [7]:
from atom import ATOMRegressor

atom1 = ATOMRegressor(story, cohesion, test_size = 0.33, random_state = 42, verbose = 2)
atom2 = ATOMRegressor(story, syntax, test_size = 0.33, random_state = 42, verbose = 2)
atom3 = ATOMRegressor(story, vocabulary, test_size = 0.33, random_state = 42, verbose = 2)
atom4 = ATOMRegressor(story, phraseology, test_size = 0.33, random_state = 42, verbose = 2)
atom5 = ATOMRegressor(story, grammar, test_size = 0.33, random_state = 42, verbose = 2)
atom6 = ATOMRegressor(story, conventions, test_size = 0.33, random_state = 42, verbose = 2)

Woodwork may not support Python 3.7 in next non-bugfix release.
Featuretools may not support Python 3.7 in next non-bugfix release.


Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 6 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 23 (0.4%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 1 (0.0%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 8 (0.2%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
----------------------------------

In [8]:
atom1.dataset

Unnamed: 0,corpus,target
0,the great artist michelangelo i thinking he is...,2.0
1,one of the things i want to acumplish in the f...,2.5
2,its always good to ask people how it feels or ...,3.0
3,in order for students to have a good experienc...,3.0
4,"dear, principal,i will like to star in saying ...",3.5
...,...,...
3906,i think an enjoyable educational activity for ...,4.0
3907,some people may say that oppocite but to start...,3.5
3908,"""unles you try to do somthing beyond what you ...",2.5
3909,having activities after school are good ideas ...,4.0


In [9]:
atom1.tokenize()
atom2.tokenize()
atom3.tokenize()
atom4.tokenize()
atom5.tokenize()
atom6.tokenize()

Tokenizing the corpus...
Tokenizing the corpus...
Tokenizing the corpus...
Tokenizing the corpus...
Tokenizing the corpus...
Tokenizing the corpus...


In [10]:
atom1.vectorize(strategy = 'tfidf')
atom2.vectorize(strategy = 'tfidf')
atom3.vectorize(strategy = 'tfidf')
atom4.vectorize(strategy = 'tfidf')
atom5.vectorize(strategy = 'tfidf')
atom6.vectorize(strategy = 'tfidf')

Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...


In [11]:
atom1.available_models()

Unnamed: 0,acronym,fullname,estimator,module,needs_scaling,accepts_sparse,supports_gpu
0,Dummy,Dummy Estimator,DummyRegressor,sklearn.dummy,False,False,False
1,GP,Gaussian Process,GaussianProcessRegressor,sklearn.gaussian_process._gpr,False,False,False
2,OLS,Ordinary Least Squares,LinearRegression,sklearn.linear_model._base,True,True,True
3,Ridge,Ridge Estimator,Ridge,sklearn.linear_model._ridge,True,True,True
4,Lasso,Lasso Regression,Lasso,sklearn.linear_model._coordinate_descent,True,True,True
5,EN,ElasticNet Regression,ElasticNet,sklearn.linear_model._coordinate_descent,True,True,True
6,Lars,Least Angle Regression,Lars,sklearn.linear_model._least_angle,True,False,True
7,BR,Bayesian Ridge,BayesianRidge,sklearn.linear_model._bayes,True,False,False
8,ARD,Automatic Relevant Determination,ARDRegression,sklearn.linear_model._bayes,True,False,False
9,Huber,Huber Regression,HuberRegressor,sklearn.linear_model._huber,True,False,False


In [12]:
atom1.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom2.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom3.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom4.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom5.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom6.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')



Models: Tree, Bag, ET, RF, AdaB, GBM
Metric: neg_mean_squared_error


Results for Decision Tree:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.6905
Time elapsed: 34.889s
-------------------------------------------------
Total time: 34.889s


Results for Bagging:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0665
Test evaluation --> neg_mean_squared_error: -0.3724
Time elapsed: 45.341s
-------------------------------------------------
Total time: 45.341s


Results for Extra-Trees:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.3148
Time elapsed: 3m:44s
-------------------------------------------------
Total time: 3m:44s


Results for Random Forest:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_er

In [13]:
atom1.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.62907,-0.217657,-3.0,-0.690504,-0.043243,-0.609043,-0.830966
Bag,-0.491473,-0.172058,-2.05,-0.372403,-0.023493,0.13221,-0.610248
ET,-0.45407,-0.159435,-1.95,-0.31483,-0.019958,0.26637,-0.561097
RF,-0.462717,-0.162639,-1.955,-0.330374,-0.020931,0.230149,-0.574781
AdaB,-0.463884,-0.162245,-1.933438,-0.33583,-0.021168,0.217435,-0.579508
GBM,-0.450396,-0.157791,-2.021971,-0.316166,-0.019919,0.263256,-0.562286


In [14]:
atom2.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.615116,-0.21913,-2.5,-0.64438,-0.04099,-0.55968,-0.802733
Bag,-0.476163,-0.17191,-2.1,-0.35144,-0.02291,0.149362,-0.592824
ET,-0.442915,-0.160575,-1.995,-0.304019,-0.020042,0.264141,-0.551379
RF,-0.45588,-0.164909,-2.04,-0.320382,-0.021029,0.224537,-0.566023
AdaB,-0.463273,-0.169895,-2.110998,-0.33155,-0.021999,0.197505,-0.575804
GBM,-0.443511,-0.16093,-1.88122,-0.306981,-0.020276,0.256972,-0.554059


In [15]:
atom3.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.532946,-0.174002,-2.5,-0.490504,-0.028537,-0.470053,-0.70036
Bag,-0.402946,-0.133461,-1.95,-0.263806,-0.01547,0.209366,-0.513621
ET,-0.385733,-0.128135,-1.91,-0.243917,-0.014381,0.268973,-0.49388
RF,-0.393027,-0.130382,-1.835,-0.251294,-0.014768,0.246864,-0.501293
AdaB,-0.404015,-0.135348,-1.856737,-0.262737,-0.015498,0.212569,-0.512579
GBM,-0.388365,-0.129225,-2.017709,-0.248103,-0.014645,0.25643,-0.498099


In [16]:
atom4.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.601938,-0.204575,-3.0,-0.611822,-0.037747,-0.437622,-0.78219
Bag,-0.465775,-0.161169,-1.75,-0.326157,-0.020243,0.233616,-0.571102
ET,-0.443674,-0.15438,-1.715,-0.295776,-0.018511,0.305002,-0.543853
RF,-0.448535,-0.155616,-1.755,-0.302574,-0.018844,0.289029,-0.550067
AdaB,-0.470015,-0.162839,-1.787571,-0.338824,-0.021063,0.203852,-0.582086
GBM,-0.435305,-0.151905,-1.720491,-0.290251,-0.018236,0.317985,-0.53875


In [17]:
atom5.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.672868,-0.233109,-2.5,-0.744186,-0.046867,-0.533732,-0.862662
Bag,-0.513256,-0.182173,-2.2,-0.399663,-0.025154,0.176314,-0.632189
ET,-0.482132,-0.170976,-2.14,-0.35149,-0.022152,0.275596,-0.592866
RF,-0.486167,-0.17259,-2.085,-0.360533,-0.022713,0.256959,-0.600444
AdaB,-0.510931,-0.186167,-2.167283,-0.391608,-0.02527,0.192915,-0.625786
GBM,-0.480369,-0.170856,-2.065322,-0.352067,-0.022263,0.274407,-0.593352


In [18]:
atom6.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.651938,-0.229933,-3.0,-0.714341,-0.045054,-0.555842,-0.845187
Bag,-0.504496,-0.180131,-2.25,-0.39605,-0.025312,0.137398,-0.629325
ET,-0.475872,-0.17129,-2.05,-0.349744,-0.022564,0.238254,-0.591392
RF,-0.487822,-0.175607,-2.05,-0.36537,-0.02359,0.204221,-0.604458
AdaB,-0.499622,-0.18334,-2.090667,-0.381559,-0.025036,0.168962,-0.617704
GBM,-0.472317,-0.169886,-1.938375,-0.3425,-0.022211,0.254032,-0.585235
