# Trying a pipeline-based approach

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("/content/drive/MyDrive/B565 Data/train.csv")
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [2]:
story = []
for i in train_data['full_text']:
    story.append(i.lower().replace("\n", "").strip())

In [3]:
story = np.array(story).reshape(-1, 1)
story.shape

(3911, 1)

In [4]:
cohesion = np.array(train_data['cohesion']).astype(float)
syntax = np.array(train_data['syntax']).astype(float)
vocabulary = np.array(train_data['vocabulary']).astype(float)
phraseology = np.array(train_data['phraseology']).astype(float)
grammar = np.array(train_data['grammar']).astype(float)
conventions = np.array(train_data['conventions']).astype(float)
conventions.shape

(3911,)

In [5]:
test_data = pd.read_csv("/content/drive/MyDrive/B565 Data/test.csv")
test_data

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [6]:
story_pred = []
for i in test_data['full_text']:
    story_pred.append(i.lower().replace("\n", "").strip())
story_pred = np.array(story_pred).reshape(-1, 1)

In [7]:
from atom import ATOMRegressor

atom1 = ATOMRegressor(story, cohesion, test_size = 0.33, random_state = 42, verbose = 2)
atom2 = ATOMRegressor(story, syntax, test_size = 0.33, random_state = 42, verbose = 2)
atom3 = ATOMRegressor(story, vocabulary, test_size = 0.33, random_state = 42, verbose = 2)
atom4 = ATOMRegressor(story, phraseology, test_size = 0.33, random_state = 42, verbose = 2)
atom5 = ATOMRegressor(story, grammar, test_size = 0.33, random_state = 42, verbose = 2)
atom6 = ATOMRegressor(story, conventions, test_size = 0.33, random_state = 42, verbose = 2)

Woodwork may not support Python 3.7 in next non-bugfix release.
Featuretools may not support Python 3.7 in next non-bugfix release.


Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 6 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 23 (0.4%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 1 (0.0%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 8 (0.2%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
----------------------------------

In [8]:
atom1.dataset

Unnamed: 0,corpus,target
0,the great artist michelangelo i thinking he is...,2.0
1,one of the things i want to acumplish in the f...,2.5
2,its always good to ask people how it feels or ...,3.0
3,in order for students to have a good experienc...,3.0
4,"dear, principal,i will like to star in saying ...",3.5
...,...,...
3906,i think an enjoyable educational activity for ...,4.0
3907,some people may say that oppocite but to start...,3.5
3908,"""unles you try to do somthing beyond what you ...",2.5
3909,having activities after school are good ideas ...,4.0


In [9]:
atom1.tokenize(quadgram_freq = 100)
atom2.tokenize(quadgram_freq = 100)
atom3.tokenize(quadgram_freq = 100)
atom4.tokenize(quadgram_freq = 100)
atom5.tokenize(quadgram_freq = 100)
atom6.tokenize(quadgram_freq = 100)

Tokenizing the corpus...
 --> Creating 140 quadgrams on 22517 locations.
Tokenizing the corpus...
 --> Creating 140 quadgrams on 22517 locations.
Tokenizing the corpus...
 --> Creating 140 quadgrams on 22517 locations.
Tokenizing the corpus...
 --> Creating 140 quadgrams on 22517 locations.
Tokenizing the corpus...
 --> Creating 140 quadgrams on 22517 locations.
Tokenizing the corpus...
 --> Creating 140 quadgrams on 22517 locations.


In [10]:
atom1.vectorize(strategy = 'tfidf')
atom2.vectorize(strategy = 'tfidf')
atom3.vectorize(strategy = 'tfidf')
atom4.vectorize(strategy = 'tfidf')
atom5.vectorize(strategy = 'tfidf')
atom6.vectorize(strategy = 'tfidf')

Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...


In [11]:
atom1.available_models()

Unnamed: 0,acronym,fullname,estimator,module,needs_scaling,accepts_sparse,supports_gpu
0,Dummy,Dummy Estimator,DummyRegressor,sklearn.dummy,False,False,False
1,GP,Gaussian Process,GaussianProcessRegressor,sklearn.gaussian_process._gpr,False,False,False
2,OLS,Ordinary Least Squares,LinearRegression,sklearn.linear_model._base,True,True,True
3,Ridge,Ridge Estimator,Ridge,sklearn.linear_model._ridge,True,True,True
4,Lasso,Lasso Regression,Lasso,sklearn.linear_model._coordinate_descent,True,True,True
5,EN,ElasticNet Regression,ElasticNet,sklearn.linear_model._coordinate_descent,True,True,True
6,Lars,Least Angle Regression,Lars,sklearn.linear_model._least_angle,True,False,True
7,BR,Bayesian Ridge,BayesianRidge,sklearn.linear_model._bayes,True,False,False
8,ARD,Automatic Relevant Determination,ARDRegression,sklearn.linear_model._bayes,True,False,False
9,Huber,Huber Regression,HuberRegressor,sklearn.linear_model._huber,True,False,False


In [12]:
atom1.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom2.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom3.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom4.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom5.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom6.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')


Models: Tree, Bag, ET, RF, AdaB, GBM
Metric: neg_mean_squared_error


Results for Decision Tree:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.6657
Time elapsed: 39.289s
-------------------------------------------------
Total time: 39.289s


Results for Bagging:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0641
Test evaluation --> neg_mean_squared_error: -0.3541
Time elapsed: 42.767s
-------------------------------------------------
Total time: 42.768s


Results for Extra-Trees:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.3157
Time elapsed: 3m:25s
-------------------------------------------------
Total time: 3m:25s


Results for Random Forest:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_er

In [13]:
atom1.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.634496,-0.217529,-2.5,-0.665698,-0.042082,-0.551238,-0.815903
Bag,-0.476899,-0.166728,-2.05,-0.35414,-0.022292,0.174768,-0.595096
ET,-0.453349,-0.159193,-2.06,-0.315652,-0.02001,0.264453,-0.561829
RF,-0.458713,-0.161096,-1.95,-0.325377,-0.020611,0.241792,-0.570418
AdaB,-0.460475,-0.162417,-1.861219,-0.332299,-0.021102,0.225662,-0.576454
GBM,-0.455385,-0.159819,-1.846675,-0.317271,-0.020094,0.260682,-0.563268


In [14]:
atom2.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.605814,-0.21288,-3.0,-0.637016,-0.040737,-0.541855,-0.798133
Bag,-0.468876,-0.170171,-2.05,-0.342758,-0.022588,0.170377,-0.585455
ET,-0.445093,-0.161072,-1.955,-0.303053,-0.019955,0.26648,-0.550502
RF,-0.454585,-0.164601,-1.85,-0.314861,-0.020718,0.237899,-0.561125
AdaB,-0.465133,-0.170755,-2.137664,-0.327945,-0.02181,0.20623,-0.572665
GBM,-0.443803,-0.16082,-1.808989,-0.302865,-0.019975,0.266934,-0.550332


In [15]:
atom3.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.531395,-0.174524,-2.5,-0.492442,-0.028173,-0.475862,-0.701742
Bag,-0.410155,-0.136219,-2.0,-0.273942,-0.016107,0.178989,-0.523395
ET,-0.385213,-0.128151,-1.885,-0.243379,-0.014379,0.270588,-0.493334
RF,-0.393942,-0.130854,-1.97,-0.251689,-0.014825,0.245683,-0.501686
AdaB,-0.40327,-0.135501,-2.039683,-0.259726,-0.015396,0.221593,-0.509634
GBM,-0.389598,-0.129762,-2.011416,-0.247271,-0.014608,0.258922,-0.497264


In [16]:
atom4.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.633333,-0.215953,-2.5,-0.663566,-0.040966,-0.559207,-0.814596
Bag,-0.472868,-0.163427,-2.1,-0.335217,-0.020808,0.212327,-0.578979
ET,-0.442039,-0.153903,-1.605,-0.294372,-0.018444,0.308303,-0.54256
RF,-0.448341,-0.155201,-1.855,-0.302259,-0.018781,0.289769,-0.549781
AdaB,-0.466331,-0.161792,-1.728477,-0.328095,-0.020457,0.229061,-0.572796
GBM,-0.435729,-0.151983,-1.630755,-0.289469,-0.018199,0.319824,-0.538023


In [17]:
atom5.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.672093,-0.234408,-2.5,-0.75155,-0.047214,-0.54891,-0.86692
Bag,-0.502829,-0.176712,-2.25,-0.393452,-0.024576,0.189115,-0.627257
ET,-0.480457,-0.17009,-1.995,-0.349014,-0.021959,0.280699,-0.590774
RF,-0.482174,-0.170969,-2.205,-0.354655,-0.022309,0.269073,-0.595529
AdaB,-0.510356,-0.185681,-2.266144,-0.388533,-0.025019,0.199253,-0.623324
GBM,-0.477996,-0.169993,-2.162893,-0.351214,-0.022213,0.276165,-0.592633


In [18]:
atom6.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.653876,-0.229873,-2.5,-0.702907,-0.045173,-0.530939,-0.838395
Bag,-0.507403,-0.181801,-2.15,-0.398893,-0.02555,0.131206,-0.63158
ET,-0.475058,-0.170608,-2.21,-0.349931,-0.022526,0.237847,-0.591549
RF,-0.486512,-0.175095,-2.085,-0.366849,-0.023644,0.200999,-0.605681
AdaB,-0.494362,-0.179764,-2.096509,-0.37644,-0.024554,0.180109,-0.613547
GBM,-0.470892,-0.169299,-1.916166,-0.345297,-0.022339,0.24794,-0.58762
