# Trying a pipeline-based approach

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("/content/drive/MyDrive/B565 Data/train.csv")
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [2]:
story = []
for i in train_data['full_text']:
    story.append(i.lower().replace("\n", "").strip())

In [3]:
story = np.array(story).reshape(-1, 1)
story.shape

(3911, 1)

In [4]:
cohesion = np.array(train_data['cohesion']).astype(float)
syntax = np.array(train_data['syntax']).astype(float)
vocabulary = np.array(train_data['vocabulary']).astype(float)
phraseology = np.array(train_data['phraseology']).astype(float)
grammar = np.array(train_data['grammar']).astype(float)
conventions = np.array(train_data['conventions']).astype(float)
conventions.shape

(3911,)

In [5]:
test_data = pd.read_csv("/content/drive/MyDrive/B565 Data/test.csv")
test_data

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [6]:
story_pred = []
for i in test_data['full_text']:
    story_pred.append(i.lower().replace("\n", "").strip())
story_pred = np.array(story_pred).reshape(-1, 1)

In [7]:
from atom import ATOMRegressor

atom1 = ATOMRegressor(story, cohesion, test_size = 0.33, random_state = 42, verbose = 2)
atom2 = ATOMRegressor(story, syntax, test_size = 0.33, random_state = 42, verbose = 2)
atom3 = ATOMRegressor(story, vocabulary, test_size = 0.33, random_state = 42, verbose = 2)
atom4 = ATOMRegressor(story, phraseology, test_size = 0.33, random_state = 42, verbose = 2)
atom5 = ATOMRegressor(story, grammar, test_size = 0.33, random_state = 42, verbose = 2)
atom6 = ATOMRegressor(story, conventions, test_size = 0.33, random_state = 42, verbose = 2)

Woodwork may not support Python 3.7 in next non-bugfix release.
Featuretools may not support Python 3.7 in next non-bugfix release.


Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 6 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 23 (0.4%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 1 (0.0%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 8 (0.2%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
----------------------------------

In [8]:
atom1.dataset

Unnamed: 0,corpus,target
0,the great artist michelangelo i thinking he is...,2.0
1,one of the things i want to acumplish in the f...,2.5
2,its always good to ask people how it feels or ...,3.0
3,in order for students to have a good experienc...,3.0
4,"dear, principal,i will like to star in saying ...",3.5
...,...,...
3906,i think an enjoyable educational activity for ...,4.0
3907,some people may say that oppocite but to start...,3.5
3908,"""unles you try to do somthing beyond what you ...",2.5
3909,having activities after school are good ideas ...,4.0


In [9]:
atom1.textclean()
atom2.textclean()
atom3.textclean()
atom4.textclean()
atom5.textclean()
atom6.textclean()

Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dro

In [10]:
atom1.tokenize(quadgram_freq = 100)
atom2.tokenize(quadgram_freq = 100)
atom3.tokenize(quadgram_freq = 100)
atom4.tokenize(quadgram_freq = 100)
atom5.tokenize(quadgram_freq = 100)
atom6.tokenize(quadgram_freq = 100)

Tokenizing the corpus...
 --> Creating 116 quadgrams on 17655 locations.
Tokenizing the corpus...
 --> Creating 116 quadgrams on 17655 locations.
Tokenizing the corpus...
 --> Creating 116 quadgrams on 17655 locations.
Tokenizing the corpus...
 --> Creating 116 quadgrams on 17655 locations.
Tokenizing the corpus...
 --> Creating 116 quadgrams on 17655 locations.
Tokenizing the corpus...
 --> Creating 116 quadgrams on 17655 locations.


In [11]:
atom1.vectorize(strategy = 'tfidf')
atom2.vectorize(strategy = 'tfidf')
atom3.vectorize(strategy = 'tfidf')
atom4.vectorize(strategy = 'tfidf')
atom5.vectorize(strategy = 'tfidf')
atom6.vectorize(strategy = 'tfidf')

Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...


In [12]:
atom1.available_models()

Unnamed: 0,acronym,fullname,estimator,module,needs_scaling,accepts_sparse,supports_gpu
0,Dummy,Dummy Estimator,DummyRegressor,sklearn.dummy,False,False,False
1,GP,Gaussian Process,GaussianProcessRegressor,sklearn.gaussian_process._gpr,False,False,False
2,OLS,Ordinary Least Squares,LinearRegression,sklearn.linear_model._base,True,True,True
3,Ridge,Ridge Estimator,Ridge,sklearn.linear_model._ridge,True,True,True
4,Lasso,Lasso Regression,Lasso,sklearn.linear_model._coordinate_descent,True,True,True
5,EN,ElasticNet Regression,ElasticNet,sklearn.linear_model._coordinate_descent,True,True,True
6,Lars,Least Angle Regression,Lars,sklearn.linear_model._least_angle,True,False,True
7,BR,Bayesian Ridge,BayesianRidge,sklearn.linear_model._bayes,True,False,False
8,ARD,Automatic Relevant Determination,ARDRegression,sklearn.linear_model._bayes,True,False,False
9,Huber,Huber Regression,HuberRegressor,sklearn.linear_model._huber,True,False,False


In [13]:
atom1.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom2.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom3.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom4.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom5.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom6.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')


Models: Tree, Bag, ET, RF, AdaB, GBM
Metric: neg_mean_squared_error


Results for Decision Tree:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.6816
Time elapsed: 54.045s
-------------------------------------------------
Total time: 54.045s


Results for Bagging:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0676
Test evaluation --> neg_mean_squared_error: -0.3591
Time elapsed: 1m:08s
-------------------------------------------------
Total time: 1m:08s


Results for Extra-Trees:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.3187
Time elapsed: 4m:34s
-------------------------------------------------
Total time: 4m:34s


Results for Random Forest:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_erro

In [14]:
atom1.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.642248,-0.22129,-2.5,-0.681589,-0.043112,-0.58827,-0.825584
Bag,-0.476512,-0.166987,-2.2,-0.359093,-0.022634,0.163225,-0.599244
ET,-0.455531,-0.159853,-2.075,-0.318671,-0.020169,0.257418,-0.56451
RF,-0.461895,-0.162541,-2.055,-0.330068,-0.020917,0.230861,-0.574515
AdaB,-0.464945,-0.162634,-1.964567,-0.33358,-0.021015,0.222678,-0.577563
GBM,-0.452458,-0.158997,-2.099594,-0.321129,-0.020303,0.251691,-0.566682


In [15]:
atom2.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.622868,-0.220346,-2.5,-0.662597,-0.041978,-0.603773,-0.814001
Bag,-0.46593,-0.167998,-1.95,-0.339979,-0.022194,0.177104,-0.583077
ET,-0.445806,-0.161769,-1.975,-0.308463,-0.020328,0.253386,-0.555394
RF,-0.452039,-0.16401,-2.04,-0.317035,-0.020875,0.232638,-0.563058
AdaB,-0.461868,-0.168815,-2.227877,-0.329197,-0.02181,0.203201,-0.573757
GBM,-0.448173,-0.16278,-1.997598,-0.308444,-0.020403,0.253432,-0.555377


In [16]:
atom3.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.54031,-0.176166,-2.5,-0.508527,-0.028984,-0.52407,-0.713111
Bag,-0.410194,-0.136096,-1.85,-0.272258,-0.01597,0.184036,-0.521783
ET,-0.38807,-0.12912,-1.915,-0.246279,-0.014538,0.261894,-0.496265
RF,-0.39443,-0.131071,-1.77,-0.252535,-0.014872,0.243145,-0.502529
AdaB,-0.404947,-0.135571,-2.0,-0.258255,-0.015298,0.226004,-0.508188
GBM,-0.393644,-0.130908,-2.0011,-0.250977,-0.014792,0.247814,-0.500977


In [17]:
atom4.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.617054,-0.21032,-2.5,-0.635659,-0.039499,-0.493633,-0.797282
Bag,-0.470426,-0.163265,-1.95,-0.338021,-0.021001,0.205738,-0.581396
ET,-0.444597,-0.15529,-1.69,-0.296685,-0.018661,0.302868,-0.544688
RF,-0.452663,-0.157543,-1.735,-0.310441,-0.019388,0.270544,-0.557172
AdaB,-0.466296,-0.161982,-1.703016,-0.329856,-0.020594,0.224923,-0.574331
GBM,-0.440654,-0.153773,-1.703164,-0.295322,-0.018555,0.306071,-0.543435


In [18]:
atom5.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.689922,-0.237263,-3.0,-0.796124,-0.050377,-0.640774,-0.892258
Bag,-0.511822,-0.181806,-2.25,-0.396657,-0.024999,0.182509,-0.629807
ET,-0.484097,-0.172176,-2.05,-0.352096,-0.022279,0.274348,-0.593376
RF,-0.49405,-0.175591,-2.1,-0.367269,-0.023195,0.243077,-0.606027
AdaB,-0.511815,-0.187431,-2.045949,-0.390846,-0.025302,0.194485,-0.625177
GBM,-0.481966,-0.171714,-2.066204,-0.35402,-0.022393,0.270382,-0.594996


In [19]:
atom6.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.657364,-0.232036,-3.0,-0.726357,-0.046129,-0.582012,-0.852266
Bag,-0.516783,-0.18512,-2.45,-0.413444,-0.026468,0.099515,-0.642996
ET,-0.47707,-0.171764,-2.125,-0.353041,-0.022775,0.231072,-0.594173
RF,-0.490609,-0.176414,-2.025,-0.370558,-0.023889,0.192921,-0.608735
AdaB,-0.502179,-0.184471,-1.983671,-0.381165,-0.025019,0.169819,-0.617386
GBM,-0.476274,-0.170597,-2.113812,-0.352164,-0.022658,0.232983,-0.593434
