# Trying a pipeline-based approach

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("/content/drive/MyDrive/B565 Data/train.csv")
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [2]:
story = []
for i in train_data['full_text']:
    story.append(i.lower().replace("\n", "").strip())

In [3]:
story = np.array(story).reshape(-1, 1)
story.shape

(3911, 1)

In [4]:
cohesion = np.array(train_data['cohesion']).astype(float)
syntax = np.array(train_data['syntax']).astype(float)
vocabulary = np.array(train_data['vocabulary']).astype(float)
phraseology = np.array(train_data['phraseology']).astype(float)
grammar = np.array(train_data['grammar']).astype(float)
conventions = np.array(train_data['conventions']).astype(float)
conventions.shape

(3911,)

In [5]:
test_data = pd.read_csv("/content/drive/MyDrive/B565 Data/test.csv")
test_data

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [6]:
story_pred = []
for i in test_data['full_text']:
    story_pred.append(i.lower().replace("\n", "").strip())
story_pred = np.array(story_pred).reshape(-1, 1)

In [7]:
from atom import ATOMRegressor

atom1 = ATOMRegressor(story, cohesion, test_size = 0.33, random_state = 42, verbose = 2)
atom2 = ATOMRegressor(story, syntax, test_size = 0.33, random_state = 42, verbose = 2)
atom3 = ATOMRegressor(story, vocabulary, test_size = 0.33, random_state = 42, verbose = 2)
atom4 = ATOMRegressor(story, phraseology, test_size = 0.33, random_state = 42, verbose = 2)
atom5 = ATOMRegressor(story, grammar, test_size = 0.33, random_state = 42, verbose = 2)
atom6 = ATOMRegressor(story, conventions, test_size = 0.33, random_state = 42, verbose = 2)

Woodwork may not support Python 3.7 in next non-bugfix release.
Featuretools may not support Python 3.7 in next non-bugfix release.


Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 6 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 23 (0.4%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 1 (0.0%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 8 (0.2%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
----------------------------------

In [8]:
atom1.dataset

Unnamed: 0,corpus,target
0,the great artist michelangelo i thinking he is...,2.0
1,one of the things i want to acumplish in the f...,2.5
2,its always good to ask people how it feels or ...,3.0
3,in order for students to have a good experienc...,3.0
4,"dear, principal,i will like to star in saying ...",3.5
...,...,...
3906,i think an enjoyable educational activity for ...,4.0
3907,some people may say that oppocite but to start...,3.5
3908,"""unles you try to do somthing beyond what you ...",2.5
3909,having activities after school are good ideas ...,4.0


In [9]:
atom1.textclean()
atom2.textclean()
atom3.textclean()
atom4.textclean()
atom5.textclean()
atom6.textclean()

Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dro

In [10]:
atom1.tokenize(bigram_freq = 100)
atom2.tokenize(bigram_freq = 100)
atom3.tokenize(bigram_freq = 100)
atom4.tokenize(bigram_freq = 100)
atom5.tokenize(bigram_freq = 100)
atom6.tokenize(bigram_freq = 100)

Tokenizing the corpus...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


 --> Creating 2215 bigrams on 721393 locations.
Tokenizing the corpus...
 --> Creating 2215 bigrams on 721393 locations.
Tokenizing the corpus...
 --> Creating 2215 bigrams on 721393 locations.
Tokenizing the corpus...
 --> Creating 2215 bigrams on 721393 locations.
Tokenizing the corpus...
 --> Creating 2215 bigrams on 721393 locations.
Tokenizing the corpus...
 --> Creating 2215 bigrams on 721393 locations.


In [11]:
atom1.vectorize(strategy = 'tfidf')
atom2.vectorize(strategy = 'tfidf')
atom3.vectorize(strategy = 'tfidf')
atom4.vectorize(strategy = 'tfidf')
atom5.vectorize(strategy = 'tfidf')
atom6.vectorize(strategy = 'tfidf')

Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...


In [12]:
atom1.available_models()

Unnamed: 0,acronym,fullname,estimator,module,needs_scaling,accepts_sparse,supports_gpu
0,Dummy,Dummy Estimator,DummyRegressor,sklearn.dummy,False,False,False
1,GP,Gaussian Process,GaussianProcessRegressor,sklearn.gaussian_process._gpr,False,False,False
2,OLS,Ordinary Least Squares,LinearRegression,sklearn.linear_model._base,True,True,True
3,Ridge,Ridge Estimator,Ridge,sklearn.linear_model._ridge,True,True,True
4,Lasso,Lasso Regression,Lasso,sklearn.linear_model._coordinate_descent,True,True,True
5,EN,ElasticNet Regression,ElasticNet,sklearn.linear_model._coordinate_descent,True,True,True
6,Lars,Least Angle Regression,Lars,sklearn.linear_model._least_angle,True,False,True
7,BR,Bayesian Ridge,BayesianRidge,sklearn.linear_model._bayes,True,False,False
8,ARD,Automatic Relevant Determination,ARDRegression,sklearn.linear_model._bayes,True,False,False
9,Huber,Huber Regression,HuberRegressor,sklearn.linear_model._huber,True,False,False


In [13]:
atom1.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom2.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom3.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom4.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom5.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom6.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')


Models: Tree, Bag, ET, RF, AdaB, GBM
Metric: neg_mean_squared_error


Results for Decision Tree:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.7118
Time elapsed: 52.835s
-------------------------------------------------
Total time: 52.835s


Results for Bagging:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0696
Test evaluation --> neg_mean_squared_error: -0.386
Time elapsed: 1m:15s
-------------------------------------------------
Total time: 1m:15s


Results for Extra-Trees:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.3309
Time elapsed: 5m:10s
-------------------------------------------------
Total time: 5m:10s


Results for Random Forest:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error

In [14]:
atom1.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.656977,-0.226396,-2.5,-0.711822,-0.044867,-0.658719,-0.843695
Bag,-0.499651,-0.174217,-2.3,-0.386041,-0.02421,0.100431,-0.621322
ET,-0.463872,-0.16318,-2.25,-0.330886,-0.021015,0.228955,-0.575227
RF,-0.468783,-0.164927,-2.08,-0.335392,-0.021297,0.218455,-0.57913
AdaB,-0.475893,-0.167095,-2.114493,-0.353652,-0.022339,0.175903,-0.594687
GBM,-0.468966,-0.164852,-2.069105,-0.33647,-0.021356,0.215942,-0.58006


In [15]:
atom2.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.64031,-0.227309,-3.0,-0.68062,-0.044389,-0.647397,-0.824997
Bag,-0.472171,-0.170387,-2.05,-0.346574,-0.022654,0.161141,-0.588705
ET,-0.452283,-0.16415,-1.915,-0.31345,-0.020674,0.241314,-0.559866
RF,-0.456841,-0.165687,-1.94,-0.319832,-0.02108,0.225867,-0.565537
AdaB,-0.476371,-0.173235,-2.34081,-0.346345,-0.02283,0.161695,-0.58851
GBM,-0.451571,-0.164613,-1.877838,-0.311704,-0.020732,0.24554,-0.558305


In [16]:
atom3.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.544574,-0.179572,-2.5,-0.538178,-0.031047,-0.612935,-0.733606
Bag,-0.408643,-0.136116,-2.0,-0.276184,-0.016253,0.172269,-0.525532
ET,-0.390411,-0.130455,-1.945,-0.249912,-0.014855,0.251006,-0.499912
RF,-0.391663,-0.130765,-1.99,-0.254211,-0.015082,0.238122,-0.504194
AdaB,-0.395676,-0.132803,-1.85,-0.262979,-0.015583,0.211845,-0.512815
GBM,-0.384742,-0.128779,-1.991632,-0.248262,-0.014765,0.255952,-0.498259


In [17]:
atom4.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.627132,-0.215074,-2.5,-0.65814,-0.041049,-0.546457,-0.811258
Bag,-0.485194,-0.16867,-1.75,-0.348112,-0.02178,0.182026,-0.590011
ET,-0.450279,-0.157414,-1.74,-0.304957,-0.019194,0.283429,-0.552229
RF,-0.45738,-0.159436,-1.675,-0.315421,-0.019765,0.258842,-0.561624
AdaB,-0.470532,-0.166456,-1.771645,-0.337122,-0.021372,0.207851,-0.580622
GBM,-0.450711,-0.157775,-1.676659,-0.307828,-0.01941,0.276684,-0.554823


In [18]:
atom5.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.69186,-0.239286,-3.0,-0.783527,-0.050449,-0.614812,-0.885171
Bag,-0.513566,-0.182283,-2.15,-0.399283,-0.025263,0.177097,-0.631888
ET,-0.491539,-0.175131,-2.085,-0.359893,-0.02286,0.258277,-0.599911
RF,-0.498194,-0.177694,-2.045,-0.371855,-0.023587,0.233625,-0.609799
AdaB,-0.531583,-0.194662,-2.245772,-0.423681,-0.027405,0.126813,-0.650908
GBM,-0.495365,-0.177314,-2.068565,-0.372287,-0.023708,0.232733,-0.610154


In [19]:
atom6.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.635271,-0.223493,-3.0,-0.675388,-0.042499,-0.471001,-0.82182
Bag,-0.510581,-0.182965,-2.2,-0.408715,-0.026171,0.109814,-0.639308
ET,-0.481868,-0.173277,-2.04,-0.36029,-0.023239,0.215286,-0.600241
RF,-0.491105,-0.17678,-2.1,-0.372926,-0.024036,0.187763,-0.610677
AdaB,-0.504676,-0.185653,-2.111927,-0.394994,-0.025911,0.139699,-0.628485
GBM,-0.48614,-0.175736,-2.1562,-0.362047,-0.023534,0.211458,-0.601703
