# Trying a pipeline-based approach

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("/content/drive/MyDrive/B565 Data/train.csv")
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [2]:
story = []
for i in train_data['full_text']:
    story.append(i.lower().replace("\n", "").strip())

In [3]:
story = np.array(story).reshape(-1, 1)
story.shape

(3911, 1)

In [4]:
cohesion = np.array(train_data['cohesion']).astype(float)
syntax = np.array(train_data['syntax']).astype(float)
vocabulary = np.array(train_data['vocabulary']).astype(float)
phraseology = np.array(train_data['phraseology']).astype(float)
grammar = np.array(train_data['grammar']).astype(float)
conventions = np.array(train_data['conventions']).astype(float)
conventions.shape

(3911,)

In [5]:
test_data = pd.read_csv("/content/drive/MyDrive/B565 Data/test.csv")
test_data

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [6]:
story_pred = []
for i in test_data['full_text']:
    story_pred.append(i.lower().replace("\n", "").strip())
story_pred = np.array(story_pred).reshape(-1, 1)

In [7]:
from atom import ATOMRegressor

atom1 = ATOMRegressor(story, cohesion, test_size = 0.33, random_state = 42, verbose = 2)
atom2 = ATOMRegressor(story, syntax, test_size = 0.33, random_state = 42, verbose = 2)
atom3 = ATOMRegressor(story, vocabulary, test_size = 0.33, random_state = 42, verbose = 2)
atom4 = ATOMRegressor(story, phraseology, test_size = 0.33, random_state = 42, verbose = 2)
atom5 = ATOMRegressor(story, grammar, test_size = 0.33, random_state = 42, verbose = 2)
atom6 = ATOMRegressor(story, conventions, test_size = 0.33, random_state = 42, verbose = 2)

Woodwork may not support Python 3.7 in next non-bugfix release.
Featuretools may not support Python 3.7 in next non-bugfix release.


Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 6 (0.1%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 23 (0.4%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 1 (0.0%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
Outlier values: 8 (0.2%)
-------------------------------------
Train set size: 2621
Test set size: 1290

Algorithm task: regression.

Shape: (3911, 2)
Memory: 9.21 MB
Scaled: False
Categorical features: 1 (100.0%)
----------------------------------

In [8]:
atom1.dataset

Unnamed: 0,corpus,target
0,the great artist michelangelo i thinking he is...,2.0
1,one of the things i want to acumplish in the f...,2.5
2,its always good to ask people how it feels or ...,3.0
3,in order for students to have a good experienc...,3.0
4,"dear, principal,i will like to star in saying ...",3.5
...,...,...
3906,i think an enjoyable educational activity for ...,4.0
3907,some people may say that oppocite but to start...,3.5
3908,"""unles you try to do somthing beyond what you ...",2.5
3909,having activities after school are good ideas ...,4.0


In [9]:
atom1.textclean()
atom2.textclean()
atom3.textclean()
atom4.textclean()
atom5.textclean()
atom6.textclean()

Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dropping 2847 numbers from 994 documents.
 --> Dropping punctuation from the text.
Cleaning the corpus...
 --> Decoding unicode characters to ascii.
 --> Converting text to lower case.
 --> Dropping 0 emails from 0 documents.
 --> Dropping 0 URL links from 0 documents.
 --> Dropping 0 HTML tags from 0 documents.
 --> Dropping 3 emojis from 3 documents.
 --> Dro

In [10]:
atom1.tokenize(trigram_freq = 100)
atom2.tokenize(trigram_freq = 100)
atom3.tokenize(trigram_freq = 100)
atom4.tokenize(trigram_freq = 100)
atom5.tokenize(trigram_freq = 100)
atom6.tokenize(trigram_freq = 100)

Tokenizing the corpus...
 --> Creating 656 trigrams on 126742 locations.
Tokenizing the corpus...
 --> Creating 656 trigrams on 126742 locations.
Tokenizing the corpus...
 --> Creating 656 trigrams on 126742 locations.
Tokenizing the corpus...
 --> Creating 656 trigrams on 126742 locations.
Tokenizing the corpus...
 --> Creating 656 trigrams on 126742 locations.
Tokenizing the corpus...
 --> Creating 656 trigrams on 126742 locations.


In [11]:
atom1.vectorize(strategy = 'tfidf')
atom2.vectorize(strategy = 'tfidf')
atom3.vectorize(strategy = 'tfidf')
atom4.vectorize(strategy = 'tfidf')
atom5.vectorize(strategy = 'tfidf')
atom6.vectorize(strategy = 'tfidf')

Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...
Fitting Vectorizer...
Vectorizing the corpus...


In [12]:
atom1.available_models()

Unnamed: 0,acronym,fullname,estimator,module,needs_scaling,accepts_sparse,supports_gpu
0,Dummy,Dummy Estimator,DummyRegressor,sklearn.dummy,False,False,False
1,GP,Gaussian Process,GaussianProcessRegressor,sklearn.gaussian_process._gpr,False,False,False
2,OLS,Ordinary Least Squares,LinearRegression,sklearn.linear_model._base,True,True,True
3,Ridge,Ridge Estimator,Ridge,sklearn.linear_model._ridge,True,True,True
4,Lasso,Lasso Regression,Lasso,sklearn.linear_model._coordinate_descent,True,True,True
5,EN,ElasticNet Regression,ElasticNet,sklearn.linear_model._coordinate_descent,True,True,True
6,Lars,Least Angle Regression,Lars,sklearn.linear_model._least_angle,True,False,True
7,BR,Bayesian Ridge,BayesianRidge,sklearn.linear_model._bayes,True,False,False
8,ARD,Automatic Relevant Determination,ARDRegression,sklearn.linear_model._bayes,True,False,False
9,Huber,Huber Regression,HuberRegressor,sklearn.linear_model._huber,True,False,False


In [13]:
atom1.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom2.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom3.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom4.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom5.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')
atom6.run(models = ['Tree', 'Bag', 'ET', 'RF', 'AdaB', 'GBM'], metric = 'mse')


Models: Tree, Bag, ET, RF, AdaB, GBM
Metric: neg_mean_squared_error


Results for Decision Tree:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.6934
Time elapsed: 49.444s
-------------------------------------------------
Total time: 49.445s


Results for Bagging:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0662
Test evaluation --> neg_mean_squared_error: -0.3602
Time elapsed: 1m:02s
-------------------------------------------------
Total time: 1m:02s


Results for Extra-Trees:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_error: -0.0
Test evaluation --> neg_mean_squared_error: -0.3241
Time elapsed: 4m:14s
-------------------------------------------------
Total time: 4m:14s


Results for Random Forest:
Fit ---------------------------------------------
Train evaluation --> neg_mean_squared_erro

In [14]:
atom1.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.637984,-0.220779,-2.5,-0.693411,-0.043665,-0.615817,-0.832713
Bag,-0.479031,-0.167406,-2.25,-0.360242,-0.022599,0.160547,-0.600202
ET,-0.457934,-0.161382,-2.085,-0.324134,-0.020591,0.244689,-0.569327
RF,-0.463736,-0.163659,-2.19,-0.33445,-0.021275,0.22065,-0.578317
AdaB,-0.471271,-0.164846,-2.019115,-0.342743,-0.021675,0.201324,-0.585443
GBM,-0.459329,-0.161239,-2.086885,-0.327032,-0.020664,0.237935,-0.571867


In [15]:
atom2.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.613953,-0.217285,-2.5,-0.629845,-0.040838,-0.524499,-0.793628
Bag,-0.47407,-0.172286,-2.2,-0.353324,-0.023263,0.144803,-0.59441
ET,-0.443721,-0.16103,-2.055,-0.304722,-0.020123,0.26244,-0.552016
RF,-0.452539,-0.164393,-2.11,-0.318894,-0.02104,0.228137,-0.564707
AdaB,-0.465464,-0.172347,-2.239726,-0.331997,-0.022212,0.196423,-0.576192
GBM,-0.449288,-0.163338,-1.93107,-0.311048,-0.020573,0.247127,-0.557717


In [16]:
atom3.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.546512,-0.17851,-3.0,-0.522093,-0.029671,-0.564727,-0.72256
Bag,-0.412636,-0.136439,-1.9,-0.273419,-0.015997,0.180557,-0.522894
ET,-0.387589,-0.129115,-2.135,-0.246714,-0.014606,0.26059,-0.496704
RF,-0.395818,-0.131789,-2.01,-0.252621,-0.014929,0.242889,-0.502614
AdaB,-0.408938,-0.137319,-2.25734,-0.264302,-0.015655,0.207879,-0.514103
GBM,-0.393043,-0.13082,-2.070104,-0.251883,-0.014857,0.2451,-0.501879


In [17]:
atom4.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.597287,-0.204036,-2.5,-0.612984,-0.037986,-0.440354,-0.782933
Bag,-0.46845,-0.163319,-1.95,-0.335345,-0.020951,0.212027,-0.57909
ET,-0.445225,-0.15572,-1.88,-0.300871,-0.018942,0.293031,-0.548517
RF,-0.451981,-0.157866,-1.895,-0.308339,-0.019374,0.275484,-0.555283
AdaB,-0.470635,-0.164432,-1.766012,-0.330001,-0.020715,0.224583,-0.574457
GBM,-0.445926,-0.156057,-1.895956,-0.300843,-0.018959,0.293098,-0.548491


In [18]:
atom5.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.666279,-0.232862,-3.0,-0.759496,-0.047588,-0.565285,-0.871491
Bag,-0.511628,-0.181749,-1.9,-0.399535,-0.025207,0.176578,-0.632088
ET,-0.488841,-0.174119,-1.985,-0.357855,-0.022673,0.262477,-0.59821
RF,-0.494457,-0.176062,-1.99,-0.367926,-0.023269,0.241722,-0.606569
AdaB,-0.511865,-0.186979,-2.173225,-0.39101,-0.02532,0.194147,-0.625308
GBM,-0.48429,-0.172999,-1.974664,-0.359455,-0.0228,0.259181,-0.599546


In [19]:
atom6.evaluate()

Unnamed: 0,neg_mean_absolute_error,neg_mean_absolute_percentage_error,max_error,neg_mean_squared_error,neg_mean_squared_log_error,r2,neg_root_mean_squared_error
Tree,-0.687209,-0.242399,-3.5,-0.785853,-0.050304,-0.711595,-0.886483
Bag,-0.501085,-0.179308,-2.3,-0.392298,-0.025155,0.14557,-0.626337
ET,-0.478957,-0.172323,-2.085,-0.353561,-0.022819,0.229941,-0.59461
RF,-0.485682,-0.174968,-1.975,-0.366404,-0.023694,0.201969,-0.605313
AdaB,-0.512989,-0.189037,-2.166667,-0.396912,-0.02605,0.135521,-0.63001
GBM,-0.471932,-0.169831,-2.110183,-0.347094,-0.022444,0.244027,-0.589147
