# Tutorial de introducción a Scikit-Learn

In [1]:
class Sentiment:
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = " NEGATIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score >= 4:
            return Sentiment.POSITIVE
        else:
            return Sentiment.NEUTRAL
        

In [2]:
import json
file_name = "./data/Books_small.json"

In [3]:
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [4]:
reviews[0].sentiment

'POSITIVE'

## Preparación de los datos

In [5]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [6]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

## Vectorización 'Bag of words'

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vectorizer = CountVectorizer()

# Two steps vectoritation
# vectorizer.fit(train_x)
# train_x_vectors = vectorizer.transform(train_x)

# All in one step
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)


## Classification

#### Linear SVM

In [9]:
from sklearn import svm
clf_svm = svm.SVC(kernel="linear")

In [10]:
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [11]:
test_x[0]

"Every new Myke Cole book is better than the last, and this is no exception. If you haven't read the Shadow Ops series before start with Control Point, but go ahead and order Fortress Frontier and Breach Zone as well - you're going to want them."

In [12]:
test_x_vectors[0].shape, train_x_vectors[0].shape

((1, 7372), (1, 7372))

In [13]:
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U9')

### Decision tree

In [14]:
from sklearn import tree
clf_tree = tree.DecisionTreeClassifier()
clf_tree.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [15]:
clf_tree.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U9')

### Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
clf_nbg = GaussianNB()
clf_nbg.fit(train_x_vectors.toarray(), train_y)

GaussianNB()

In [17]:
clf_nbg.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U9')

### Logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression
clf_lgr = LogisticRegression()
clf_lgr.fit(train_x_vectors, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [19]:
clf_lgr.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U9')

### Model evaluation

#### Accuracy

In [20]:
print("SVM", clf_svm.score(test_x_vectors, test_y))
print("TREE", clf_tree.score(test_x_vectors, test_y))
print("NBG", clf_nbg.score(test_x_vectors.toarray(), test_y))
print("LGR", clf_lgr.score(test_x_vectors, test_y))

SVM 0.8242424242424242
TREE 0.7575757575757576
NBG 0.8121212121212121
LGR 0.8303030303030303


#### F1 score

In [21]:
test_pred_svm = clf_svm.predict(test_x_vectors)
test_pred_tree = clf_tree.predict(test_x_vectors)
test_pred_nbg = clf_nbg.predict(test_x_vectors.toarray())
test_pred_lgr = clf_lgr.predict(test_x_vectors)

In [22]:
from sklearn.metrics import f1_score

print("F1 SVM", f1_score(test_y, test_pred_svm, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F1 TREE", f1_score(test_y, test_pred_tree, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F1 NBG", f1_score(test_y, test_pred_nbg, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F1 LGR", f1_score(test_y, test_pred_lgr, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

F1 SVM [0.91319444 0.21052632 0.22222222]
F1 TREE [0.86725664 0.14705882 0.        ]
F1 NBG [0.89678511 0.08510638 0.09090909]
F1 LGR [0.91370558 0.12244898 0.1       ]


### Try with a bigger dataset

In [23]:
# Prepare new data (2)
import json
file_name2 = "./data/Books_small_10000.json"
reviews2 = []
with open(file_name2) as f:
    for line in f:
        review = json.loads(line)
        reviews2.append(Review(review['reviewText'], review['overall']))

from sklearn.model_selection import train_test_split
training2, test2 = train_test_split(reviews, test_size=0.33, random_state=42)

train2_x = [x.text for x in training2]
train2_y = [x.sentiment for x in training2]

test2_x = [x.text for x in test2]
test2_y = [x.sentiment for x in test2]

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

vectorizer.fit(train_x)
train2_x_vectors = vectorizer.transform(train2_x)
test2_x_vectors = vectorizer.transform(test2_x)


In [24]:
# Buildind models (2)
from sklearn import svm
clf2_svm = svm.SVC(kernel="linear")
clf2_svm.fit(train2_x_vectors, train2_y)

from sklearn import tree
clf2_tree = tree.DecisionTreeClassifier()
clf2_tree.fit(train2_x_vectors, train2_y)

from sklearn.naive_bayes import GaussianNB
clf2_nbg = GaussianNB()
clf2_nbg.fit(train2_x_vectors.toarray(), train2_y)

from sklearn.linear_model import LogisticRegression
clf2_lgr = LogisticRegression()
clf2_lgr.fit(train2_x_vectors, train2_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [25]:
# Evaluate accuracy
print("SVM2", clf2_svm.score(test2_x_vectors, test2_y))
print("TREE2", clf2_tree.score(test2_x_vectors, test2_y))
print("NBG2", clf2_nbg.score(test2_x_vectors.toarray(), test2_y))
print("LGR2", clf2_lgr.score(test2_x_vectors, test2_y))

SVM2 0.8242424242424242
TREE2 0.7575757575757576
NBG2 0.8121212121212121
LGR2 0.8303030303030303


In [26]:
# F1 score
test2_pred_svm = clf2_svm.predict(test2_x_vectors)
test2_pred_tree = clf2_tree.predict(test2_x_vectors)
test2_pred_nbg = clf2_nbg.predict(test2_x_vectors.toarray())
test2_pred_lgr = clf2_lgr.predict(test2_x_vectors)

from sklearn.metrics import f1_score

print("F1 SVM2", f1_score(test2_y, test2_pred_svm, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F1 TREE2", f1_score(test2_y, test2_pred_tree, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F1 NBG2", f1_score(test2_y, test2_pred_nbg, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F1 LGR2", f1_score(test2_y, test2_pred_lgr, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

F1 SVM2 [0.91319444 0.21052632 0.22222222]
F1 TREE2 [0.86772487 0.12903226 0.        ]
F1 NBG2 [0.89678511 0.08510638 0.09090909]
F1 LGR2 [0.91370558 0.12244898 0.1       ]


### Another way to vectorize text
This time taking into account word frecuency. The most a word appears in texts, the less important it is

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer3 = TfidfVectorizer()

vectorizer3.fit(train_x)
train3_x_vectors = vectorizer.transform(train2_x)
test3_x_vectors = vectorizer.transform(test2_x)

In [28]:
train3_x_vectors

<670x7372 sparse matrix of type '<class 'numpy.int64'>'
	with 41455 stored elements in Compressed Sparse Row format>

In [32]:
# Buildind models (3)
from sklearn import svm
clf3_svm = svm.SVC(kernel="linear")
clf3_svm.fit(train3_x_vectors, train2_y)

from sklearn import tree
clf3_tree = tree.DecisionTreeClassifier()
clf3_tree.fit(train3_x_vectors, train2_y)

from sklearn.naive_bayes import GaussianNB
clf3_nbg = GaussianNB()
clf3_nbg.fit(train3_x_vectors.toarray(), train2_y)

from sklearn.linear_model import LogisticRegression
clf3_lgr = LogisticRegression()
clf3_lgr.fit(train3_x_vectors, train2_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [33]:
# Evaluate accuracy
print("SVM2", clf3_svm.score(test3_x_vectors, test2_y))
print("TREE2", clf3_tree.score(test3_x_vectors, test2_y))
print("NBG2", clf3_nbg.score(test3_x_vectors.toarray(), test2_y))
print("LGR2", clf3_lgr.score(test3_x_vectors, test2_y))

SVM2 0.8242424242424242
TREE2 0.7787878787878788
NBG2 0.8121212121212121
LGR2 0.8303030303030303


In [34]:
# F1 score
test3_pred_svm = clf3_svm.predict(test3_x_vectors)
test3_pred_tree = clf3_tree.predict(test3_x_vectors)
test3_pred_nbg = clf3_nbg.predict(test3_x_vectors.toarray())
test3_pred_lgr = clf3_lgr.predict(test3_x_vectors)

from sklearn.metrics import f1_score

print("F3 SVM2", f1_score(test2_y, test3_pred_svm, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F3 TREE2", f1_score(test2_y, test3_pred_tree, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F3 NBG2", f1_score(test2_y, test3_pred_nbg, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("F3 LGR2", f1_score(test2_y, test3_pred_lgr, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

F3 SVM2 [0.91319444 0.21052632 0.22222222]
F3 TREE2 [0.87609075 0.16666667 0.07407407]
F3 NBG2 [0.89678511 0.08510638 0.09090909]
F3 LGR2 [0.91370558 0.12244898 0.1       ]


### Tuning the models (Grid Search)

In [41]:
from sklearn.model_selection import GridSearchCV
parameters = { "kernel": ["linear", "rbf"], "C": [1,4,8,16,32]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']})

In [46]:
clf.cv_results_

{'mean_fit_time': array([0.1005383 , 0.15210547, 0.09614034, 0.15190659, 0.09634075,
        0.18328671, 0.0989388 , 0.19427948, 0.09654088, 0.18728409]),
 'std_fit_time': array([0.00679542, 0.00749027, 0.00386576, 0.01307642, 0.00279737,
        0.02459818, 0.00428597, 0.00877153, 0.00454031, 0.00633983]),
 'mean_score_time': array([0.01838851, 0.02658391, 0.01798887, 0.02638311, 0.01838846,
        0.02678318, 0.01858792, 0.02698388, 0.01858768, 0.02558455]),
 'std_score_time': array([0.00162379, 0.00080007, 0.00167266, 0.00048895, 0.0010191 ,
        0.00097917, 0.00135556, 0.00063264, 0.00233082, 0.00048918]),
 'param_C': masked_array(data=[1, 1, 4, 4, 8, 8, 16, 16, 32, 32],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                    'linear', 'rbf', 'linear', 'rbf'],
       

In [45]:
clf.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(),
 'iid': 'deprecated',
 'n_jobs': None,
 'param_grid': {'kernel': ['linear', 'rbf'], 'C': [1, 4, 8, 16, 32]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [47]:
print("SVM T", clf_svm.score(test_x_vectors, test_y))
print("F1 SVM T", f1_score(test_y, test_pred_svm, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

SVM T 0.8242424242424242
F1 SVM T [0.91319444 0.21052632 0.22222222]


### Saving the model

In [48]:
import pickle

In [51]:
with open("./models/sentiment_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

In [52]:
with open("./my_notebooks1/models/sentiment_classifier.pkl", "rb") as f:
    loaded_clf = pickle.load(f)

In [53]:
loaded_clf.predict(test_x_vectors)

array(['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITI