# How to add more features

In [5]:
import csv
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer


In [2]:
path = 'hatespeech_text_label_vote_RESTRICTED_100K.csv'
tweets = []
labels = []
with open(path) as fi:
    data = csv.reader(fi, delimiter='\t')
    for row in data:
        tweets.append(row[0])
        labels.append(row[1]) 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2, random_state=42)


In [7]:
mypipe = Pipeline([('vec', CountVectorizer()),
                  ('clf',MultinomialNB())])
mypipe.fit(X_train, y_train)


Pipeline(steps=[('vec', CountVectorizer()), ('clf', MultinomialNB())])

In [8]:
# in this cell, we use the same pipeline as in the cell before
# but we add two extra features ('columns' to the matrix of independent variables)
# namely the length of the text in characters and the length in words
mypipe2 = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', FunctionTransformer(lambda x: [[len(e), len(e.split())] for e in x])),
             ('text_features', CountVectorizer())
            ])),
    ('clf', MultinomialNB())])
mypipe2.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 FunctionTransformer(func=<function <lambda> at 0x7feecf6a33a0>)),
                                                ('text_features',
                                                 CountVectorizer())])),
                ('clf', MultinomialNB())])

In [10]:
# for illustration

ft = FunctionTransformer(lambda x: [[len(line), len(line.split())] for line in x])
ft.transform(X_train)

[[100, 12],
 [74, 6],
 [137, 22],
 [117, 18],
 [106, 14],
 [140, 18],
 [130, 21],
 [139, 18],
 [102, 11],
 [140, 22],
 [124, 18],
 [122, 22],
 [136, 16],
 [119, 16],
 [140, 20],
 [137, 18],
 [106, 9],
 [118, 14],
 [78, 14],
 [95, 17],
 [127, 14],
 [100, 11],
 [112, 27],
 [105, 16],
 [140, 22],
 [99, 14],
 [127, 19],
 [138, 21],
 [133, 20],
 [112, 11],
 [120, 19],
 [106, 18],
 [125, 20],
 [140, 28],
 [119, 15],
 [66, 10],
 [140, 18],
 [112, 20],
 [137, 19],
 [110, 11],
 [102, 16],
 [158, 27],
 [136, 23],
 [75, 11],
 [104, 15],
 [132, 17],
 [144, 25],
 [139, 26],
 [130, 22],
 [110, 15],
 [100, 14],
 [99, 14],
 [138, 26],
 [133, 18],
 [137, 22],
 [121, 16],
 [133, 21],
 [137, 22],
 [139, 17],
 [105, 14],
 [140, 23],
 [134, 21],
 [144, 22],
 [59, 7],
 [137, 15],
 [138, 20],
 [99, 14],
 [110, 15],
 [134, 21],
 [95, 13],
 [138, 13],
 [139, 26],
 [148, 26],
 [139, 21],
 [121, 15],
 [128, 17],
 [140, 16],
 [140, 20],
 [113, 15],
 [56, 7],
 [133, 15],
 [140, 18],
 [124, 14],
 [125, 15],
 [63, 8