In [1]:
# import libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
import pandas as pd

In [2]:
# text and numeric classes that use sklearn base libaries
class TextTransformer(BaseEstimator, TransformerMixin):
    """
    Transform text features
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberTransformer(BaseEstimator, TransformerMixin):
    """
    Transform numeric features
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [3]:
# read in your dataframe
df = pd.read_csv('/Users/MatthewPrzybyla/Downloads/NLP/nlp-example/data/DummyDataNLP.csv')

In [4]:
# take a look at the first 5 observations
df.head()

Unnamed: 0,Confirmed_Test,Confirmed_Recovery,Confirmed_New,Text_Feature,Text_Predictor
0,36.434962,82.983704,34.824456,Blue,Label_1
1,75.349163,81.735731,71.921676,Red,Label_2
2,76.678489,33.588094,14.124835,Yellow,Label_3
3,73.356221,79.441778,56.910324,Orange,Label_1
4,74.451946,59.228624,20.042376,Blue,Label_2


In [5]:
# use the term-frequency inverse document frequency vectorizer to transfrom count of text
# into a weighed matrix of term importance
vec_tdidf = TfidfVectorizer(ngram_range=(1,1), analyzer='word', norm='l2')

In [6]:
# compile both the TextTransformer and TfidfVectorizer 
# to the text 'Text_Feature' 
color_text = Pipeline([
                ('transformer', TextTransformer(key='Text_Feature')),
                ('vectorizer', vec_tdidf)
                ])

In [7]:
# compile the NumberTransformer to 'Confirmed_Test', 'Confirmed_Recovery', 
# and 'Confirmed_New' numeric features
test_numeric = Pipeline([
                ('transformer', NumberTransformer(key='Confirmed_Test')),
                ])
recovery_numeric = Pipeline([
                ('transformer', NumberTransformer(key='Confirmed_Recovery')),
                ])
new_numeric = Pipeline([
                ('transformer', NumberTransformer(key='Confirmed_New')),
                ])

In [8]:
# combine all of the features, text and numeric together
features = FeatureUnion([('Text_Feature', color_text),
                      ('Confirmed_Test', test_numeric),
                      ('Confirmed_Recovery', recovery_numeric),
                      ('Confirmed_New', new_numeric)
                      ])

In [9]:
# create the classfier from RF
clf = RandomForestClassifier()

In [10]:
# unite the features and classfier together
pipe = Pipeline([('features', features),
                 ('clf',clf)
                 ])

In [11]:
# transform the categorical predictor into numeric
predicted_dummies = pd.get_dummies(df['Text_Predictor'])

In [12]:
# split the data into train and test
# isolate the features from the predicted field
text_numeric_features = ['Text_Feature', 'Confirmed_Test', 'Confirmed_Recovery', 'Confirmed_New']
predictor = 'Text_Predictor'

X_train, X_test, y_train, y_test = train_test_split(df[text_numeric_features], df[predictor], 
                                                    test_size=0.25, random_state=42)

In [13]:
# fit the model
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('Text_Feature', Pipeline(memory=None,
     steps=[('transformer', TextTransformer(key='Text_Feature')), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [14]:
# predict from the test set
preds = pipe.predict(X_test)

In [15]:
# see how you did, since this is randomly generated data,
# I would say this accuracy is pretty good :D
print("Accuracy:",metrics.accuracy_score(y_test, preds))

Accuracy: 0.47368421052631576
