In [56]:
from imblearn.pipeline import Pipeline

In [57]:
from sklearn.datasets import load_breast_cancer
# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from imblearn.pipeline import Pipeline, make_pipeline

from imblearn import FunctionSampler

from sklearn.metrics import f1_score

# Trying it on my data

## Loading Data

In [58]:
# Load the data
df_for_pipelining = pd.read_pickle("objects/crafted_features_df.pkl")

# Create separate dataframes for each group
df_for_pipelining_train = df_for_pipelining.loc[df_for_pipelining['policy_type'] == 'TRAINING' ].copy()
df_for_pipelining_train.reset_index(inplace=True, drop=True)

for dataframe in [df_for_pipelining_train]:
    dataframe.drop(columns=['source_policy_number', 'policy_type', 'contains_synthetic',
           'policy_segment_id', 'annotations', 'sentences'], inplace=True)

In [59]:
clean_annotation_features = pd.read_pickle("objects/clean_annotation_features.pkl")

## Pipeline

### Issue with SF'ing causing too few + or - support with some classifiers.  Looking at the numbers.

In [60]:
list_of_18_classifiers = ['Contact', 'Contact_E_Mail_Address', 'Contact_Phone_Number', 
                       'Identifier_Cookie_or_similar_Tech', 'Identifier_Device_ID', 'Identifier_IMEI',
                        'Identifier_MAC', 'Identifier_Mobile_Carrier',
                        'Location', 'Location_Cell_Tower', 'Location_GPS', 'Location_WiFi',
                        'SSO', 'Facebook_SSO',
                        '1st_party', '3rd_party',
                        'PERFORMED', 'NOT_PERFORMED']

In [61]:
def sentence_filtering(X, y):
        # filter y
    y2 = y.loc[sf_filter].copy()
    y2.reset_index(inplace=True, drop=True)
    
        # filter X
    X2 = X.loc[sf_filter].copy()
    X2.reset_index(inplace=True, drop=True)
    
    return X2, y2

support_after_sf_table = pd.DataFrame({"Whole dataset support":["tbd"], "Positive support":["tbd"], "Negative support":["tbd"]}, index=["Contact"])

for classifier in list_of_18_classifiers:
    
    #Separate into x and y
    y = df_for_pipelining_train[classifier]
    X = pd.concat([df_for_pipelining_train['segment_text'],df_for_pipelining_train.loc[:,'contact info':]], axis=1).copy()

    # filtering the table to get the list object from the same row that lists the classifier
    classifier_features = clean_annotation_features[ clean_annotation_features['annotation'] == classifier ]     \
                            .reset_index().at[0,'features']
    
     # true/false boolean for sentence filtering:
    sf_filter = ((X[classifier_features] > 0)\
                 .sum(axis=1) > 0 )
    # display(sf_filter)
    
    #total + support in whole dataset (w/o SF'ing)
    support_after_sf_table.loc[classifier, "Whole dataset support"] = y.value_counts()[1]

    X, y = sentence_filtering(X, y)
    
    support_after_sf_table.loc[classifier, "Positive support"] = y.value_counts().get(1, 0)
    support_after_sf_table.loc[classifier, "Negative support"] = y.value_counts().get(0, 0)

support_after_sf_table

Unnamed: 0,Whole dataset support,Positive support,Negative support
Contact,128,115,251
Contact_E_Mail_Address,662,496,61
Contact_Phone_Number,346,269,218
Identifier_Cookie_or_similar_Tech,596,558,121
Identifier_Device_ID,332,251,20
Identifier_IMEI,50,43,0
Identifier_MAC,85,78,62
Identifier_Mobile_Carrier,60,39,30
Location,537,462,216
Location_Cell_Tower,89,62,12


It can be seen that with Sentence Filtering, some of the targets have too little positive or negative support. It's likely that those models wouldn't be selected as the best ones by the grid search.

I also infer that Story et al. didn't use the same sentence filtering process as I have done, since their table 1 on page 4 [(link to paper)](https://usableprivacy.org/static/files/story_pal_2019.pdf) shows sentence filtering was helpful for training a model to find "Identifier_IMEI", but with my process I won't be able to train a model at all (no negative support)

## Alternative filtering process for when either + support or - support is below 100 after SF'ing

Using an arbirary cut off of 75. In further work I would like to study this more to find a more rigorously definted cut off.

In [62]:
def sentence_filtering(X, y, df_filter=sf_filter):
    """
    Filter the X and y data using Sentence Filtering, 
    or if this leaves too few data, filter using balanced downsize filtering.
    
    Inputs: 
        X: X data
        y: y data
        df_filter:  a filter (boolean series) to use to filter the data. 
                    Intended to be sf_filter (sentence filtering)
                    or balanced_downzise_filter (all the positive cases plus an equally sized random sample of negative cases)
    Outputs:
        X2: filtered X data
        y2: filtered y data
    
    """
    
        # filter y
    y2 = y.loc[df_filter].copy()
    y2.reset_index(inplace=True, drop=True)
    
        # filter X
    X2 = X.loc[df_filter].copy()
    X2.reset_index(inplace=True, drop=True)
    
        # check whether this sentence filtering leaves enough data (arbitrary > 75)
        # if not, use balanced downsizing instead:
    if df_filter.equals(sf_filter) & (
        ( y2.value_counts().get(1, 0) < 75 ) or ( y2.value_counts().get(0, 0) < 75 )
    ):
        X2, y2 = sentence_filtering(X, y, df_filter=balanced_downzise_filter)
    
    return X2, y2

In [63]:
support_after_sf_table = pd.DataFrame({"Whole dataset support":["tbd"], "Positive support":["tbd"], "Negative support":["tbd"]}, index=["Contact"])

for classifier in list_of_18_classifiers:
    
    #Separate into x and y
    y = df_for_pipelining_train[classifier]
    X = pd.concat([df_for_pipelining_train['segment_text'],df_for_pipelining_train.loc[:,'contact info':]], axis=1).copy()

    # filtering the table to get the list object from the same row that lists the classifier
    classifier_features = clean_annotation_features[ clean_annotation_features['annotation'] == classifier ]     \
                            .reset_index().at[0,'features']
    
     # true/false boolean for sentence filtering:
    sf_filter = ((X[classifier_features] > 0)\
                 .sum(axis=1) > 0 )
    
    # true/false boolean for balanced downsizing filter:
    positive_rows = (y == 1)
    negative_rows = (y == 0)
    balanced_downzise_filter = (
        positive_rows |
        negative_rows.where(negative_rows == True).dropna().sample(n=positive_rows.sum(), replace=False)
    )
    
    # total + support in whole dataset (w/o SF'ing)
    support_after_sf_table.loc[classifier, "Whole dataset support"] = y.value_counts()[1]

    X, y = sentence_filtering(X, y, sf_filter)
    
    support_after_sf_table.loc[classifier, "Positive support"] = y.value_counts().get(1, 0)
    support_after_sf_table.loc[classifier, "Negative support"] = y.value_counts().get(0, 0)
    
support_after_sf_table

Unnamed: 0,Whole dataset support,Positive support,Negative support
Contact,128,115,251
Contact_E_Mail_Address,662,662,662
Contact_Phone_Number,346,269,218
Identifier_Cookie_or_similar_Tech,596,558,121
Identifier_Device_ID,332,332,332
Identifier_IMEI,50,50,50
Identifier_MAC,85,85,85
Identifier_Mobile_Carrier,60,60,60
Location,537,462,216
Location_Cell_Tower,89,89,89


These figures look more reasonable. I will take this amendment to the Sentence Filtering process forwards to my pipeline.

## Full pipeline:

In [64]:
classifier = "Identifier_IMEI"

In [65]:
#Separate into x and y
y = df_for_pipelining_train[classifier]
X = pd.concat([df_for_pipelining_train['segment_text'],df_for_pipelining_train.loc[:,'contact info':]], axis=1).copy()

In [66]:
# filtering the table to get the list object from the same row that lists the classifier
classifier_features = clean_annotation_features[ clean_annotation_features['annotation'] == classifier ]     \
                        .reset_index().at[0,'features']

classifier_features

['imei', 'international mobile equipment', 'equipment id']

Doing tfidf with ColumnTransformer:

In [67]:
tfidf_unigrams = TfidfVectorizer(ngram_range=(1,1), stop_words='english', binary=True)
tfidf_withbigrams = TfidfVectorizer(ngram_range=(1,2), stop_words='english', binary=True)

# Create the column transformations list with columns to apply to
col_transform_unigrams = [('unigrams_only', tfidf_unigrams, 'segment_text')]
col_transform_withbigrams = [('with_bigrams', tfidf_withbigrams, 'segment_text')]

Imblearn pipeline:

In [68]:
estimator = [
    ('sentence_filtering', FunctionSampler(func=sentence_filtering, validate=False)),
    ('tfidf', ColumnTransformer(col_transform_withbigrams, remainder='passthrough')), 
    ('model', LogisticRegression(random_state=1, max_iter=1000))
]
     
pipe = Pipeline(estimator)


param_grid = [
    {
        'model': [LogisticRegression(random_state=1, max_iter=1000)],
        'sentence_filtering': [FunctionSampler(func=sentence_filtering, validate=False), None],
        'tfidf': [ColumnTransformer(col_transform_withbigrams, remainder='passthrough'),
                  ColumnTransformer(col_transform_unigrams, remainder='passthrough')]
    },
    {
        'model': [SVC(kernel='linear', class_weight='balanced', random_state=1)],
        'model__C': [0.1, 1, 10],
        'sentence_filtering': [FunctionSampler(func=sentence_filtering, validate=False), None],
        'tfidf': [ColumnTransformer(col_transform_withbigrams, remainder='passthrough'),
                  ColumnTransformer(col_transform_unigrams, remainder='passthrough')]
    }
]
grid_search_object = GridSearchCV(estimator=pipe, param_grid=param_grid, cv = 5, verbose=1, n_jobs=-1, scoring='f1')

In [69]:
# %%time
fitted_search = grid_search_object.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


Results:

In [70]:
from sklearn.metrics import classification_report
classifier_prediction = fitted_search.predict(X)
print(classification_report(y, classifier_prediction))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8018
           1       0.98      1.00      0.99        50

    accuracy                           1.00      8068
   macro avg       0.99      1.00      1.00      8068
weighted avg       1.00      1.00      1.00      8068



In [71]:
# fitted_search.cv_results_
display(fitted_search.best_estimator_)

861, 319

955, 774

In [72]:
list_of_18_classifiers = ['Contact', 'Contact_E_Mail_Address', 'Contact_Phone_Number', 
                       'Identifier_Cookie_or_similar_Tech', 'Identifier_Device_ID', 'Identifier_IMEI',
                        'Identifier_MAC', 'Identifier_Mobile_Carrier',
                        'Location', 'Location_Cell_Tower', 'Location_GPS', 'Location_WiFi',
                        'SSO', 'Facebook_SSO',
                        '1st_party', '3rd_party',
                        'PERFORMED', 'NOT_PERFORMED'] # cross-checked from table on pg 4 of the paper

model_results = pd.Series(range(len(list_of_18_classifiers)),
                          index=list_of_18_classifiers, dtype=object)

classifier_prediction = fitted_search.predict(X)

# saving the model results for future use
model_results[classifier] = [fitted_search, y, classifier_prediction]

In [73]:
model_results

Contact                                                                              0
Contact_E_Mail_Address                                                               1
Contact_Phone_Number                                                                 2
Identifier_Cookie_or_similar_Tech                                                    3
Identifier_Device_ID                                                                 4
Identifier_IMEI                      [GridSearchCV(cv=5,\n             estimator=Pi...
Identifier_MAC                                                                       6
Identifier_Mobile_Carrier                                                            7
Location                                                                             8
Location_Cell_Tower                                                                  9
Location_GPS                                                                        10
Location_WiFi                              

In [74]:
results_table = pd.DataFrame(model_results, columns=["Frequency in train set"]).copy()

# Total instances in dataset
results_table.loc[classifier, "Frequency in train set"] = \
    df_for_pipelining_train[classifier].sum()

# Neg F1
results_table.loc[classifier, "Neg F1"] = \
    f1_score(model_results[classifier][1].copy(), model_results[classifier][2].copy(), pos_label=0)

# Pos F1
results_table.loc[classifier, "Pos F1"] = \
    f1_score(model_results[classifier][1].copy(), model_results[classifier][2].copy(), pos_label=1)

# Sentence Filtering
results_table.loc[classifier, "Sentence Filtering"] = \
    str(fitted_search.best_estimator_.steps[0][1])

# Including Bigrams or not
results_table.loc[classifier, "including bigrams or not"] = \
    fitted_search.best_estimator_.named_steps["tfidf"].get_params()["transformers"][0][0]

# Model type
results_table.loc[classifier, "SVM or Logistic Regression"] = \
    fitted_search.best_estimator_.named_steps["model"]

In [75]:
str(fitted_search.best_estimator_.steps[0][1])

'None'

In [76]:
results_table

Unnamed: 0,Frequency in train set,Neg F1,Pos F1,Sentence Filtering,including bigrams or not,SVM or Logistic Regression
Contact,0,,,,,
Contact_E_Mail_Address,1,,,,,
Contact_Phone_Number,2,,,,,
Identifier_Cookie_or_similar_Tech,3,,,,,
Identifier_Device_ID,4,,,,,
Identifier_IMEI,50,0.999938,0.990099,,with_bigrams,"SVC(C=0.1, class_weight='balanced', kernel='li..."
Identifier_MAC,6,,,,,
Identifier_Mobile_Carrier,7,,,,,
Location,8,,,,,
Location_Cell_Tower,9,,,,,


# Appendix - other stuff I had tried earlier

## How do I get the with/without bigrams, tfidf & model info from the `fitted_search` object to display more neatly in the results table?

In [55]:
print(fitted_search.best_estimator_.named_steps["model"])

some_model = fitted_search.best_estimator_.named_steps["model"]
some_model

SVC(C=0.1, class_weight='balanced', kernel='linear', random_state=1)


## Having fun investigating the performance of all the different params in the grid search

In [80]:
print(fitted_search.best_estimator_.steps[1][1].get_params()["transformers"][0][1])

TfidfVectorizer(binary=True, ngram_range=(1, 2), stop_words='english')


In [173]:
cv_results_df = pd.DataFrame(fitted_search.cv_results_)

ordered_cv_results_df = pd.concat([
    cv_results_df.loc[cv_results_df['rank_test_score'] < 5], 
    cv_results_df.loc[cv_results_df['rank_test_score'] > 12]
],            axis=0)\
                        .set_index("rank_test_score")\
                                                        .sort_index().loc[:,"param_model":]

# print(str(thing).partition('\n')[2].partition('\n')[2].partition('\n')[2].partition('\n')[0])

In [195]:
col_trans_from_results = cv_results_df["param_tfidf"][0]
col_trans_from_results.get_params()['transformers'][0][0]

'tfidf_transform_bi'

In [202]:
#ordered_cv_results_df["with bigrams?"] = ordered_cv_results_df["param_tfidf"].apply(lambda x: x.get_params()['transformers'][0][0])
ordered_cv_results_df["param_tfidf"] = ordered_cv_results_df["param_tfidf"].apply(lambda x: x.get_params()['transformers'][0][0])

ordered_cv_results_df

Unnamed: 0_level_0,param_model,param_sentence_filtering,param_tfidf,param_model__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,with bigrams?
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,"SVC(C=1, class_weight='balanced', kernel='line...",,tfidf_transform_bi,1.0,"{'model': SVC(C=1, class_weight='balanced', ke...",0.854473,0.833107,0.854497,0.850727,0.831341,0.844829,0.010398,tfidf_transform_bi
2,"SVC(C=1, class_weight='balanced', kernel='line...",FunctionSampler(func=<function sentence_filter...,tfidf_transform_bi,1.0,"{'model': SVC(C=1, class_weight='balanced', ke...",0.852901,0.827869,0.849934,0.842384,0.826667,0.839951,0.010915,tfidf_transform_bi
3,"SVC(C=1, class_weight='balanced', kernel='line...",,tfidf_transform_uni,1.0,"{'model': SVC(C=1, class_weight='balanced', ke...",0.84252,0.839735,0.849741,0.830729,0.800522,0.832649,0.017179,tfidf_transform_uni
4,"SVC(C=1, class_weight='balanced', kernel='line...",FunctionSampler(func=<function sentence_filter...,tfidf_transform_uni,1.0,"{'model': SVC(C=1, class_weight='balanced', ke...",0.842801,0.82777,0.849279,0.832021,0.793734,0.829121,0.019266,tfidf_transform_uni
13,"LogisticRegression(max_iter=1000, random_state=1)",FunctionSampler(func=<function sentence_filter...,tfidf_transform_uni,,"{'model': LogisticRegression(max_iter=1000, ra...",0.824034,0.784884,0.806452,0.80758,0.795948,0.80378,0.01304,tfidf_transform_uni
14,"LogisticRegression(max_iter=1000, random_state=1)",FunctionSampler(func=<function sentence_filter...,tfidf_transform_bi,,"{'model': LogisticRegression(max_iter=1000, ra...",0.818966,0.786647,0.80814,0.802343,0.797688,0.802757,0.010743,tfidf_transform_bi
15,"SVC(C=1, class_weight='balanced', kernel='line...",,tfidf_transform_uni,10.0,"{'model': SVC(C=1, class_weight='balanced', ke...",0.76776,0.776243,0.789757,0.777317,0.742021,0.77062,0.01593,tfidf_transform_uni
16,"SVC(C=1, class_weight='balanced', kernel='line...",FunctionSampler(func=<function sentence_filter...,tfidf_transform_uni,10.0,"{'model': SVC(C=1, class_weight='balanced', ke...",0.76881,0.764045,0.79567,0.781768,0.734748,0.769008,0.020357,tfidf_transform_uni


## Confirming working baseline code (code from Mark's notebook)

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# load in data
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=111)


# define placeholder pipeline
estimator = [
    ('scaler', StandardScaler()),
    ('dim_reducer', PCA()),
    ('model', LogisticRegression())
]
pipe = Pipeline(estimator)
# Define a parameter grid
param_grid = [
    {
        'model': [LogisticRegression()],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'dim_reducer': [PCA()], # CH note: adding None to the list causes an error, there must be some other way.
        'dim_reducer__n_components': [1, 2, 3, 4],
        'model__penalty': ['l1', 'l2'],
        'model__C': [10**x for x in range(-5, 6)]
    },
    {
        'model': [SVC()],
        'scaler': [StandardScaler()],
        'dim_reducer':[PCA()],
        'model__C': [10**x for x in range(-5, 6)],
        'model__gamma': [10**x for x in range(-3, 3)]
    }
]


# Instantiate a gridsearch
grid = GridSearchCV(pipe, param_grid, cv = 5, verbose = 2)
fitted_grid = grid.fit(X_train, y_train)

Fitting 5 folds for each of 242 candidates, totalling 1210 fits
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1e-05, model__penalty=

[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=0.001, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.1s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=0.001, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=0.001, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=0.001, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=0.01, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=0.01, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA()

[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_redu

[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10000, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=1, model=LogisticRegression(), model__C=10000, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), 

[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer

[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_compone

[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1000, model__penalty=l1, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1000, model__penalty=l1, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1000, model__penalty=l1, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), 

[CV] END dim_reducer=PCA(), dim_reducer__n_components=2, model=LogisticRegression(), model__C=100000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1e-05, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_redu

[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=0.001, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=0.01, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=0.01, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=0.01, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=0.01, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=0.01, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PC

[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reduc

[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=1000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10000, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10000, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=3, model=LogisticRegression(), model__C=10000, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(

[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=1e-05, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=0.0001, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(

[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=0.01, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=0.01, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=0.01, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=0.01, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=0.1, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=0.1, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_

[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=100, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=100, model__penalty=l1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer_

[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10000, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10000, model__penalty=l2, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), dim_reducer__n_components=4, model=LogisticRegression(), model__C=10000, model__penalty=l2, scaler=MinMaxScaler(); total time=   0.0s
[CV] END dim_reducer=PCA

[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.0001, model__gamma=0.1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model

[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0.1, model__gamma=0.01, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=0

[CV] END dim_reducer=PCA(), model=SVC(), model__C=10, model__gamma=100, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10, model__gamma=100, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10, model__gamma=100, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10, model__gamma=100, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=100, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=100, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=100, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=100, model__gamma=0.001, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=100, model

[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=0.1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=0.1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=1, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, model__gamma=10, scaler=StandardScaler(); total time=   0.0s
[CV] END dim_reducer=PCA(), model=SVC(), model__C=10000, mo

440 fits failed out of a total of 1210.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
440 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chinchcliffe/opt/anaconda3/envs/priv_pol_nlp/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chinchcliffe/opt/anaconda3/envs/priv_pol_nlp/lib/python3.10/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/chinchcliffe/opt/anaconda3/envs/priv_pol_nlp/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, se

In [4]:
fitted_grid.best_estimator_

In [5]:
fitted_grid.score(X_test, y_test)

0.9824561403508771

### FunctionTransformer didn't work:

In [20]:
# def row_filter(df, column, value):
#     global y
#     y = df_for_pipelining_train[classifier]
#     y = y[1:]
#     return df[df[column] != value]

# def nothing_filter(X, y):
#     X_new = X
#     y_new = y
#     return X_new, y_new

# row_filter_transformer = FunctionTransformer(row_filter, kw_args=
#                                              {'column': 'segment_text', 'value': 'home find my phone blog'})

# nothing_filter_transformer = FunctionTransformer(nothing_filter(X, y=y))

# def empty_transformer(X):
#     return X

# empty_function_transformer = FunctionTransformer(empty_transformer)

# def length_printer_tf(X):
#     print(f"Before tfidf X shape is {X.shape}", flush=True)
# #     print(f"y shape is {y.shape}")
#     global some_var
#     some_var = 0
#     return X

# length_print_transformer = FunctionTransformer(length_printer_tf)

# def length_printer_tf2(X):
#     print(f"After tfidf X shape is {X.shape}", flush=True)
#     return X

# length_print_transformer2 = FunctionTransformer(length_printer_tf2)

## Trying FunctionSampler:

In [33]:
def empty_sampler(X, y):
    return X, y

empty_function_sampler = FunctionSampler(func=empty_sampler, validate=False)

def length_printer_before(X, y):
    print(f"Before pipeline, X shape is {X.shape} and y shape is {y.shape}", flush=True)
#     print(f"y shape is {y.shape}", flush=True)
    return X, y

before_length_print_sampler = FunctionSampler(func=length_printer_before, validate=False)

def length_printer_after(X, y):
    print(f"After pipeline, X shape is {X.shape} and y shape is {y.shape}", flush=True)
#     print(f"y shape is {y.shape}", flush=True)
    return X, y

after_length_print_sampler = FunctionSampler(func=length_printer_after, validate=False)

# tf-idf

In [6]:
city_df_x = pd.DataFrame({'city': ['London', 'Toronto', 'Paris', 'Tokyo', 'Cape Town', 'Utrecht', 'London', 'Toronto', 'Paris', 'Tokyo', 'Cape Town', 'Utrecht'],
                        'review': ['Super cool amazing', 'Very nice place', 'Its ok, good museums', 'Sandy is there so I want to go', 'Its ok, interesting', 'Definitely go if Yugioh', 'Super cool amazing', 'Very nice place', 'Its ok, good museums', 'Sandy is there so I want to go', 'Its ok, interesting', 'Definitely go if Yugioh']})


In [7]:
city_df_y = pd.DataFrame({'going?': [1,0,0,1,0,1,1,0,0,0,0,1]})

In [8]:
pd.concat([city_df_x, city_df_y], axis=1)

Unnamed: 0,city,review,going?
0,London,Super cool amazing,1
1,Toronto,Very nice place,0
2,Paris,"Its ok, good museums",0
3,Tokyo,Sandy is there so I want to go,1
4,Cape Town,"Its ok, interesting",0
5,Utrecht,Definitely go if Yugioh,1
6,London,Super cool amazing,1
7,Toronto,Very nice place,0
8,Paris,"Its ok, good museums",0
9,Tokyo,Sandy is there so I want to go,0


In [9]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [11]:
type(TfidfVectorizer())

sklearn.feature_extraction.text.TfidfVectorizer

In [12]:
# Create the column transformations list + columns to which to apply
col_transforms = [('city_transform', OneHotEncoder(), ['city']),
                ('review_transform', TfidfVectorizer(), 'review')]

# Create the column transformer
col_trans = ColumnTransformer(col_transforms)

# Fit
col_trans.fit(city_df_x)

In [13]:
transformed_x = col_trans.transform(city_df_x) 
y = np.ravel(city_df_y)

In [14]:
estimator = [
    ('transformer', ColumnTransformer(col_transforms)),
    ('model', LogisticRegression())
]
pipe = Pipeline(estimator)

param_grid = [
    {
        'transformer': [ColumnTransformer(col_transforms)],
        'model': [LogisticRegression(), SVC()]
    }]
grid = GridSearchCV(pipe, param_grid, cv = 3, verbose = 2)
fitted_grid = grid.fit(city_df_x, y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END model=LogisticRegression(), transformer=ColumnTransformer(transformers=[('city_transform', OneHotEncoder(), ['city']),
                                ('review_transform', TfidfVectorizer(),
                                 'review')]); total time=   0.0s
[CV] END model=LogisticRegression(), transformer=ColumnTransformer(transformers=[('city_transform', OneHotEncoder(), ['city']),
                                ('review_transform', TfidfVectorizer(),
                                 'review')]); total time=   0.0s
[CV] END model=LogisticRegression(), transformer=ColumnTransformer(transformers=[('city_transform', OneHotEncoder(), ['city']),
                                ('review_transform', TfidfVectorizer(),
                                 'review')]); total time=   0.0s
[CV] END model=SVC(), transformer=ColumnTransformer(transformers=[('city_transform', OneHotEncoder(), ['city']),
                                (

In [15]:
fitted_grid.best_estimator_

In [16]:
fitted_grid.score(city_df_x, y)

0.9166666666666666

In [17]:
fitted_grid.cv_results_

{'mean_fit_time': array([0.00745201, 0.00567834]),
 'std_fit_time': array([0.00153004, 0.00134352]),
 'mean_score_time': array([0.00330559, 0.00294034]),
 'std_score_time': array([0.00119027, 0.00025069]),
 'param_model': masked_array(data=[LogisticRegression(), SVC()],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_transformer': masked_array(data=[ColumnTransformer(transformers=[('city_transform', OneHotEncoder(), ['city']),
                                                    ('review_transform', TfidfVectorizer(),
                                                     'review')])                                  ,
                    ColumnTransformer(transformers=[('city_transform', OneHotEncoder(), ['city']),
                                                    ('review_transform', TfidfVectorizer(),
                                                     'review')])                                  ],
              mask=[False, False],
   

# Trying something out

In [4]:
# Setting up some data. 
city_df = pd.DataFrame({'city': ['London', 'Toronto', 'Paris'],
                        'review': ['Super cool amazing', 'Very nice place', 'Its ok, good museums']})

# Create the column transformations list + columns to which to apply
col_transforms = [
#     ('city_transform', OneHotEncoder(), ['city']),
                ('review_transform', TfidfVectorizer(), 'review')]

# Create the column transformer
col_trans = ColumnTransformer(col_transforms)

# Fit
col_trans.fit(city_df)

# Feature names
col_trans.get_feature_names_out()

array(['review_transform__amazing', 'review_transform__cool',
       'review_transform__good', 'review_transform__its',
       'review_transform__museums', 'review_transform__nice',
       'review_transform__ok', 'review_transform__place',
       'review_transform__super', 'review_transform__very'], dtype=object)