In [86]:
import pandas as pd
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)


In [87]:
from sklearn.model_selection import train_test_split
X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [88]:
print(X_train)

200    the bruins are negotiating with edmonton for l...
25     During “Ask the Mayor” on Tuesday, Mayor Miche...
30     Makenzi Buckely answered the phone at Riversid...
45     While many schools in Boston like to tout thei...
60     A jury in Michigan has ruled that a note handw...
                             ...                        
106    Faith still marvels at the turn her life took ...
14     The 2024 state election is still more than a y...
92     The state is readying to meet the needs of mor...
179    q. i am in seventh grade. i think that my pare...
102    It has been nearly three years since auto plan...
Name: text, Length: 213, dtype: object


In [89]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# FIT Vectorizer to training data
# build vocab
# count the number of wrods
# transform the original text to a vector

X_train_counts = count_vect.fit_transform(X_train)

In [90]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

In [92]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)



In [93]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

#behaves like a normal classifier 
# it does all previous steps in a singular step
text_clf.fit(X_train, y_train)



In [113]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics


# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Define the pipeline with the best parameters found
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english',
                              ngram_range=(1, 2),
                              max_df=0.5,
                              min_df=3,
                              use_idf=True)),  # use_idf parameter set to True based on grid search
    ('clf', LinearSVC(C=1))  # C parameter set to 1 based on grid search
])

# Fit the model with the training data
pipeline.fit(X_train, y_train)

# Predictions on the test set
predictions = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))


              precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       1.00      0.50      0.67         4

    accuracy                           0.83        12
   macro avg       0.90      0.75      0.78        12
weighted avg       0.87      0.83      0.81        12

0.8333333333333334




did a GridSearchCV to find the optimal parameters for both the TfidfVectorizer and the LinearSVC. looked for the best regularization strength for LinearSVC and explore using both the IDF and not using it in the TfidfVectorizer. increased accurary to 83%

Text Preprocessing and Feature Engineering: Configures the TfidfVectorizer for basic text preprocessing, use of n-grams, and term frequency filtering.
Model Optimization: Utilizes GridSearchCV for hyperparameter tuning of both the vectorizer and the classifier.
Evaluation: Outputs a classification report to assess the model's performance more thoroughly.

In [95]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.5, min_df=3)),
    ('clf', LinearSVC())
])

# Parameters for Grid Search
param_grid = {
    'tfidf__use_idf': (True, False),
    'clf__C': [0.1, 1, 10, 100]
}

# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

# Evaluate the model
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))


Fitting 5 folds for each of 8 candidates, totalling 40 fits




Best score: 0.6852713178294574
Best parameters set:
	clf__C: 1
	tfidf__use_idf: True
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       1.00      0.50      0.67         4

    accuracy                           0.83        12
   macro avg       0.90      0.75      0.78        12
weighted avg       0.87      0.83      0.81        12





In [97]:
predictions = text_clf.predict(X_test)

In [98]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[8 0]
 [3 1]]
              precision    recall  f1-score   support

           0       0.73      1.00      0.84         8
           1       1.00      0.25      0.40         4

    accuracy                           0.75        12
   macro avg       0.86      0.62      0.62        12
weighted avg       0.82      0.75      0.69        12



In [99]:
from sklearn import metrics
metrics.accuracy_score(y_test, predictions)

0.75

In [100]:
print(text_clf.predict(["Barack Obama is black"]))
print(text_clf.predict(["Donald Trump is a white man"]))
print(text_clf.predict(["Andrew Yang is an asian man"]))
print(text_clf.predict(["AOC is latina"]))
print(text_clf.predict(["Michelle Obama is black"]))

[0]
[1]
[0]
[0]
[1]


In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics

# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Define the pipeline with the best parameters found
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english',
                              ngram_range=(1, 2),
                              max_df=0.5,
                              min_df=3,
                              use_idf=True)),  # use_idf parameter set to True based on grid search
    ('clf', LinearSVC(C=1))  # C parameter set to 1 based on grid search
])

# Fit the model with the training data
pipeline.fit(X_train, y_train)

# Predictions on the test set
predictions = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, predictions))
print(f"Accuracy: {metrics.accuracy_score(y_test, predictions)}")


              precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       1.00      0.50      0.67         4

    accuracy                           0.83        12
   macro avg       0.90      0.75      0.78        12
weighted avg       0.87      0.83      0.81        12

Accuracy: 0.8333333333333334




In [102]:
print(text_clf.predict(["Barack Obama is black"]))
print(text_clf.predict(["Donald Trump is a white man"]))
print(text_clf.predict(["Andrew Yang is an asian man"]))
print(text_clf.predict(["AOC is latina"]))
print(text_clf.predict(["Michelle Obama is black"]))
print(text_clf.predict(["i love black people"]))


[0]
[1]
[0]
[0]
[1]
[1]


LOGISTIC REGRESSION


In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.linear_model import LogisticRegression


# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Setup the pipeline with Logistic Regression
pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.5, min_df=3)),
    ('clf', LogisticRegression(max_iter=1000, C=.01, penalty='none'))
])

# Fit and evaluate the model
pipeline_lr.fit(X_train, y_train)
predictions_lr = pipeline_lr.predict(X_test)
print("Logistic Regression Classification Report")
print(classification_report(y_test, predictions_lr))
print(f"Accuracy: {metrics.accuracy_score(y_test, predictions_lr)}")

#try embeddings rather than tfidf

Logistic Regression Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         4

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12

Accuracy: 1.0




In [104]:
print(pipeline_lr.predict(["Barack Obama is black"]))
print(pipeline_lr.predict(["Donald Trump is a white man"]))
print(pipeline_lr.predict(["Andrew Yang is an asian man"]))
print(pipeline_lr.predict(["AOC is latina"]))
print(pipeline_lr.predict(["Michelle Obama is black"]))
print(pipeline_lr.predict(["i love black lamps"]))
print(pipeline_lr.predict(["my shoes are blue"]))

[1]
[1]
[1]
[0]
[1]
[1]
[0]


In [105]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV



# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Setup the pipeline with Logistic Regression
pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

# Define the parameter grid
param_grid_lr = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l2', 'none']
}


# Fit and evaluate the model
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, verbose=1, n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

# Best parameters and score
print("Best score (Logistic Regression):", grid_search_lr.best_score_)
print("Best parameters (Logistic Regression):", grid_search_lr.best_params_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits




Best score (Logistic Regression): 0.7087486157253599
Best parameters (Logistic Regression): {'clf__C': 10, 'clf__penalty': 'l2', 'tfidf__max_df': 0.75, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 1)}


NAIVE BAYES

In [106]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB


# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)


# Setup the pipeline with Multinomial Naive Bayes
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_df=0.5, min_df=3)),
    ('clf', MultinomialNB(alpha=0.01))
])

# Fit and evaluate the model
pipeline_nb.fit(X_train, y_train)
predictions_nb = pipeline_nb.predict(X_test)
print("Naive Bayes Classification Report")
print(classification_report(y_test, predictions_nb))
print(f"Accuracy: {metrics.accuracy_score(y_test, predictions_lr)}")

Naive Bayes Classification Report
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.80      1.00      0.89         4

    accuracy                           0.92        12
   macro avg       0.90      0.94      0.91        12
weighted avg       0.93      0.92      0.92        12

Accuracy: 1.0


In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB


# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)


# Setup the pipeline with Multinomial Naive Bayes
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB())
])

param_grid_nb = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'clf__alpha': [0.01, 0.1, 1, 10, 100]
}

grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=5, verbose=1, n_jobs=-1)
grid_search_nb.fit(X_train, y_train)

print("Best score (Naive Bayes):", grid_search_nb.best_score_)
print("Best parameters (Naive Bayes):", grid_search_nb.best_params_)


Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best score (Naive Bayes): 0.675858250276855
Best parameters (Naive Bayes): {'clf__alpha': 0.01, 'tfidf__max_df': 0.5, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 1)}


RANDOM FORESTS

In [108]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier


# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)


# Setup the pipeline with Random Forest Classifier
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 1))),
    ('clf', RandomForestClassifier(n_estimators=100, min_samples_split=2, max_depth=20))
])

# Fit and evaluate the model
pipeline_rf.fit(X_train, y_train)
predictions_rf = pipeline_rf.predict(X_test)
print("Random Forest Classification Report")
print(classification_report(y_test, predictions_rf))
print(f"Accuracy: {metrics.accuracy_score(y_test, predictions_lr)}")

Random Forest Classification Report
              precision    recall  f1-score   support

           0       0.73      1.00      0.84         8
           1       1.00      0.25      0.40         4

    accuracy                           0.75        12
   macro avg       0.86      0.62      0.62        12
weighted avg       0.82      0.75      0.69        12

Accuracy: 1.0


In [109]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier


# Load your datasets
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries)
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

X = combined_df['text']
y = combined_df['race_discussed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)


pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english',)),
    ('clf', RandomForestClassifier())
])

param_grid_rf = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, verbose=1, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

print("Best score (Random Forest):", grid_search_rf.best_score_)
print("Best parameters (Random Forest):", grid_search_rf.best_params_)


Fitting 5 folds for each of 72 candidates, totalling 360 fits


KeyboardInterrupt: 

#############WHERE RACE CLASSIFICATION STARTS ##############

In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score


# Load your dataset
combined_df = pd.read_csv('datasets/combined_dataset_new_labels.csv')

# Filter out the 'N/A - not specified' category if necessary
combined_df = combined_df[combined_df['race_label'] != 'N/A - not specified']

# Splitting the dataset into features and target
X = combined_df['text']
y = combined_df['race_label']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_rf2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced'))  # Adjusting class weights
])

# Fit the model
pipeline_rf2.fit(X_train, y_train)

# Make predictions
predictions = pipeline_rf2.predict(X_test)

# Print the classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")

#try to do feature engineering
#locations, topics 
# topic - could try using topic modeling



Classification Report:
                precision    recall  f1-score   support

         Asian       0.00      0.00      0.00         2
         Black       0.42      1.00      0.59         5
        Latino       0.00      0.00      0.00         1
Multiple Races       1.00      0.50      0.67         2
     Not White       0.00      0.00      0.00         5
         White       0.40      0.67      0.50         3

      accuracy                           0.44        18
     macro avg       0.30      0.36      0.29        18
  weighted avg       0.29      0.44      0.32        18

Accuracy: 0.4444444444444444


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV



# Load your dataset
combined_df = pd.read_csv('datasets/combined_dataset_new_labels.csv')

# Filter out the 'N/A - not specified' category if necessary
combined_df = combined_df[combined_df['race_label'] != 'N/A - not specified']

# Splitting the dataset into features and target
X = combined_df['text']
y = combined_df['race_label']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_rf2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced'))  # Adjusting class weights
])


# Define the parameter grid to search
param_grid = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5]
}

# Configure GridSearchCV
grid_search = GridSearchCV(pipeline_rf2, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")



Fitting 5 folds for each of 216 candidates, totalling 1080 fits




[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   0.2s
[CV] END clf__max_depth=None, clf_

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [111]:
pipeline_rf2.predict(["pipeline_rf2In 1940, 60 percent of employed black women worked as domestic servants; today the number is down to 2.2 percent, while 60 percent hold white- collar jobs."]) 

array(['Black'], dtype=object)

In [112]:
pipeline_rf2.predict(["In 1958, 44 percent of hispanics said they would move if a family became their next door neighbor; today the figure is 1 percent."])

array(['White'], dtype=object)