In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score


# Load your dataset
combined_df = pd.read_csv('datasets/combined_dataset_new_labels.csv')

# Filter out the 'N/A - not specified' category if necessary
combined_df = combined_df[combined_df['race_label'] != 'N/A - not specified']

# Splitting the dataset into features and target
X = combined_df['text']
y = combined_df['race_label']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_rf2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced'))  # Adjusting class weights
])

# Fit the model
pipeline_rf2.fit(X_train, y_train)

# Make predictions
predictions = pipeline_rf2.predict(X_test)

# Print the classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")

#try to do feature engineering
#locations, topics 
# topic - could try using topic modeling



Classification Report:
                precision    recall  f1-score   support

         Asian       0.00      0.00      0.00         2
         Black       0.42      1.00      0.59         5
        Latino       0.00      0.00      0.00         1
Multiple Races       1.00      0.50      0.67         2
     Not White       0.00      0.00      0.00         5
         White       0.40      0.67      0.50         3

      accuracy                           0.44        18
     macro avg       0.30      0.36      0.29        18
  weighted avg       0.29      0.44      0.32        18

Accuracy: 0.4444444444444444


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# FIT Vectorizer to training data
# build vocab
# count the number of wrods
# transform the original text to a vector

X_train_counts = count_vect.fit_transform(X_train)

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

In [8]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)



In [9]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

#behaves like a normal classifier 
# it does all previous steps in a singular step
text_clf.fit(X_train, y_train)



In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV



# Load your dataset
combined_df = pd.read_csv('datasets/combined_dataset_new_labels.csv')

# Filter out the 'N/A - not specified' category if necessary
combined_df = combined_df[combined_df['race_label'] != 'N/A - not specified']

# Splitting the dataset into features and target
X = combined_df['text']
y = combined_df['race_label']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_rf2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced'))  # Adjusting class weights
])


# Define the parameter grid to search
param_grid = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5]
}

# Configure GridSearchCV
grid_search = GridSearchCV(pipeline_rf2, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")



Fitting 5 folds for each of 216 candidates, totalling 1080 fits




[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_df=0.5, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END clf__max_depth=None, clf_

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
pipeline_rf2.predict(["pipeline_rf2In 1940, 60 percent of employed black women worked as domestic servants; today the number is down to 2.2 percent, while 60 percent hold white- collar jobs."]) 

NotFittedError: The TF-IDF vectorizer is not fitted

In [None]:
pipeline_rf2.predict(["In 1958, 44 percent of hispanics said they would move if a family became their next door neighbor; today the figure is 1 percent."])