In [1]:
#imports
import numpy as np
import pandas as pd
from tpot import TPOTClassifier



In [2]:
#load the training data
x_train = pd.read_csv('train_data/tfidf_train_train.csv', index_col=[0])
y_train = pd.read_csv('train_data/y_train_train.csv', index_col=[0])
#load the test data
x_test = pd.read_csv('train_data/tfidf_train_test.csv', index_col=[0])
y_test = pd.read_csv('train_data/y_train_test.csv', index_col=[0])
# solving problems with index after loading data
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [3]:

# Define TPOT classifier with desired settings
tpot = TPOTClassifier(
    generations=3,
    population_size=10,
    verbosity=2,
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1
)

# Define search space for each model
rf_search_space = {
    'sklearn.ensemble.RandomForestClassifier': {
        'n_estimators': [10, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20],
        'max_features': ['auto', 'sqrt']
    }
}

dt_search_space = {
    'sklearn.tree.DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20],
        'max_features': ['auto', 'sqrt']
    }
}

lr_search_space = {
    'sklearn.linear_model.LogisticRegression': {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10]
    }
}

# Define dictionary containing all search spaces
search_spaces = {
    'RandomForestClassifier': rf_search_space,
    'DecisionTreeClassifier': dt_search_space,
    'LogisticRegression': lr_search_space
}

# Loop over each model and fit TPOT to find best pipeline and hyperparameters
for model_name, search_space in search_spaces.items():
    print(f"Training {model_name} model...")
    tpot.set_params(
        generations=3,
        population_size=10,
        verbosity=2,
        scoring='accuracy',
        cv=3,
        random_state=42,
        n_jobs=-1,
        config_dict=search_space
    )
    tpot.fit(x_train, y_train)
    print(f"Best pipeline for {model_name}:")
    print(tpot.fitted_pipeline_)
    print(f"Accuracy for {model_name}: {tpot.score(x_test, y_test)}")
    print('\n')


Training RandomForestClassifier model...


  y = column_or_1d(y, warn=True)


                                                                              
Generation 1 - Current best internal CV score: 0.9405916823607141
                                                                            
Generation 2 - Current best internal CV score: 0.9417131920822482
                                                                            
Generation 3 - Current best internal CV score: 0.9417131920822482
                                                          
Best pipeline: RandomForestClassifier(RandomForestClassifier(RandomForestClassifier(input_matrix, criterion=gini, max_depth=20, max_features=auto, n_estimators=100), criterion=entropy, max_depth=10, max_features=sqrt, n_estimators=50), criterion=gini, max_depth=10, max_features=sqrt, n_estimators=50)
Best pipeline for RandomForestClassifier:
Pipeline(steps=[('stackingestimator-1',
                 StackingEstimator(estimator=RandomForestClassifier(max_depth=20,
                                            

  y = column_or_1d(y, warn=True)


Accuracy for RandomForestClassifier: 0.947495046702519


Training DecisionTreeClassifier model...


  y = column_or_1d(y, warn=True)


                                                                            
Generation 1 - Current best internal CV score: 0.9161617057590622
                                                                            
Generation 2 - Current best internal CV score: 0.9161617057590622
                                                                            
Generation 3 - Current best internal CV score: 0.9190106617548145
                                                                            
Best pipeline: DecisionTreeClassifier(CombineDFs(input_matrix, input_matrix), criterion=gini, max_depth=10, max_features=sqrt)
Best pipeline for DecisionTreeClassifier:
Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('functiontransformer-1',
                                                 FunctionTransformer(func=<function copy at 0x000002031B339550>)),
                                                ('functiontransformer-2',
                             

  y = column_or_1d(y, warn=True)


Accuracy for DecisionTreeClassifier: 0.9236484574016417


Training LogisticRegression model...


  y = column_or_1d(y, warn=True)


                                                                            
Generation 1 - Current best internal CV score: 0.9499879639511531
Optimization Progress:  50%|█████     | 20/40 [05:18<04:17, 12.88s/pipeline]