Optimize Random Forest #1

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Load the dataset


# Convert data into DataFrame
df = pd.read_csv('dataset-clean.csv')

# Step 2: Preprocess the data
# OneHotEncode 'SOURCE_FIELD_NAME' with 'handle_unknown="ignore"'
preprocessor = ColumnTransformer(
    transformers=[('source_field_name', OneHotEncoder(handle_unknown='ignore'), ['SOURCE_FIELD_NAME'])],
    remainder='passthrough')

# Split the data into features and target
X = df[['SOURCE_FIELD_NAME']]
y = df['FIELD_NAME']

# Step 3: Create and train RandomForest model with default parameters
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Step 4: Train the model
pipeline.fit(X, y)

# Step 5: Evaluate the model on the same data (you can later use a validation set for better evaluation)
y_pred = pipeline.predict(X)

# Step 6: Accuracy and Classification Report
accuracy = accuracy_score(y, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y, y_pred))

# Step 7: Make predictions with new data input
new_data = pd.DataFrame({"SOURCE_FIELD_NAME": ['CUST_NO', 'APPL_CODE1','APPL_CODE11', 'APPL_CODE12', 'APPLIC_TRID1', 'APPLIC_TRID51']})
predicted_field_name = pipeline.predict(new_data)

# Print the predicted FIELD_NAME for each SOURCE_FIELD_NAME
for source, predicted_field in zip(new_data["SOURCE_FIELD_NAME"], predicted_field_name):
    print(f"Predicted FIELD_NAME for '{source}': {predicted_field}")

# new_column_names = ['CUST_NO', 'APPL_CODE1', 'APPL_CODE2']
# # Function to predict SQL data type based on the column name
# def predict_data_type(column_name):
#     column_name = column_name.lower()

#     if 'account' in column_name or 'phn' in column_name or 'number' in column_name:
#         return 'VARCHAR(255)'  # Account Number, Phone Number, etc. are typically VARCHAR
#     elif 'date' in column_name or 'dob' in column_name:
#         return 'DATE'  # Date fields like Date of Birth are DATE
#     elif 'address' in column_name or 'street' in column_name:
#         return 'VARCHAR(255)'  # Address fields are VARCHAR
#     else:
#         return 'TEXT'  # Default to TEXT if no specific pattern is found

# # Predict standard names and data types for new column variations
# predicted_column_names_and_types = []
# for new_column, predicted_name in zip(new_column_names, predicted_names):
#     predicted_type = predict_data_type(predicted_name)
#     predicted_column_names_and_types.append((predicted_name, predicted_type))

# # SQL CREATE TABLE generation based on predicted column names and types
# def generate_create_table_query(predicted_column_names_and_types, table_name):
#     sql_query = f"CREATE TABLE {table_name} (\n"

#     for col_name, col_type in predicted_column_names_and_types:
#         # Convert column name to lowercase with underscores (standard SQL style)
#         sql_query += f"    {col_name.replace(' ', '_').lower()} {col_type} NOT NULL,\n"

#     # Remove the trailing comma for the last column
#     sql_query = sql_query.rstrip(',\n') + "\n);"

#     return sql_query

# # Generate the SQL CREATE TABLE query based on the predicted column names and types
# table_name = "predicted_table"
# create_table_query = generate_create_table_query(predicted_column_names_and_types, table_name)

# # Output the predicted column names, data types, and the SQL query
# print("\nPredicted Column Names and Types:")
# for col_name, col_type in predicted_column_names_and_types:
#     print(f"{col_name}: {col_type}")

# print("\nGenerated SQL CREATE TABLE Query:")
# print(create_table_query)

Model Accuracy: 92.23%
Classification Report:
                                precision    recall  f1-score   support

                        ABA_NO       1.00      1.00      1.00         1
                       ABBR_NM       0.50      1.00      0.67         3
                      ACCTN_TP       1.00      1.00      1.00         1
                  ACCT_APPL_CD       0.67      1.00      0.80         2
                  ACCT_BAL_AMT       1.00      1.00      1.00         1
                       ACCT_BK       1.00      1.00      1.00         1
                  ACCT_CNDT_CD       1.00      1.00      1.00         1
                  ACCT_COLL_CD       1.00      1.00      1.00         2
                   ACCT_CTG_CD       1.00      1.00      1.00         1
                     ACCT_CURR       1.00      1.00      1.00         1
            ACCT_HOLD_BIRTH_DT       1.00      1.00      1.00         1
                  ACCT_HOLD_NM       1.00      1.00      1.00         1
             ACCT

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Predicted FIELD_NAME for 'CUST_NO': PERS_ID
Predicted FIELD_NAME for 'APPL_CODE1': APPL_CD
Predicted FIELD_NAME for 'APPL_CODE11': APPL_CD
Predicted FIELD_NAME for 'APPL_CODE12': ACCT_NO
Predicted FIELD_NAME for 'APPLIC_TRID1': TRAILER_ID
Predicted FIELD_NAME for 'APPLIC_TRID51': ACCT_NO


Optimize Random Forest #2

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Load the dataset
df = pd.read_csv('dataset-clean.csv')

# Step 2: Check for missing data
if df.isnull().sum().any():
    print("Missing values found. Please handle them before proceeding.")
else:
    print("No missing values in the dataset.")

# Step 3: Preprocess the data
# OneHotEncode 'SOURCE_FIELD_NAME' with 'handle_unknown="ignore"'
preprocessor = ColumnTransformer(
    transformers=[('source_field_name', OneHotEncoder(handle_unknown='ignore'), ['SOURCE_FIELD_NAME'])],
    remainder='passthrough')

# Split the data into features and target
X = df.drop(columns=['FIELD_NAME'])  # Using all features except target column
y = df['FIELD_NAME']

# Step 4: Train-Test Split for better evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Create and train RandomForest model with simplified hyperparameter grid
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Simplified parameter grid
param_grid = {
    'classifier__n_estimators': [100, 200],  # Reduce the number of estimators
    'classifier__max_depth': [None, 10],  # Reduce the depth
    'classifier__min_samples_split': [2, 5],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2, error_score='raise')

# Step 6: Fit the model with the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 7: Evaluate the model
y_pred = grid_search.predict(X_test)

# Step 8: Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 9: Make predictions with new data input
new_data = pd.DataFrame({"SOURCE_FIELD_NAME": ["CUST_NO"]})
predicted_field_name = grid_search.predict(new_data)
print(f"Predicted FIELD_NAME for 'CUST_NO': {predicted_field_name[0]}")


No missing values in the dataset.
Fitting 5 folds for each of 8 candidates, totalling 40 fits




MemoryError: could not allocate 50823168 bytes