In [None]:
pip install scikit-learn fuzzywuzzy

In [6]:
pip install pandas


Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
Collecting tzdata>=2022.7
  Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Collecting pytz>=2020.1
  Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2024.2 tzdata-2024.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from fuzzywuzzy import fuzz
from collections import Counter

# Sample dataset of column name variations
column_variations = [
    "ACC_NUMBER", "ACCNBR", "ACC_NBR", "ACNMBR",
    "ACCOUNT_NUMBER", "ACCNT_NMBR", "ACCNT_NBR"
]

# Function to predict the standard column name from a new variation
def predict_column_name(new_column_name, column_variations):
    # Calculate the similarity of the new column name to the existing ones
    similarities = [(col, fuzz.ratio(new_column_name, col)) for col in column_variations]

    # Sort the columns by similarity score (higher score means more similar)
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Get the most similar column name (typically the first one after sorting)
    predicted_name = similarities[0][0]

    return predicted_name

# Example: Predict for the new column name 'ACCNTNMBR'
new_column_name = "AC_BR"
predicted_name = predict_column_name(new_column_name, column_variations)

print(f"Predicted column name for '{new_column_name}': {predicted_name}")


Predicted column name for 'AC_BR': ACC_NBR


**Using Fuzzy algorithm**

In [35]:
import pandas as pd
from fuzzywuzzy import fuzz

# Define the dataset of column name variations and their corresponding standard names
column_variations = [
    'CUST_NO', 'OBL_CODE', 'ACCT_NBR',
    'CUST_TYPE',
    'ACCOUNT_NUMBER', 'ACCNT_NMBR', 'ACCNT_NBR',
    'CUS_NAME', 'CUSTOMER_NAME', 'CLIENT_NAME',
    'PHN', 'PHONE_NUMBER', 'CONTACT_NUMBER',
    'DOB', 'DATE_OF_BIRTH', 'BIRTHDATE',
    'ADDR', 'ADDRESS', 'STREET_ADDRESS'
]

standard_names = [
    'PERS_ID', 'PERS_ID', 'PERS_ID',
    'ID_SUB_CTG',
    'Account Number', 'Account Number', 'Account Number',
    'Customer Name', 'Customer Name', 'Customer Name',
    'Phone Number', 'Phone Number', 'Phone Number',
    'Date of Birth', 'Date of Birth', 'Date of Birth',
    'Street Address', 'Street Address', 'Street Address'
]

# Define new column name variations for prediction
new_column_names = [
    'ac_br', 'd_birth', 'con_nmbr', 'CU_NO',
    'CUST_TYPE', 'CUSTYPE', 'CTYPE'
    ]

# A dictionary to map common abbreviations to their full names
abbreviation_map = {
    'ac_br': 'Account Number',
    'd_birth': 'Date of Birth',
    'con_nmbr': 'Phone Number',
    'add_r': 'Street Address',
    # Add more mappings here as necessary
}

# Function to predict the standard column name using fuzzy matching or abbreviation map
def predict_column_name(new_name, column_variations, standard_names, abbreviation_map):
    # Check if the new name is in the abbreviation map
    if new_name in abbreviation_map:
        return abbreviation_map[new_name]

    # Use fuzzy matching to compare the new name with all column variations
    similarities = [(var, fuzz.ratio(new_name, var)) for var in column_variations]
    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Check for threshold similarity (to avoid incorrect matches)
    threshold = 60  # Lowered threshold for better matching
    if similarities[0][1] < threshold:
        return "Unknown"  # If the match is not strong enough, return "Unknown"

    # Return the standard name corresponding to the highest similarity match
    max_similarity_idx = column_variations.index(similarities[0][0])
    return standard_names[max_similarity_idx]

# Function to predict the SQL data type based on the column name
def predict_data_type(column_name):
    # Map column names to SQL data types
    column_name = column_name.lower()

    if 'account' in column_name or 'phn' in column_name or 'number' in column_name:
        return 'VARCHAR(255)'  # Account Number, Phone Number, etc. are typically VARCHAR
    elif 'date' in column_name or 'dob' in column_name:
        return 'DATE'  # Date fields like Date of Birth are DATE
    elif 'address' in column_name or 'street' in column_name:
        return 'VARCHAR(255)'  # Address fields are VARCHAR
    else:
        return 'TEXT'  # Default to TEXT if no specific pattern is found

# Predict standard column names and their corresponding data types for the new variations
predicted_column_names_and_types = []
for new_column in new_column_names:
    predicted_name = predict_column_name(new_column, column_variations, standard_names, abbreviation_map)

    # If "Unknown" is returned, handle this case.
    if predicted_name == "Unknown":
        print(f"Warning: Unable to predict a standard name for column '{new_column}'.")
        predicted_type = 'TEXT'  # Default to TEXT type if not recognized
    else:
        predicted_type = predict_data_type(predicted_name)

    predicted_column_names_and_types.append((predicted_name, predicted_type))

# SQL CREATE TABLE generation based on predicted column names and types
def generate_create_table_query(predicted_column_names_and_types, table_name):
    sql_query = f"CREATE TABLE {table_name} (\n"

    for col_name, col_type in predicted_column_names_and_types:
        # Convert column name to lowercase with underscores (standard SQL style)
        sql_query += f"    {col_name.replace(' ', '_').lower()} {col_type} NOT NULL,\n"

    # Remove the trailing comma for the last column
    sql_query = sql_query.rstrip(',\n') + "\n);"

    return sql_query

# Generate the SQL CREATE TABLE query based on the predicted column names and types
table_name = "predicted_table"
create_table_query = generate_create_table_query(predicted_column_names_and_types, table_name)

# Output the predicted column names, data types, and the SQL query
print("Predicted Column Names and Types:")
for col_name, col_type in predicted_column_names_and_types:
    print(f"{col_name}: {col_type}")

print("\nGenerated SQL CREATE TABLE Query:")
print(create_table_query)


Predicted Column Names and Types:
Account Number: VARCHAR(255)
Date of Birth: DATE
Phone Number: VARCHAR(255)
PERS_ID: TEXT
ID_SUB_CTG: TEXT
ID_SUB_CTG: TEXT
ID_SUB_CTG: TEXT

Generated SQL CREATE TABLE Query:
CREATE TABLE predicted_table (
    account_number VARCHAR(255) NOT NULL,
    date_of_birth DATE NOT NULL,
    phone_number VARCHAR(255) NOT NULL,
    pers_id TEXT NOT NULL,
    id_sub_ctg TEXT NOT NULL,
    id_sub_ctg TEXT NOT NULL,
    id_sub_ctg TEXT NOT NULL
);


**Using Random Forest with Training Data**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Define the dataset of column name variations and their corresponding standard names
column_variations = [
    'ACC_NUMBER', 'ACCNBR', 'ACC_NBR', 'ACNMBR',
    'ACCOUNT_NUMBER', 'ACCNT_NMBR', 'ACCNT_NBR',
    'CUS_NAME', 'CUSTOMER_NAME', 'CLIENT_NAME',
    'PHN', 'PHONE_NUMBER', 'CONTACT_NUMBER',
    'DOB', 'DATE_OF_BIRTH', 'BIRTHDATE',
    'ADDR', 'ADDRESS', 'STREET_ADDRESS'
]

standard_names = [
    'Account Number', 'Account Number', 'Account Number', 'Account Number',
    'Account Number', 'Account Number', 'Account Number',
    'Customer Name', 'Customer Name', 'Customer Name',
    'Phone Number', 'Phone Number', 'Phone Number',
    'Date of Birth', 'Date of Birth', 'Date of Birth',
    'Street Address', 'Street Address', 'Street Address'
]

# Define new column name variations for prediction
new_column_names = ['ac_br', 'd_birth', 'con_nmbr']

# Create a DataFrame from the column variations and their corresponding standard names
data = pd.DataFrame({
    'column_variation': column_variations,
    'standard_name': standard_names
})

# Split the dataset into features (X) and target (y)
X = data['column_variation']
y = data['standard_name']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that first transforms the text data and then applies a classifier
model = make_pipeline(
    TfidfVectorizer(),  # Converts column names into numerical features
    RandomForestClassifier(n_estimators=100, random_state=42)  # Random Forest Classifier
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Now, let's use the trained model to predict new column names
predicted_names = model.predict(new_column_names)

# Function to predict SQL data type based on the column name
def predict_data_type(column_name):
    column_name = column_name.lower()

    if 'account' in column_name or 'phn' in column_name or 'number' in column_name:
        return 'VARCHAR(255)'  # Account Number, Phone Number, etc. are typically VARCHAR
    elif 'date' in column_name or 'dob' in column_name:
        return 'DATE'  # Date fields like Date of Birth are DATE
    elif 'address' in column_name or 'street' in column_name:
        return 'VARCHAR(255)'  # Address fields are VARCHAR
    else:
        return 'TEXT'  # Default to TEXT if no specific pattern is found

# Predict standard names and data types for new column variations
predicted_column_names_and_types = []
for new_column, predicted_name in zip(new_column_names, predicted_names):
    predicted_type = predict_data_type(predicted_name)
    predicted_column_names_and_types.append((predicted_name, predicted_type))

# SQL CREATE TABLE generation based on predicted column names and types
def generate_create_table_query(predicted_column_names_and_types, table_name):
    sql_query = f"CREATE TABLE {table_name} (\n"

    for col_name, col_type in predicted_column_names_and_types:
        # Convert column name to lowercase with underscores (standard SQL style)
        sql_query += f"    {col_name.replace(' ', '_').lower()} {col_type} NOT NULL,\n"

    # Remove the trailing comma for the last column
    sql_query = sql_query.rstrip(',\n') + "\n);"

    return sql_query

# Generate the SQL CREATE TABLE query based on the predicted column names and types
table_name = "predicted_table"
create_table_query = generate_create_table_query(predicted_column_names_and_types, table_name)

# Output the predicted column names, data types, and the SQL query
print("\nPredicted Column Names and Types:")
for col_name, col_type in predicted_column_names_and_types:
    print(f"{col_name}: {col_type}")

print("\nGenerated SQL CREATE TABLE Query:")
print(create_table_query)


AttributeError: partially initialized module 'pandas' has no attribute '_pandas_datetime_CAPI' (most likely due to a circular import)

**Random Forest without Training**

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

# Define the dataset of column name variations and their corresponding standard names
column_variations = [
    'CUST_NO', 'OBL_CODE', 'ACCT_NBR',
    'CUST_TYPE',
    'ACC_NUMBER', 'ACCNBR', 'ACC_NBR', 'ACNMBR',
    'ACCOUNT_NUMBER', 'ACCNT_NMBR', 'ACCNT_NBR',
    'CUS_NAME', 'CUSTOMER_NAME', 'CLIENT_NAME',
    'PHN', 'PHONE_NUMBER', 'CONTACT_NUMBER',
    'DOB', 'DATE_OF_BIRTH', 'BIRTHDATE',
    'ADDR', 'ADDRESS', 'STREET_ADDRESS',
    'AC_BR', 'D_BIRTH', 'CON_NMBR'  # Additional variations added for the new prediction
]

standard_names = [
    'PERS_ID', 'PERS_ID', 'PERS_ID',
    'ID_SUB_CTG',
    'Account Number', 'Account Number', 'Account Number', 'Account Number',
    'Account Number', 'Account Number', 'Account Number',
    'Customer Name', 'Customer Name', 'Customer Name',
    'Phone Number', 'Phone Number', 'Phone Number',
    'Date of Birth', 'Date of Birth', 'Date of Birth',
    'Street Address', 'Street Address', 'Street Address',
    'Account Number', 'Date of Birth', 'Phone Number'  # These correspond to the new variations
]

# Define new column name variations for prediction
new_column_names = ['ac_br', 'd_birth', 'con_nmbr', 'OBL_CODE']

# Create a DataFrame from the column variations and their corresponding standard names
data = pd.DataFrame({
    'column_variation': column_variations,
    'standard_name': standard_names
})

# Create a pipeline that first transforms the text data and then applies a classifier
model = make_pipeline(
    TfidfVectorizer(),  # Converts column names into numerical features
    RandomForestClassifier(n_estimators=100, random_state=42)  # Random Forest Classifier
)

# Train the model on all the available data
model.fit(data['column_variation'], data['standard_name'])

# Now, let's use the trained model to predict new column names
predicted_names = model.predict(new_column_names)

# Function to predict SQL data type based on the column name
def predict_data_type(column_name):
    column_name = column_name.lower()

    if 'account' in column_name or 'phn' in column_name or 'number' in column_name:
        return 'VARCHAR(255)'  # Account Number, Phone Number, etc. are typically VARCHAR
    elif 'date' in column_name or 'dob' in column_name:
        return 'DATE'  # Date fields like Date of Birth are DATE
    elif 'address' in column_name or 'street' in column_name:
        return 'VARCHAR(255)'  # Address fields are VARCHAR
    else:
        return 'TEXT'  # Default to TEXT if no specific pattern is found

# Predict standard names and data types for new column variations
predicted_column_names_and_types = []
for new_column, predicted_name in zip(new_column_names, predicted_names):
    predicted_type = predict_data_type(predicted_name)
    predicted_column_names_and_types.append((predicted_name, predicted_type))

# SQL CREATE TABLE generation based on predicted column names and types
def generate_create_table_query(predicted_column_names_and_types, table_name):
    sql_query = f"CREATE TABLE {table_name} (\n"

    for col_name, col_type in predicted_column_names_and_types:
        # Convert column name to lowercase with underscores (standard SQL style)
        sql_query += f"    {col_name.replace(' ', '_').lower()} {col_type} NOT NULL,\n"

    # Remove the trailing comma for the last column
    sql_query = sql_query.rstrip(',\n') + "\n);"

    return sql_query

# Generate the SQL CREATE TABLE query based on the predicted column names and types
table_name = "predicted_table"
create_table_query = generate_create_table_query(predicted_column_names_and_types, table_name)

# Output the predicted column names, data types, and the SQL query
print("\nPredicted Column Names and Types:")
for col_name, col_type in predicted_column_names_and_types:
    print(f"{col_name}: {col_type}")

print("\nGenerated SQL CREATE TABLE Query:")
print(create_table_query)



Predicted Column Names and Types:
Account Number: VARCHAR(255)
Date of Birth: DATE
Phone Number: VARCHAR(255)
PERS_ID: TEXT

Generated SQL CREATE TABLE Query:
CREATE TABLE predicted_table (
    account_number VARCHAR(255) NOT NULL,
    date_of_birth DATE NOT NULL,
    phone_number VARCHAR(255) NOT NULL,
    pers_id TEXT NOT NULL
);


Show Diagram Feature

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Get the feature importances from the trained model
importances = model.named_steps['randomforestclassifier'].feature_importances_

# Get the feature names from the TF-IDF vectorizer (vocabulary)
vectorizer = model.named_steps['tfidfvectorizer']
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for feature importances and their corresponding feature names
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances using seaborn
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance in Random Forest Model')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

Show Diagram Tree

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz
import graphviz

# Get the trained Random Forest model
rf_model = model.named_steps['randomforestclassifier']

# Loop through the first 3 trees in the Random Forest and visualize them
for i in range(3):  # You can change this number to visualize more trees
    tree = rf_model.estimators_[i]

    # Export tree to DOT format
    dot_data = export_graphviz(tree,
                               feature_names=model.named_steps['tfidfvectorizer'].get_feature_names_out(),
                               filled=True,
                               max_depth=3,  # Limit the depth of the tree for better readability
                               impurity=False,
                               proportion=True)

    # Generate and display the tree graph
    graph = graphviz.Source(dot_data)
    graph.render(f"tree_{i}.pdf", view=True)  # Optionally save the graph as a PDF and open it
    display(graph)  # Display the graph inline (works in Jupyter Notebook)

Random Forest with Training Dataset from File

  SOURCE_FIELD_NAME  FIELD_NAME
0           CUST_NO     PERS_ID
1          OBL_CODE     PERS_ID
2          ACCT_NBR     PERS_ID
3           CUST_NO     PERS_ID
4         CUST_TYPE  ID_SUB_CTG


In [19]:



# Split the dataset into features (X) and target (y)
X = dataFrame['SOURCE_FIELD_NAME']
y = dataFrame['FIELD_NAME']

dataFrame.head()


Unnamed: 0,SOURCE_FIELD_NAME,FIELD_NAME
0,CUST_NO,PERS_ID
1,OBL_CODE,PERS_ID
2,ACCT_NBR,PERS_ID
3,CUST_NO,PERS_ID
4,CUST_TYPE,ID_SUB_CTG


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

dataFrame = pd.read_csv('dataset-clean.csv')


# Define new column name variations for prediction
new_column_names = ['CUST_TYPE', 'DCIF', 'cust_no']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Create a pipeline that first transforms the text data and then applies a classifier
model = make_pipeline(
    TfidfVectorizer(),  # Converts column names into numerical features
    RandomForestClassifier(n_estimators=100, random_state=42)  # Random Forest Classifier
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Now, let's use the trained model to predict new column names
predicted_names = model.predict(new_column_names)

# Function to predict SQL data type based on the column name
def predict_data_type(column_name):
    column_name = column_name.lower()

    if 'account' in column_name or 'phn' in column_name or 'number' in column_name:
        return 'VARCHAR(255)'  # Account Number, Phone Number, etc. are typically VARCHAR
    elif 'date' in column_name or 'dob' in column_name:
        return 'DATE'  # Date fields like Date of Birth are DATE
    elif 'address' in column_name or 'street' in column_name:
        return 'VARCHAR(255)'  # Address fields are VARCHAR
    else:
        return 'TEXT'  # Default to TEXT if no specific pattern is found

# Predict standard names and data types for new column variations
predicted_column_names_and_types = []
for new_column, predicted_name in zip(new_column_names, predicted_names):
    predicted_type = predict_data_type(predicted_name)
    predicted_column_names_and_types.append((predicted_name, predicted_type))

# SQL CREATE TABLE generation based on predicted column names and types
def generate_create_table_query(predicted_column_names_and_types, table_name):
    sql_query = f"CREATE TABLE {table_name} (\n"

    for col_name, col_type in predicted_column_names_and_types:
        # Convert column name to lowercase with underscores (standard SQL style)
        sql_query += f"    {col_name.replace(' ', '_').lower()} {col_type} NOT NULL,\n"

    # Remove the trailing comma for the last column
    sql_query = sql_query.rstrip(',\n') + "\n);"

    return sql_query

# Generate the SQL CREATE TABLE query based on the predicted column names and types
table_name = "predicted_table"
create_table_query = generate_create_table_query(predicted_column_names_and_types, table_name)

# Output the predicted column names, data types, and the SQL query
print("\nPredicted Column Names and Types:")
for col_name, col_type in predicted_column_names_and_types:
    print(f"{col_name}: {col_type}")

print("\nGenerated SQL CREATE TABLE Query:")
print(create_table_query)


Model Accuracy: 40.52%

Predicted Column Names and Types:
CUST_TY: TEXT
GCIF_NO: TEXT
PERS_ID: TEXT

Generated SQL CREATE TABLE Query:
CREATE TABLE predicted_table (
    cust_ty TEXT NOT NULL,
    gcif_no TEXT NOT NULL,
    pers_id TEXT NOT NULL
);


OPTIMIZE

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import numpy as np

# Load dataset
dataFrame = pd.read_csv('dataset-clean.csv')

# Define your feature set and target
# Assuming 'X' contains the feature columns and 'y' contains the target column.
# Make sure your feature columns are of type 'string' for text-based features.
X = dataFrame['SOURCE_FIELD_NAME'].astype(str)  # Replace 'column_name' with the actual name of your feature column
y = dataFrame['FIELD_NAME']  # Replace 'target_column' with the actual name of your target column

print(dataFrame.head())

  SOURCE_FIELD_NAME  FIELD_NAME
0           CUST_NO     PERS_ID
1          OBL_CODE     PERS_ID
2          ACCT_NBR     PERS_ID
3           CUST_NO     PERS_ID
4         CUST_TYPE  ID_SUB_CTG


In [99]:
X = X.astype(str)
X

0             CUST_NO
1            OBL_CODE
2            ACCT_NBR
3             CUST_NO
4           CUST_TYPE
            ...      
3468      MT_STATE_CD
3469           CIF_ID
3470          GCIF_NO
3471               NM
3472    DT_REGISTERED
Name: SOURCE_FIELD_NAME, Length: 3473, dtype: object

In [11]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'X' is a pandas DataFrame containing text columns
vectorizer = TfidfVectorizer()
X_transformed = vectorizer.fit_transform(X)  # Replace 'text_column' with your actual text column
Y_transformed = vectorizer.fit_transform(y)  # Replace 'text_column' with your actual text column
X_test_transformed = vectorizer.fit_transform(X_test)  # Replace 'text_column' with your actual text column



In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('classifier', RandomForestClassifier())  # Use the appropriate classifier
])

param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X, y)  # Fit on the transformed features




ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\sklearn\utils\_array_api.py", line 832, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "d:\Source Code\project\random-forest-predict\venv\lib\site-packages\pandas\core\series.py", line 1031, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'CUST_NO'


In [14]:
# Access best parameters and the model
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")
# Display the best score achieved by GridSearchCV
print("Best Score:", grid_search.best_score_)


Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}
Best estimator: Pipeline(steps=[('classifier', RandomForestClassifier(n_estimators=200))])
Best Score: 0.39101859722596555


In [None]:
print(X_test_transformed)

In [21]:
# Make predictions on the test set
y_pred = grid_search.predict(X_test_transformed)

# Evaluate the model's performance
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Model Accuracy: {accuracy * 100:.2f}%")

ValueError: X has 305 features, but RandomForestClassifier is expecting 2228 features as input.

In [18]:
import numpy as np
# Predicting new column names (ensure that new_column_names is in a suitable format)
new_column_names = ['CUST_TYPE', 'DCIF', 'cust_no']
new_column_names = [str(col) for col in new_column_names]  # Ensure all column names are strings

# Reshape new_column_names to 2D (1 sample per row)
new_column_names_reshaped = np.array(new_column_names).reshape(-1, 1)

new_column_names_reshaped_transformed = vectorizer.fit_transform(new_column_names_reshaped)  # Replace 'text_column' with your actual text column
new_column_names_reshaped_transformed
# Predict column names using the trained model
# predicted_names = grid_search.predict(new_column_names_reshaped_transformed)


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.pipeline import Pipeline

# Load dataset
dataFrame = pd.read_csv('dataset-clean.csv')

# Define your feature set and target
# Assuming 'X' contains the feature columns and 'y' contains the target column.
# Make sure your feature columns are of type 'string' for text-based features.
X = dataFrame['SOURCE_FIELD_NAME'].astype(str)  # Replace 'column_name' with the actual name of your feature column
y = dataFrame['FIELD_NAME']  # Replace 'target_column' with the actual name of your target column

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Reshape the data to 2D if it's a single column (1D)
X_train = X_train.values.reshape(-1, 1)  # Reshaping to 2D (number_of_samples, 1_feature)
X_test = X_test.values.reshape(-1, 1)  # Reshaping to 2D (number_of_samples, 1_feature)

# Create a pipeline that first transforms the text data and then applies a classifier
model = make_pipeline(
    SimpleImputer(strategy='most_frequent'),  # Handle missing values if any
    TfidfVectorizer(),  # Converts text into numerical features
    RandomForestClassifier(n_estimators=100, random_state=42)  # Random Forest Classifier
)

# Define parameter grid for grid search
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [10, 20, None],
    'randomforestclassifier__min_samples_split': [2, 5]
}

# Set up GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the model with grid search
grid_search.fit(X_train, y_train)

# Access best parameters and the model
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits




ValueError: 
All the 36 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 2104, in fit_transform
    X = super().fit_transform(raw_documents)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 1376, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 1263, in _count_vocab
    for feature in analyze(doc):
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 104, in _analyze
    doc = preprocessor(doc)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 62, in _preprocess
    doc = doc.lower()
AttributeError: 'numpy.ndarray' object has no attribute 'lower'


In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import numpy as np

# Load dataset
dataFrame = pd.read_csv('dataset-clean.csv')

# Define your feature set and target
# Assuming 'X' contains the feature columns and 'y' contains the target column.
# Make sure your feature columns are of type 'string' for text-based features.
X = dataFrame['SOURCE_FIELD_NAME'].astype(str)  # Replace 'column_name' with the actual name of your feature column
y = dataFrame['FIELD_NAME']  # Replace 'target_column' with the actual name of your target column

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Reshape the data to 2D if it's a single column (1D)
X_train = X_train.values.reshape(-1, 1)  # Reshaping to 2D (number_of_samples, 1_feature)
X_test = X_test.values.reshape(-1, 1)  # Reshaping to 2D (number_of_samples, 1_feature)

# Create a pipeline that first transforms the text data and then applies a classifier
model = make_pipeline(
    SimpleImputer(strategy='most_frequent'),  # Handle missing values if any
    TfidfVectorizer(),  # Converts text into numerical features
    RandomForestClassifier(n_estimators=100, random_state=42)  # Random Forest Classifier
)

# Define parameter grid for grid search
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [10, 20, None],
    'randomforestclassifier__min_samples_split': [2, 5]
}

# Set up GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the model with grid search
grid_search.fit(X_train, y_train)

# Access best parameters and the model
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

# Make predictions on the test set
y_pred = grid_search.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Predicting new column names (ensure that new_column_names is in a suitable format)
new_column_names = ['CUST_TYPE', 'DCIF', 'cust_no']
new_column_names = [str(col) for col in new_column_names]  # Ensure all column names are strings

# Reshape new_column_names to 2D (1 sample per row)
new_column_names_reshaped = np.array(new_column_names).reshape(-1, 1)

# Predict column names using the trained model
predicted_names = grid_search.predict(new_column_names_reshaped)

# Function to predict SQL data type based on the column name
def predict_data_type(column_name):
    column_name = column_name.lower()

    if 'account' in column_name or 'phn' in column_name or 'number' in column_name:
        return 'VARCHAR(255)'  # Account Number, Phone Number, etc. are typically VARCHAR
    elif 'date' in column_name or 'dob' in column_name:
        return 'DATE'  # Date fields like Date of Birth are DATE
    elif 'address' in column_name or 'street' in column_name:
        return 'VARCHAR(255)'  # Address fields are VARCHAR
    else:
        return 'TEXT'  # Default to TEXT if no specific pattern is found

# Predict standard names and data types for new column variations
predicted_column_names_and_types = []
for new_column, predicted_name in zip(new_column_names, predicted_names):
    predicted_type = predict_data_type(predicted_name)
    predicted_column_names_and_types.append((predicted_name, predicted_type))

# SQL CREATE TABLE generation based on predicted column names and types
def generate_create_table_query(predicted_column_names_and_types, table_name):
    sql_query = f"CREATE TABLE {table_name} (\n"

    for col_name, col_type in predicted_column_names_and_types:
        # Convert column name to lowercase with underscores (standard SQL style)
        sql_query += f"    {col_name.replace(' ', '_').lower()} {col_type} NOT NULL,\n"

    # Remove the trailing comma for the last column
    sql_query = sql_query.rstrip(',\n') + "\n);"

    return sql_query

# Generate the SQL CREATE TABLE query based on the predicted column names and types
table_name = "predicted_table"
create_table_query = generate_create_table_query(predicted_column_names_and_types, table_name)

# Output the predicted column names, data types, and the SQL query
print("\nPredicted Column Names and Types:")
for col_name, col_type in predicted_column_names_and_types:
    print(f"{col_name}: {col_type}")

print("\nGenerated SQL CREATE TABLE Query:")
print(create_table_query)


Fitting 3 folds for each of 12 candidates, totalling 36 fits




ValueError: 
All the 36 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 2104, in fit_transform
    X = super().fit_transform(raw_documents)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 1376, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 1263, in _count_vocab
    for feature in analyze(doc):
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 104, in _analyze
    doc = preprocessor(doc)
  File "c:\Users\Djanuar\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 62, in _preprocess
    doc = doc.lower()
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
