# Data Normalization and one-hot encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import mysql.connector

In [18]:
# Connect to MySQL database
conn = mysql.connector.connect(
    host="localhost",
    user="monty",
    password="sushiSQL",
    database="HealthCareAnalytics"
)
cursor = conn.cursor()

# Read data from the MySQL table into a pandas DataFrame
query = "SELECT * FROM cdc_cigar_use"
df = pd.read_sql(query, conn)

# Define numerical and categorical columns
numerical_cols = ['YEAR', 'Greater_Risk_Data_Value', 'Greater_Risk_Low_Confidence_Limit', 'Greater_Risk_High_Confidence_Limit', 'Lesser_Risk_Data_Value', 'Lesser_Risk_Low_Confidence_Limit', 'Lesser_Risk_High_Confidence_Limit', 'Sample_Size']
categorical_cols = ['LocationAbbr', 'LocationDesc', 'DataSource', 'Topic', 'Subtopic', 'ShortQuestionText', 'Greater_Risk_Question', 'Description', 'Data_Value_Symbol', 'Data_Value_Type', 'Greater_Risk_Data_Value_Footnote_Symbol', 'Greater_Risk_Data_Value_Footnote', 'Lesser_Risk_Question', 'Lesser_Risk_Data_Value_Footnote_Symbol', 'Lesser_Risk_Data_Value_Footnote', 'Sex', 'Race', 'Grade', 'SexualIdentity', 'SexOfSexualContacts', 'GeoLocation', 'TopicId', 'SubTopicID', 'QuestionCode', 'LocationId', 'StratID1', 'StratID2', 'StratID3', 'StratID4', 'StratID5', 'StratificationType', 'StratID6']

if not df.empty and categorical_cols:
    # Initialize MinMaxScaler
    scaler = MinMaxScaler()
    
    # Fit and transform numerical columns if they exist in the DataFrame
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Perform one-hot encoding for categorical columns
    encoder = OneHotEncoder(handle_unknown='ignore')
    df_encoded = encoder.fit_transform(df[categorical_cols])

    # Convert the sparse matrix to a dense array
    df_encoded_dense = df_encoded.toarray()

    # Concatenate the scaled numerical columns and the one-hot encoded categorical columns
    df_processed = pd.concat([df[numerical_cols], pd.DataFrame(df_encoded_dense)], axis=1)


    # Update the MySQL table with the normalized and one-hot encoded data
    for _, row in df_processed.iterrows():
        update_query = (
            "UPDATE cdc_cigar_use SET "
            "LocationAbbr = %s, LocationDesc = %s, DataSource = %s, "
            "Topic = %s, Subtopic = %s, ShortQuestionText = %s, Greater_Risk_Question = %s, "
            "Description = %s, Data_Value_Symbol = %s, Data_Value_Type = %s, "
            "Greater_Risk_Data_Value = %s, Greater_Risk_Data_Value_Footnote_Symbol = %s, "
            "Greater_Risk_Data_Value_Footnote = %s, Greater_Risk_Low_Confidence_Limit = %s, "
            "Greater_Risk_High_Confidence_Limit = %s, Lesser_Risk_Question = %s, "
            "Lesser_Risk_Data_Value = %s, Lesser_Risk_Data_Value_Footnote_Symbol = %s, "
            "Lesser_Risk_Data_Value_Footnote = %s, Lesser_Risk_Low_Confidence_Limit = %s, "
            "Lesser_Risk_High_Confidence_Limit = %s, Sample_Size = %s, Sex = %s, Race = %s, "
            "Grade = %s, SexualIdentity = %s, SexOfSexualContacts = %s, GeoLocation = %s, "
            "TopicId = %s, SubTopicID = %s, QuestionCode = %s, LocationId = %s, StratID1 = %s, "
            "StratID2 = %s, StratID3 = %s, StratID4 = %s, StratID5 = %s, StratificationType = %s, "
            "StratID6 = %s "
            "WHERE YEAR = %s"  # Assuming YEAR is the primary key column name
        )
        # Extract the YEAR from the row
        row_year = row['YEAR']
        # Remove the YEAR from the row before passing it to cursor.execute()
        del row['YEAR']
        cursor.execute(update_query, tuple(row.values.tolist()) + (row_year,))

else:
    print("DataFrame is empty or numerical_cols or categorical_cols are not specified.")

# Commit the changes and close the connection
conn.commit()
conn.close()

print("Data updated successfully!")


  df = pd.read_sql(query, conn)


ProgrammingError: Not all parameters were used in the SQL statement