## EDA on Clean Data

In [16]:
import pandas as pd
import numpy as np

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

-----------------

In [18]:
df = pd.read_csv('CleanData/CleanData.csv', index_col=False)
df.sample(n=20, random_state=1)

Unnamed: 0,Year,LocationAbbr,Category,Response,StratificationCategory1,Stratification1,StratificationCategory2,Stratification2,IndicatorID
48151,2016,IN,Demographics,Widowed,Disability Type,Self-care Disability,,,MARRIED
693,2016,AR,Health Risks & Behaviors,Yes,Disability Status,No Disability,Sex,Male,QUITSMK
105060,2016,VI,Mental & Emotional Health,14+ Days,Disability Status,No Disability,Sex,Male,MHDAYS
307292,2019,PR,Mental & Emotional Health,14+ Days,Disability Type,Cognitive Disability,,,MHDAYS
206330,2017,SD,Chronic Conditions,No,Disability Status,No Disability,Race/Ethnicity,"White, non-Hispanic",OTHCAN
55632,2016,MA,Prevention & Screenings,No,Disability Status,Any Disability,Age Group,36-50,UTDCERV
696487,2022,TN,Prevention & Screenings,Yes,Disability Status,No Disability,,,FLUVAC
223369,2017,VA,Demographics,"$35,000 to <$50,000",Disability Type,Cognitive Disability,,,INCOMEN
741328,2022,PR,Mental & Emotional Health,No,Disability Status,Any Disability,Age Group,65+,DEPRESS
109925,2017,AL,Chronic Conditions,Yes,Disability Status,Any Disability,Age Group,18-44,COPD


In [19]:
df['Stratification1'].value_counts()

Stratification1
Any Disability                   278336
No Disability                    278336
Cognitive Disability              40781
Hearing Disability                40781
Mobility Disability               40781
Vision Disability                 40781
Self-care Disability              13842
Independent Living Disability     13842
Overall                            3592
Name: count, dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751072 entries, 0 to 751071
Data columns (total 9 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Year                     751072 non-null  int64 
 1   LocationAbbr             751072 non-null  object
 2   Category                 751072 non-null  object
 3   Response                 740296 non-null  object
 4   StratificationCategory1  751072 non-null  object
 5   Stratification1          751072 non-null  object
 6   StratificationCategory2  475110 non-null  object
 7   Stratification2          475110 non-null  object
 8   IndicatorID              751072 non-null  object
dtypes: int64(1), object(8)
memory usage: 51.6+ MB


In [21]:
df = df.sample(n=10000, random_state=42)

--------------------

In [22]:
X = df.drop('Stratification1', axis=1)
y = df['Stratification1']

In [23]:
categorical_cols = X.select_dtypes(include=['object']).columns

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ])

In [25]:
# Set up the overall pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Feature preprocessing
    ('classifier', LogisticRegression(random_state=42))  # Replace with your model
])

In [26]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Now, we need to apply LabelEncoding to the target variable (y)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [28]:
# Define hyperparameters to tune using GridSearchCV
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'classifier__penalty': ['l2', 'l1'],  # Type of regularization ('l2' is standard)
    'classifier__solver': ['liblinear', 'saga'],  # Solver options
    'classifier__max_iter': [10000]  # Max iterations for convergence
}



In [29]:
# Perform GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

In [30]:
# Fit the pipeline (this will apply OneHotEncoder to X_train and train the model)
grid_search.fit(X_train, y_train_encoded)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [31]:
# Get the best hyperparameters from GridSearchCV
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'classifier__C': 1, 'classifier__max_iter': 10000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}


In [32]:
# Evaluate the model (e.g., predicting on X_test)
y_pred = grid_search.predict(X_test)

In [33]:
accuracy = accuracy_score(y_test_encoded, y_pred)

In [34]:
# Print results
print("Predictions:", y_pred)
print("Test set accuracy:", accuracy)

Predictions: [5 5 5 ... 0 5 5]
Test set accuracy: 0.438
