# Training

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h2o

In [24]:
# Load the prepared table and discard the 'district' column.
df = pd.read_csv('final_dataframe.csv').drop(columns=['district'])

# Target dtype for each demographic column. Every listed column is cast to
# text, has its commas removed (presumably thousands separators), and is then
# converted to the numeric dtype given here.
numeric_dtypes = {
    'Native People': int,
    'Unemployment Rate': float,
    'Mean household income': int,
    'Families income below poverty': float,
    'Graduate or professional degree': float,
    'Hispanic': int,
    'Male': int,
    'Female': int,
}
for col_name, target_dtype in numeric_dtypes.items():
    as_text = df[col_name].astype(str)
    df[col_name] = as_text.str.replace(',', '').astype(target_dtype)

df.head()

Unnamed: 0,senate_candidate1,senate_candidate2,senate_total_votes,house_democrat,house_republican,house_total_votes,presidential_democrat,presidential_republican,presidential_total_votes,Native People,Unemployment Rate,Mean household income,Families income below poverty,Graduate or professional degree,Hispanic,Male,Female
0,920478,1392076,2316445,123233.0,67416,193940,849624,1441170,2323282,688852,6.2,68392,17.1,44204.0,715132,338340,376792
1,146068,16806,354587,128553.0,67866,263610,153778,189951,359530,678835,4.8,114201,7.2,60116.0,733406,385855,347551
2,28,1637661,3355307,179141.0,182336,361477,1672143,1661686,3387326,700868,3.6,146662,6.2,123560.0,804256,393913,410343
3,399390,793871,1193261,86887.0,147975,246446,423932,760647,1219069,742344,3.1,89344,10.2,67783.0,774027,375240,398787
4,6019422,5093942,11113364,168816.0,99819,268635,11110250,6006429,17500881,607938,4.3,163873,5.2,149338.0,781251,391721,389530


In [None]:
# Bring up (or attach to) the H2O cluster used for training
h2o.init()

# Response: Democratic share of the combined Democrat + Republican House vote.
# Predictors: the demographic columns only.
house_two_party_votes = df['house_democrat'] + df['house_republican']
df['house_democrat_ratio'] = df['house_democrat'] / house_two_party_votes
response = 'house_democrat_ratio'
predictors = [
    'Native People',
    'Unemployment Rate',
    'Mean household income',
    'Families income below poverty',
    'Graduate or professional degree',
    'Hispanic',
    'Male',
    'Female',
]

# Hand the pandas frame over to H2O
h2o_df = h2o.H2OFrame(df)

# Seeded split with a 0.8 ratio for the training part
train, test = h2o_df.split_frame(ratios=[0.8], seed=1234)

# AutoML search capped at 3600 seconds, fitted on the training part only
aml = h2o.automl.H2OAutoML(max_runtime_secs=3600, seed=1234)
aml.train(x=predictors, y=response, training_frame=train)

# Show the resulting model leaderboard
lb = aml.leaderboard
print(lb)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,11 mins 43 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,2 months and 25 days
H2O_cluster_name:,H2O_from_python_alonm_igx4gb
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.339 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
17:23:49.303: AutoML: XGBoost is not available; skipping it.
17:23:49.389: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 36.0.

██████████████████████████████████████████████████████████████| (done) 100%
model_id                                                              rmse        mse       mae     rmsle    mean_residual_deviance
StackedEnsemble_AllModels_6_AutoML_2_20241123_172349              0.214536  0.0460258  0.146439  0.15464                  0.0460258
DeepLearning_grid_1_AutoML_2_20241123_172349_model_7              0.21556   0.0464659  0.144972  0.152881                 0.0464659
DeepLearning_grid_1_AutoML_2_20241123_172349_model_5              0.216757  0.0469837  0.158228  0.153669                 0.0469837
StackedEnsemble_BestOfFamily_5_AutoML_2_20241123_172349           0

In [26]:
import os

# Directory that receives the exported models.
# NOTE(review): the folder name says "senate", but the leaderboard built in the
# AutoML cell uses 'house_democrat_ratio' as its response — confirm the intended
# name before renaming, since other code may load models from this path.
output_dir = 'senate_prediction_autoML'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Save the top models from the leaderboard (at most 10). The count is capped at
# the leaderboard length so a short leaderboard does not index past its last row.
n_models_to_save = min(10, lb.nrows)
for i in range(n_models_to_save):
    model = h2o.get_model(lb[i, 'model_id'])
    # force=True overwrites a previously saved copy of the same model.
    model_path = h2o.save_model(model=model, path=output_dir, force=True)
    print(f"Model {i+1} saved to: {model_path}")

Model 1 saved to: C:\Users\alonm\Documents\IIT\Fall semester\Online social network analysis\Project\Online-Social-Network-Analysis-Election-Prediction\senate_prediction_autoML\StackedEnsemble_AllModels_6_AutoML_2_20241123_172349
Model 2 saved to: C:\Users\alonm\Documents\IIT\Fall semester\Online social network analysis\Project\Online-Social-Network-Analysis-Election-Prediction\senate_prediction_autoML\DeepLearning_grid_1_AutoML_2_20241123_172349_model_7
Model 3 saved to: C:\Users\alonm\Documents\IIT\Fall semester\Online social network analysis\Project\Online-Social-Network-Analysis-Election-Prediction\senate_prediction_autoML\DeepLearning_grid_1_AutoML_2_20241123_172349_model_5
Model 4 saved to: C:\Users\alonm\Documents\IIT\Fall semester\Online social network analysis\Project\Online-Social-Network-Analysis-Election-Prediction\senate_prediction_autoML\StackedEnsemble_BestOfFamily_5_AutoML_2_20241123_172349
Model 5 saved to: C:\Users\alonm\Documents\IIT\Fall semester\Online social networ