In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [2]:
train_data = pd.read_csv('training_data.csv')
test_data = pd.read_csv('test_data.csv')

In [4]:
import pandas as pd

# Assuming train_data and test_data are already defined and include a column named 'date_index_converted'
# that looks something like 'day_1', 'day_2', etc.

# Step 1: Remove the 'day_' prefix and convert to integer, handling potential errors
train_data['date_int'] = train_data['date_index_converted'].str.replace('day_', '').apply(lambda x: pd.to_numeric(x, errors='coerce'))
test_data['date_int'] = test_data['date_index_converted'].str.replace('day_', '').apply(lambda x: pd.to_numeric(x, errors='coerce'))

# Optional: Convert to pandas' nullable integer type if you want to explicitly handle missing values
train_data['date_int'] = train_data['date_int'].astype('Int64')
test_data['date_int'] = test_data['date_int'].astype('Int64')


In [5]:
county_cols = ['date_int', 'county', 'cases', 'deaths', 'date_index_converted', 'county_data_length', 'total_pop', 'percent_25_34', 'percent_highschool', 'labor_force_rate', 'unemployment_rate', 'median_housing_cost', 'median_household_earnings', 'median_worker_earnings', 'percent_insured', 'percent_married', 'poverty_rate', 'median_property_value', 'percent_white']

In [6]:
corr = train_data[county_cols].corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,date_int,cases,deaths,county_data_length,total_pop,percent_25_34,percent_highschool,labor_force_rate,unemployment_rate,median_housing_cost,median_household_earnings,median_worker_earnings,percent_insured,percent_married,poverty_rate,median_property_value,percent_white
date_int,1.0,0.26,0.29,0.01,-0.01,0.01,-0.03,-0.01,0.01,-0.03,-0.03,-0.03,-0.02,-0.02,0.01,-0.02,0.01
cases,0.26,1.0,0.77,0.24,0.34,0.24,0.05,0.07,0.06,0.1,0.03,0.06,0.02,-0.19,0.03,0.06,-0.31
deaths,0.29,0.77,1.0,0.12,0.28,0.15,0.06,0.09,0.07,0.07,0.01,0.04,0.03,-0.18,0.02,0.04,-0.27
county_data_length,0.01,0.24,0.12,1.0,0.89,0.59,0.09,0.2,0.16,0.2,-0.0,0.11,0.02,-0.44,0.12,0.09,-0.75
total_pop,-0.01,0.34,0.28,0.89,1.0,0.63,0.17,0.29,0.14,0.35,0.11,0.22,0.07,-0.49,0.06,0.21,-0.88
percent_25_34,0.01,0.24,0.15,0.59,0.63,1.0,-0.05,0.01,0.18,0.11,-0.1,-0.03,-0.02,-0.55,0.19,-0.02,-0.66
percent_highschool,-0.03,0.05,0.06,0.09,0.17,-0.05,1.0,0.4,-0.24,0.42,0.42,0.48,0.86,0.04,-0.43,0.25,-0.23
labor_force_rate,-0.01,0.07,0.09,0.2,0.29,0.01,0.4,1.0,-0.47,0.62,0.65,0.71,0.12,0.28,-0.65,0.61,-0.2
unemployment_rate,0.01,0.06,0.07,0.16,0.14,0.18,-0.24,-0.47,1.0,-0.39,-0.64,-0.63,0.03,-0.68,0.77,-0.58,-0.23
median_housing_cost,-0.03,0.1,0.07,0.2,0.35,0.11,0.42,0.62,-0.39,1.0,0.89,0.85,0.23,0.16,-0.57,0.91,-0.33


In [7]:
len(county_cols)

19

In [8]:
county_cols_to_use = ['date_int', 'total_pop', 'deaths', 'county_data_length', 'percent_25_34', 'labor_force_rate', 'unemployment_rate', 'median_household_earnings']
len(county_cols_to_use)

8

In [9]:
social_features_string = 'core_cosine, core_cosine_normalized, core_intersection, core_intersection_nor malized, core_jaccard, core_jaccard_normalized, domestic_cosine, domestic_cos ine_normalized, domestic_intersection, domestic_intersection_normalized, dome stic_jaccard, domestic_jaccard_normalized, economy_cosine, economy_cosine_nor malized, economy_intersection, economy_intersection_normalized, economy_jacca rd, economy_jaccard_normalized, education_cosine, education_cosine_normalized , education_intersection, education_intersection_normalized, education_jaccar d, education_jaccard_normalized, entertainment_cosine, entertainment_cosine_n ormalized, entertainment_intersection, entertainment_intersection_normalized, entertainment_jaccard, entertainment_jaccard_normalized, foreign_cosine, fore ign_cosine_normalized, foreign_intersection, foreign_intersection_normalized, foreign_jaccard, foreign_jaccard_normalized, gender_cosine, gender_cosine_nor malized, gender_intersection, gender_intersection_normalized, gender_jaccard, gender_jaccard_normalized, health_cosine, health_cosine_normalized, health_in tersection, health_intersection_normalized, health_jaccard, health_jaccard_no rmalized, health_technology_cosine, health_technology_cosine_normalized, heal th_technology_intersection, health_technology_intersection_normalized, health _technology_jaccard, health_technology_jaccard_normalized, ideology_cosine, i deology_cosine_normalized, ideology_intersection, ideology_intersection_norma lized, ideology_jaccard, ideology_jaccard_normalized, illness_cosine, illness _cosine_normalized, illness_intersection, illness_intersection_normalized, il lness_jaccard, illness_jaccard_normalized, labor_force_rate, median_household _earnings, median_housing_cost, median_property_value, median_worker_earning, nationalistic_cosine, nationalistic_cosine_normalized, nationalistic_intersec tion, nationalistic_intersection_normalized, nationalistic_jaccard, nationali stic_jaccard_normalized, percent_25_34, percent_highschool, percent_insure, p ercent_married, percent_white, politics_cosine, politics_cosine_normalized, p olitics_democratic_hate_cosine, politics_democratic_hate_cosine_normalized, p olitics_democratic_hate_intersection, politics_democratic_hate_intersection_n ormalized, politics_democratic_hate_jaccard, politics_democratic_hate_jaccard _normalized, politics_democratic_love_cosine, politics_democratic_love_cosine _normalized, politics_democratic_love_intersection, politics_democratic_love_ intersection_normalized, politics_democratic_love_jaccard, politics_democrati c_love_jaccard_normalized, politics_intersection, politics_intersection_norma lized, politics_jaccard, politics_jaccard_normalized, politics_republican_hat e_cosine, politics_republican_hate_cosine_normalized, politics_republican_hat e_intersection, politics_republican_hate_intersection_normalized, politics_re publican_hate_jaccard, politics_republican_hate_jaccard_normalized, politics_ republican_love_cosine, politics_republican_love_cosine_normalized, politics_ republican_love_intersection, politics_republican_love_intersection_normalize d, politics_republican_love_jaccard, politics_republican_love_jaccard_normali zed, poverty_rate, race_cosine, race_cosine_normalized, race_intersection, ra ce_intersection_normalized, race_jaccard, race_jaccard_normalized, religion_c osine, religion_cosine_normalized, religion_intersection, religion_intersecti on_normalized, religion_jaccard, religion_jaccard_normalized, social_cosine, social_cosine_normalized, social_intersection, social_intersection_normalized , social_jaccard, social_jaccard_normalized, sports_cosine, sports_cosine_nor malized, sports_intersection, sports_intersection_normalized, sports_jaccard, sports_jaccard_normalized'

social_features_string = social_features_string.replace(" ","")
social_features = social_features_string.split(",")
social_features = [each for each in social_features_string.split(",")]

for i in range(len(social_features)):
    if social_features[i] == 'median_worker_earning':
        social_features[i] = 'median_worker_earnings'
    elif social_features[i] == 'percent_insure':
        social_features[i] = 'percent_insured'
    else:
        pass
len(social_features)

137

In [10]:
awareness_cols_to_use = ['core_intersection','health_technology_cosine_normalized', 'politics_democratic_hate_intersection', 'race_cosine']

In [11]:
cols_to_use = county_cols_to_use + awareness_cols_to_use

In [12]:
X = train_data[cols_to_use]
y = train_data['cases']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 256)

In [None]:
X

In [13]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print('Random Forest Regression -',r2_score(y_test, y_pred_rf))

Random Forest Regression - 0.8967105639697988


In [14]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

y_pred_gbr = gbr.predict(X_test)

print('Gradient Boosting Regression -',r2_score(y_test, y_pred_gbr))

Gradient Boosting Regression - 0.9117811321854435


In [15]:
import xgboost as xg

xgb = xg.XGBRegressor()

xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

print('XGBoost Regression -',r2_score(y_test, y_pred_xgb))

XGBoost Regression - 0.9127768979626822


In [16]:
from sklearn.ensemble import ExtraTreesRegressor

etr = ExtraTreesRegressor()

etr.fit(X_train, y_train)

y_pred_etr = etr.predict(X_test)

print('Extra Trees Regression -',r2_score(y_test, y_pred_etr))

Extra Trees Regression - 0.9718365607762725


In [17]:
gbr.fit(X,y)
xgb.fit(X,y)
etr.fit(X,y)

ExtraTreesRegressor()

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Assuming X and y are already defined:
# X = train_data[features]
# y = train_data['cases']

# Splitting the dataset into training and testing sets
# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1)
])

# Compiling the model
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mae'])

# Training the model
history = model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=100, batch_size=32)

# Evaluating the model
model.evaluate(X_test_scaled, y_test)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

[189577.421875, 106.21034240722656]

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Assuming your data is already preprocessed and split into X_train_scaled, X_test_scaled, y_train, and y_test

# Model architecture
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1)
])

# Compile the model
optimizer = Adam(learning_rate=0.01)  # Experiment with different learning rates
model.compile(optimizer=optimizer,
              loss='mean_squared_error',
              metrics=['mae'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train_scaled, y_train,
                    validation_split=0.2,
                    epochs=100,
                    batch_size=32,
                    callbacks=[early_stopping])

# Evaluate the model
model.evaluate(X_test_scaled, y_test)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


[173221.125, 90.55139923095703]

In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

model = Sequential([
    LSTM(256, input_shape=(1, X_train.shape[1]), return_sequences=True),  # Adjust input_shape accordingly
    Dropout(0.5),
    LSTM(128),
    Dropout(0.5),
    Dense(1)
])

# Compile, fit, and evaluate the model as before


In [67]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')


In [68]:
# Convert DataFrame to NumPy array
X_train_array = X_train.values  # or X_train.to_numpy()

# Now you can use .reshape() on the NumPy array
# Assuming you want to reshape for an LSTM layer expecting 1 timestep and multiple features
X_train_reshaped = X_train_array.reshape((X_train_array.shape[0], 1, X_train_array.shape[1]))

# Do the same for X_test if needed
X_test_array = X_test.values  # or X_test.to_numpy()
X_test_reshaped = X_test_array.reshape((X_test_array.shape[0], 1, X_test_array.shape[1]))


In [70]:
# Assuming X_train, y_train are your training data and labels, respectively
model = model((X_train.shape[0], X_train.shape[1]))  # Input shape is (timesteps, features)
history = model.fit(X_train, y_train, epochs=30, batch_size=72, validation_split=0.2, verbose=1)


ValueError: Layer "sequential_26" expects 1 input(s), but it received 2 input tensors. Inputs received: [<tf.Tensor: shape=(), dtype=int32, numpy=2512>, <tf.Tensor: shape=(), dtype=int32, numpy=12>]

In [None]:
# Evaluate your model's performance on the test set
model.evaluate(X_test, y_test)


In [40]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Assuming you have a DataFrame `df` with a column 'cases' as your target

# Step 1: Preprocess Your Data
# data = df['cases'].values
# data = data.reshape((-1, 1))  # Reshape for scaling
# scaler = MinMaxScaler(feature_range=(0, 1))
# data_normalized = scaler.fit_transform(data)

# Define your sequence length
sequence_length = 5

generator = TimeseriesGenerator(X_train, y_train, length=sequence_length, batch_size=1)

# Step 2: Build the LSTM Model
model = Sequential([
    LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
    Dense(1)
])

# Step 3: Compile and Train the Model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(generator, epochs=20)

# Step 4: Make Predictions (You'll need to prepare your test data similarly to how you prepared your training data)
# This is just a placeholder to show where predictions would be made
predicted_cases = model.predict(generator)  # Replace `generator` with your actual test data generator

# Inverse transform to get actual case numbers
predicted_cases = scaler.inverse_transform(predicted_cases)




ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [18]:
# gbr.fit(train_data[cols_to_use],y)
xgb.fit(train_data[cols_to_use],y)
# etr.fit(train_data[cols_to_use],y)

# test_pred = gbr.predict(test_data[cols_to_use])
test_pred = xgb.predict(test_data[cols_to_use])
# test_pred = etr.predict(test_data[cols_to_use])


out = pd.DataFrame(test_pred, columns=['Cases'])

for i in range(len(out)):
    if out['Cases'][i] < 0:
        out['Cases'][i] = 0

out['Index'] = out.index

out['Cases'] = np.floor(pd.to_numeric(out['Cases'], errors='coerce')).astype('Int64')

out = out[['Index', 'Cases']]
out

Unnamed: 0,Index,Cases
0,0,1
1,1,428
2,2,0
3,3,0
4,4,0
...,...,...
7326,7326,0
7327,7327,103
7328,7328,0
7329,7329,517


In [21]:
out.to_csv("Final.csv")

In [19]:
for i in range(len(xgb.feature_names_in_)):
    print(xgb.feature_names_in_[i], "       ",xgb.feature_importances_[i]) 

date_int         0.0019541488
total_pop         0.033885073
deaths         0.5588625
county_data_length         0.028772352
percent_25_34         0.28685516
labor_force_rate         0.009185454
unemployment_rate         0.0651842
median_household_earnings         0.008705766
core_intersection         0.00090186356
health_technology_cosine_normalized         0.0010103182
politics_democratic_hate_intersection         0.0041453126
race_cosine         0.0005378772


In [20]:
for i in range(len(xgb.feature_names_in_)):
    print(xgb.feature_names_in_[i], "       ",xgb.feature_importances_[i]) 

date_int         0.0019541488
total_pop         0.033885073
deaths         0.5588625
county_data_length         0.028772352
percent_25_34         0.28685516
labor_force_rate         0.009185454
unemployment_rate         0.0651842
median_household_earnings         0.008705766
core_intersection         0.00090186356
health_technology_cosine_normalized         0.0010103182
politics_democratic_hate_intersection         0.0041453126
race_cosine         0.0005378772
