In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Load the merged state energy CSV and preview the first rows
pred_states_df = pd.read_csv(filepath_or_buffer="states_energy_merged.csv")
pred_states_df.head(n=5)

In [None]:
# Drop the consumption column.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
pred_states_df = pred_states_df.drop(columns=['consumption'], errors='ignore')

pred_states_df.head()

In [None]:
# Build the Illinois test frame: rows for Illinois from the year 2000 onward
from_2000 = pred_states_df['Year'] >= 2000
is_illinois = pred_states_df['State'] == 'Illinois'
test_df = pred_states_df.loc[from_2000 & is_illinois]
display(test_df.head())

# Show the 'production' value of the first row whose Year is 2020
production_2020 = test_df.loc[test_df['Year'] == 2020, 'production']
production_2020.values[0]

<h1>Simple Linear Regression Test for Illinois</h1>

In [None]:
# Forecast setup for the state of Illinois only:
# a simple linear regression of the 'production' column on 'Year'.

# Feature (X) and target (y), each as a single-column 2-D array
X = test_df['Year'].to_numpy().reshape(-1, 1)
y = test_df['production'].to_numpy().reshape(-1, 1)

for label, values in (('X', X), ('y', y)):
    print(f'{label} shape: {values.shape}')

In [None]:
# Plot production against year for Illinois.
# Title and axis labels let the figure stand on its own when the notebook is
# skimmed; the trailing ';' suppresses the artist repr in the cell output.
fig, ax = plt.subplots()
ax.scatter(X, y)
ax.set(title='Illinois Renewable Energy Production by Year',
       xlabel='Year', ylabel='production');

In [None]:
# Split into training and test sets (fixed random_state => same split on every run)
illinois_split = train_test_split(X, y, random_state=69)
X_train, X_test, y_train, y_test = illinois_split

In [None]:
# Build a Linear Regression (LR) model and fit it on the training split.
# fit() returns the estimator itself, so the assignment can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Calculate the model's scores (R2) on the training and the test split

trng_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {trng_score}")
# Report the test score here; the training score was previously printed twice.
print(f"Testing Score: {testing_score}")

In [None]:
# Predict the target for the held-out test rows

predicted = model.predict(X=X_test)

In [None]:
# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")


# First few test-set predictions (regression output, i.e. values, not classes)
predictions = model.predict(X_test[:5])
print(f"Predicted values: {predictions}")

# Point forecasts for future years; predict() expects a 2-D array of shape (n, 1)
n_2025 = model.predict([[2025]])
n_2030 = model.predict([[2030]])
n_2035 = model.predict([[2035]])

print()
print(f"Illinois Renewable Energy Production in 2020: {int(test_df.loc[test_df['Year'] == 2020, 'production'].values[0])}")

# Each forecast is printed under its own year (2035 was previously labelled 2030).
for year, forecast in ((2025, n_2025), (2030, n_2030), (2035, n_2035)):
    print()
    print(f'Prediction for {year}: {int(forecast[0][0])}')

<h1>Finding the Best R-Squared (R2) Scores</h1>

In [None]:
# Reload the merged state energy CSV so this section starts from the full data
pred_states_df = pd.read_csv(filepath_or_buffer="states_energy_merged.csv")

In [None]:

# Drop the consumption column.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
pred_states_df = pred_states_df.drop(columns=['consumption'], errors='ignore')

pred_states_df.head()

In [None]:
# Distinct state names, plus one (scores, averages) pair of empty lists per feature
pred_states = pred_states_df["State"].unique()

rp_r2_scores, rp_r2_averages = [], []      # Renewable Production
GDP_r2_scores, GDP_r2_averages = [], []    # GDP
pop_r2_scores, pop_r2_averages = [], []    # Population
ep_r2_scores, ep_r2_averages = [], []      # Energy Price

In [None]:
# Arithmetic mean helper
def mean(array):
    """Return the arithmetic mean of `array`.

    `array` must support both len() and sum(). An empty input raises
    ZeroDivisionError because the sum is divided by a length of zero.
    """
    count = len(array)
    total = sum(array)
    return total / count

In [None]:
# Create the Loop
#
# A `for` loop cannot continue across notebook cells, so every per-feature step
# (select X & y, split, fit, predict, score, store) runs inside this one cell.

# Target column -> list that collects its R2 scores (one per seed/state pair).
# NOTE(review): these column names are the ones produced by the rename applied
# in a later cell; confirm that the frame loaded for this section carries them.
feature_r2_scores = {
    'Prod. Renewable (Bil. Btu)': rp_r2_scores,  # Renewable Production
    'GDP (Millions)': GDP_r2_scores,             # Current-Dollar Gross Domestic Product (GDP)
    'Pop. (1000s)': pop_r2_scores,               # Population
    'Energy Pr. ($/mil. btu)': ep_r2_scores,     # Energy Price
}


def year_trend_r2(state_df, target_column, seed):
    """Fit `target_column` against 'Year' and return the test-set R2.

    Parameters
    ----------
    state_df : DataFrame with a 'Year' column and `target_column`.
    target_column : name of the column regressed on 'Year'.
    seed : forwarded to train_test_split as random_state, so it picks the split.

    Returns
    -------
    r2_score of the LinearRegression predictions on the held-out rows.
    """
    x = state_df[['Year']].values.reshape(-1, 1)
    y = state_df[[target_column]].values.reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)

    model = LinearRegression()
    model.fit(x_train, y_train)
    return r2_score(y_test, model.predict(x_test))


# Start from empty lists so re-running the cell does not pile up duplicates.
for scores in feature_r2_scores.values():
    scores.clear()

for n in range(101):
    for state in pred_states:
        # Filter into a new frame: pred_states_df has to stay intact, otherwise
        # the next state would be filtered out of an already-filtered frame.
        state_df = pred_states_df.loc[(pred_states_df["State"] == state) & (pred_states_df["Year"] >= 1970)]

        # Populate each feature's r2 score list for averaging later
        for target_column, scores in feature_r2_scores.items():
            scores.append(year_trend_r2(state_df, target_column, n))

In [None]:
# Load the data set for the following cells
# NOTE(review): the file name looks like an unfilled placeholder — confirm the real path.
pred_states_df = pd.read_csv(filepath_or_buffer="Resources/????????.csv")

In [None]:
# Drop the 'Unnamed: 0' column and shorten the column names.
# The drop has to target the DataFrame: pred_states is the array of state
# names and has no .drop(). errors='ignore' keeps the cell re-runnable.
pred_states_df = (
    pred_states_df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={'Produced Renewable(Billion Btu)': 'Prod. Renewable (Bil. Btu)',
                     'Total Consumed(Billion Btu)': 'Total Consumed (Bil. Btu)',
                     'GDP(Million dollars)': 'GDP (Millions)',
                     'Population(Thousand)': 'Pop. (1000s)',
                     'Energy Price(dollars per million btu)': 'Energy Pr. ($/mil. btu)'})
)

In [None]:
# Append average R2 scores for all features to their respective lists.
#
# The seed loop is the outer loop and the state loop the inner one, so the
# score lists hold one consecutive run of len(pred_states) entries per seed.
# Averaging run by run yields one average per seed, which is what the
# "best seed" search expects (list index == seed).
states_per_seed = len(pred_states) or 1  # `or 1` avoids a zero step in range()

for scores, averages in (
    (rp_r2_scores, rp_r2_averages),
    (GDP_r2_scores, GDP_r2_averages),
    (pop_r2_scores, pop_r2_averages),
    (ep_r2_scores, ep_r2_averages),
):
    averages.clear()  # re-running the cell must not duplicate entries
    for start in range(0, len(scores), states_per_seed):
        averages.append(mean(scores[start:start + states_per_seed]))

In [None]:
def report_best_seed(feature_name, score_name, averages):
    """Print every entry of `averages` that equals the best (maximum) average.

    Parameters
    ----------
    feature_name : label used in the "Best <feature_name> R2 Seed" part.
    score_name : label used in the "Best <score_name> R2 Score" part.
    averages : list of average R2 scores; the list index is reported as the
        seed, which holds when the list has one entry per seed starting at 0.

    Prints nothing for an empty list. Ties are all reported.
    """
    if not averages:
        return

    best = max(averages)  # computed once instead of on every iteration
    for seed, score in enumerate(averages):
        if score == best:
            print(f"Best {feature_name} R2 Seed: {seed}, Best {score_name} R2 Score: {score}")


# Finding best Renewable Production R2 Score
report_best_seed("Renewable Production", "RP", rp_r2_averages)

# Finding best GDP R2 Score
report_best_seed("GDP", "GDP", GDP_r2_averages)

# Finding best Population R2 Score
report_best_seed("Population", "Population", pop_r2_averages)

# Finding best Energy Price R2 Score
report_best_seed("Energy Price", "Energy Price", ep_r2_averages)

<h1>Multiple Linear Regression to Predict State Energy Difference</h1>

In [None]:
# Load the data set used for the per-state forecasts
# NOTE(review): the file name looks like an unfilled placeholder — confirm the real path.
pred_states_df = pd.read_csv(filepath_or_buffer="Resources/????????.csv")

In [None]:
# Drop the 'Unnamed: 0' column and shorten the column names.
# The drop has to target the DataFrame (pred_states_df), not pred_states.
# errors='ignore' keeps the cell re-runnable.
pred_states_df = (
    pred_states_df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={'Produced Renewable(Billion Btu)': 'Prod. Renewable (Bil. Btu)',
                     'Total Consumed(Billion Btu)': 'Total Consumed (Bil. Btu)',
                     'GDP(Million dollars)': 'GDP (Millions)',
                     'Population(Thousand)': 'Pop. (1000s)',
                     'Energy Price(dollars per million btu)': 'Energy Pr. ($/mil. btu)'})
)

In [None]:
states = pred_states_df["State"].unique()

# Years to forecast. Every column uses this same range so the lists line up
# when each state's dictionary is turned into a DataFrame.
forecast_years = list(range(2021, 2035))

# Columns to forecast, in the order they appear in each state's dictionary.
forecast_columns = [
    'Prod. Renewable (Bil. Btu)',   # Renewable Production
    'Pop. (1000s)',                 # Population
    'Energy Pr. ($/mil. btu)',      # Energy Price
    'Total Consumed (Bil. Btu)',    # Consumption
    'Difference',                   # Energy Difference
]


def forecast_column(state_df, column, years, seed=14):
    """Fit `column` against 'Year' and return predictions for `years`.

    Parameters
    ----------
    state_df : DataFrame with a 'Year' column and `column`.
    column : name of the column to forecast.
    years : iterable of years to predict.
    seed : random_state for train_test_split; the model is fitted on the
        training part of that split only.

    Returns
    -------
    list of Python floats, one prediction per entry of `years`.
    """
    x = state_df['Year'].values.reshape(-1, 1)
    y = state_df[column].values.reshape(-1, 1)
    x_train, _x_test, y_train, _y_test = train_test_split(x, y, random_state=seed)

    model = LinearRegression()
    model.fit(x_train, y_train)

    # Predict all years in one call; ravel().tolist() yields plain floats
    # without calling float() on a 2-D array.
    future_years = np.asarray(list(years)).reshape(-1, 1)
    return model.predict(future_years).ravel().tolist()


state_dicts = {}

for state in states:

    df = pred_states_df.loc[pred_states_df['State'] == state]

    forecast = {
        'State': [state] * len(forecast_years),
        'Year': list(forecast_years),
    }
    # Each column is fitted on this state's own rows.
    for column in forecast_columns:
        forecast[column] = forecast_column(df, column, forecast_years)

    # Update States Dictionary
    state_dicts[state] = forecast

# Later cells refer to the dictionary as pred_state_dicts; keep both names bound.
pred_state_dicts = state_dicts
pred_state_dicts

In [None]:
# Create Dictionary of DataFrames (one per state) from the forecast dictionaries
dfs = {state: pd.DataFrame(forecast) for state, forecast in state_dicts.items()}

# Combine the existing rows with every state's forecast in a single concat.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copies the growing frame on every iteration.
pred_state_df = pd.concat([pred_states_df, *dfs.values()], ignore_index=True)


pred_state_df = pred_state_df.sort_values(by=["State","Year"])
pred_state_df.head()

In [None]:
# Rows for Wisconsin; the mask is built from the same frame it filters.
# A dedicated name avoids reusing X, which holds the regression features elsewhere.
wisconsin_df = pred_state_df.loc[pred_state_df['State'] == "Wisconsin"]

# Plot the data
fig, ax = plt.subplots()
ax.scatter(wisconsin_df['Year'], wisconsin_df['Total Consumed (Bil. Btu)'])
ax.set(title='Wisconsin: Total Consumed by Year',
       xlabel='Year', ylabel='Total Consumed (Bil. Btu)');

In [None]:
# Keep only the columns of interest, in this order
# NOTE(review): this selects from pred_states_df, not from the combined
# pred_state_df built above — confirm which frame is meant.
# NOTE(review): the target name has a doubled 's' (pred_statess_df) — likely a typo.
export_columns = [
    'State',
    'Year',
    'Prod. Renewable (Bil. Btu)',
    'Pop. (1000s)',
    'Energy Pr. ($/mil. btu)',
    'Total Consumed (Bil. Btu)',
    'Difference',
]
pred_statess_df = pred_states_df.loc[:, export_columns]

In [None]:
# Write the column-selected frame built in the previous cell; no frame named
# states_df is created anywhere in this notebook.
# The path no longer contains literal double-quote characters.
# NOTE(review): the file name still looks like an unfilled placeholder — confirm the real path.
pred_statess_df.to_csv('Resources/&&&&&&&&&.csv', float_format='%.1f')