In [5]:
%pip install seaborn
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the input CSV; every later step in this cell works from this frame.
data = pd.read_csv('colorado90.csv')

# Replace missing values with 0 whenever any column contains one.
# NOTE(review): the original author recorded that forward-fill, column-mean
# and column-mode imputation did not work in this notebook (the reason was
# not noted), so a constant 0 is used instead. Confirm that 0 is a sensible
# placeholder for every affected column.
has_missing = data.isna().any().any()
if has_missing:
    data = data.fillna(0)

# KWH_TOTAL is the regression target; every remaining column is a predictor.
X = data.drop(columns='KWH_TOTAL')
y = data['KWH_TOTAL']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=77,
)

# Build a 100-tree random forest and train it on the training partition
# (fit() returns the estimator itself, so construction and fitting chain).
model = RandomForestRegressor(n_estimators=100, random_state=77).fit(X_train, y_train)

# Per-feature importance scores taken from the fitted forest.
importances = model.feature_importances_

# Predict on the held-out rows and score those predictions.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (
    ("Mean Absolute Error (MAE): ", mae),
    ("R-squared (R2 score): ", r2),
):
    print(label, value)

# Rank the predictor columns by importance, highest first.
features = X.columns
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Report at most 13 features. Capping at the number of available columns
# avoids an IndexError when the frame has fewer than 13 predictors, and keeps
# the header consistent with the number of rows actually printed.
top_n = min(13, len(features))
print(f"Top {top_n} features:")
for rank, idx in enumerate(indices[:top_n], start=1):
    print(f"{rank}. {features[idx]}: {importances[idx]}")


import matplotlib.pyplot as plt

# Histogram of the target variable, drawn on pyplot's current axes
# (pyplot creates a figure and axes here if none exist yet).
ax = plt.gca()
ax.hist(data['KWH_TOTAL'], bins=30, edgecolor='black')
ax.set(
    title='Histogram of KWH_TOTAL',
    xlabel='KWH_TOTAL',
    ylabel='Frequency',
)
# Written to disk; the explicit plt.show() call is left disabled.
ax.figure.savefig('histogram.png')


# Actual vs. predicted scatter plot with a fitted trend line.

# Draw on a fresh figure. Without this, every pyplot call in the cell targets
# the same "current" axes, so the scatter would be painted on top of the
# histogram and saved that way into regressionLine.png.
fig, ax = plt.subplots()

# y_pred already holds model.predict(X_test) from the evaluation step above;
# it is reused here instead of running the prediction a second time.
ax.scatter(y_test, y_pred, color='blue')

# Least-squares straight line through the (actual, predicted) points.
m, b = np.polyfit(y_test, y_pred, 1)
# Two endpoints are enough to draw a straight line across the data range.
x_line = np.array([y_test.min(), y_test.max()])
ax.plot(x_line, m * x_line + b, color='red')

ax.set_title('Actual vs Predicted Values for KWH_TOTAL')
ax.set_xlabel('Actual KWH')
ax.set_ylabel('Predicted KWH')
fig.savefig('regressionLine.png')

# correlation matrix
import seaborn as sns

# Names of the 13 most important features (the slice simply yields fewer
# names if the data has fewer than 13 predictor columns).
top_features = [features[i] for i in indices[:13]]
# Restrict the data to those columns and compute their pairwise correlations.
data_top_features = data[top_features]
corr_matrix_top = data_top_features.corr()

# Give the heatmap its own figure and pass the axes explicitly. By default
# sns.heatmap draws on pyplot's current axes, which in this cell already hold
# the earlier plots, so correlationMatrix.png would contain them overlaid.
# The larger canvas leaves room for the annotated cells.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix_top, annot=True, fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Top Features')
fig.tight_layout()  # keep rotated tick labels inside the saved image
fig.savefig('correlationMatrix.png')

Mean Absolute Error (MAE):  2014.8880021276595
R-squared (R2 score):  0.4691731297352053
Top 13 features:
1. SQFTEST: 0.11934650961193075
2. TVCOLOR: 0.09925066336394758
3. NUMSMPHONE: 0.06927957839486358
4. DWASHUSE: 0.06289255814542863
5. PLAYSTA: 0.029348318074539918
6. INTSTREAM: 0.027212879744704148
7. OVEN: 0.02715281951938421
8. NUMFREEZ: 0.01883030216801456
9. CABLESAT: 0.01710921499142525
10. NHSLDMEM: 0.016720189553098473
11. LGTOUTNITE: 0.01661842668011275
12. TOTROOMS: 0.013442475110360886
13. BEDROOMS: 0.013295707849708965
