# Capstone 3: Forecasting Solar Power Using LTSM   
### _"Exploring ML Techniques for Solar Predictions"_
Audrey Malloy

Date Updated: April 22nd, 2025

**Project Goal:** Be able to identify the most effective short-term solar power forecasting model using time series forecasting and deep learning techniques such as LTSM and XGBoost. 

### Objectives for Data Wrangling
#### 1. Handle Missing Data  
#### 2. Data Cleaning & Transformation
#### 3. Feature Engineering  

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
os.chdir('C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-3-Solar/Data')
plant_1_weather = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')
plant_1_generation = pd.read_csv('Plant_1_Generation_Data.csv')
plant_2_weather = pd.read_csv('Plant_2_Weather_Sensor_Data.csv')
plant_2_generation = pd.read_csv('Plant_2_Generation_Data.csv')


In [4]:
print(plant_1_weather.head())
print("------------------------------------------------------------")
print(plant_1_generation.head())

             DATE_TIME  PLANT_ID       SOURCE_KEY  AMBIENT_TEMPERATURE  \
0  2020-05-15 00:00:00   4135001  HmiyD2TTLFNqkNe            25.184316   
1  2020-05-15 00:15:00   4135001  HmiyD2TTLFNqkNe            25.084589   
2  2020-05-15 00:30:00   4135001  HmiyD2TTLFNqkNe            24.935753   
3  2020-05-15 00:45:00   4135001  HmiyD2TTLFNqkNe            24.846130   
4  2020-05-15 01:00:00   4135001  HmiyD2TTLFNqkNe            24.621525   

   MODULE_TEMPERATURE  IRRADIATION  
0           22.857507          0.0  
1           22.761668          0.0  
2           22.592306          0.0  
3           22.360852          0.0  
4           22.165423          0.0  
------------------------------------------------------------
          DATE_TIME  PLANT_ID       SOURCE_KEY  DC_POWER  AC_POWER  \
0  15-05-2020 00:00   4135001  1BY6WEcLGh8j5v7       0.0       0.0   
1  15-05-2020 00:00   4135001  1IF53ai7Xc0U56Y       0.0       0.0   
2  15-05-2020 00:00   4135001  3PZuoBAID5Wc2HD       0.0      

In [5]:
print(plant_2_weather.head())
print("------------------------------------------------------------")
print(plant_2_generation.head())

             DATE_TIME  PLANT_ID       SOURCE_KEY  AMBIENT_TEMPERATURE  \
0  2020-05-15 00:00:00   4136001  iq8k7ZNt4Mwm3w0            27.004764   
1  2020-05-15 00:15:00   4136001  iq8k7ZNt4Mwm3w0            26.880811   
2  2020-05-15 00:30:00   4136001  iq8k7ZNt4Mwm3w0            26.682055   
3  2020-05-15 00:45:00   4136001  iq8k7ZNt4Mwm3w0            26.500589   
4  2020-05-15 01:00:00   4136001  iq8k7ZNt4Mwm3w0            26.596148   

   MODULE_TEMPERATURE  IRRADIATION  
0           25.060789          0.0  
1           24.421869          0.0  
2           24.427290          0.0  
3           24.420678          0.0  
4           25.088210          0.0  
------------------------------------------------------------
             DATE_TIME  PLANT_ID       SOURCE_KEY  DC_POWER  AC_POWER  \
0  2020-05-15 00:00:00   4136001  4UPUqMRk7TRMgml       0.0       0.0   
1  2020-05-15 00:00:00   4136001  81aHJ1q11NBPMrL       0.0       0.0   
2  2020-05-15 00:00:00   4136001  9kRcWv60rDACzjR    

## Missing data

In [7]:
datasets = {
    "Plant 1 Weather": plant_1_weather,
    "Plant 2 Weather": plant_2_weather,
    "Plant 1 Generation": plant_1_generation,
    "Plant 2 Generation": plant_2_generation
}

for name, df in datasets.items():
    print(f"\nMissing values in {name}:")
    print(df.isnull().sum())



Missing values in Plant 1 Weather:
DATE_TIME              0
PLANT_ID               0
SOURCE_KEY             0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
dtype: int64

Missing values in Plant 2 Weather:
DATE_TIME              0
PLANT_ID               0
SOURCE_KEY             0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
dtype: int64

Missing values in Plant 1 Generation:
DATE_TIME      0
PLANT_ID       0
SOURCE_KEY     0
DC_POWER       0
AC_POWER       0
DAILY_YIELD    0
TOTAL_YIELD    0
dtype: int64

Missing values in Plant 2 Generation:
DATE_TIME      0
PLANT_ID       0
SOURCE_KEY     0
DC_POWER       0
AC_POWER       0
DAILY_YIELD    0
TOTAL_YIELD    0
dtype: int64


In [8]:
print(plant_1_weather.dtypes)

DATE_TIME               object
PLANT_ID                 int64
SOURCE_KEY              object
AMBIENT_TEMPERATURE    float64
MODULE_TEMPERATURE     float64
IRRADIATION            float64
dtype: object


In [9]:
plant_1_generation.head()

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,15-05-2020 00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0
1,15-05-2020 00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0
2,15-05-2020 00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0
3,15-05-2020 00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7602960.0
4,15-05-2020 00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.0,7158964.0


## Data Cleaning & Transformation

In [None]:
# Converting DATE_TIME to datetime 

In [11]:
for df in [plant_1_weather, plant_2_generation, plant_2_weather]:
    df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

In [12]:
plant_2_generation.head()

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,2020-05-15,4136001,4UPUqMRk7TRMgml,0.0,0.0,9425.0,2429011.0
1,2020-05-15,4136001,81aHJ1q11NBPMrL,0.0,0.0,0.0,1215279000.0
2,2020-05-15,4136001,9kRcWv60rDACzjR,0.0,0.0,3075.333333,2247720000.0
3,2020-05-15,4136001,Et9kgGMDl729KT4,0.0,0.0,269.933333,1704250.0
4,2020-05-15,4136001,IQ2d7wF4YD8zU1Q,0.0,0.0,3177.0,19941530.0


In [13]:
print(plant_1_generation['DATE_TIME'].head())

0    15-05-2020 00:00
1    15-05-2020 00:00
2    15-05-2020 00:00
3    15-05-2020 00:00
4    15-05-2020 00:00
Name: DATE_TIME, dtype: object


In [14]:
plant_1_generation['DATE_TIME'] = pd.to_datetime(plant_1_generation['DATE_TIME'], format='%d-%m-%Y %H:%M', errors='coerce')

In [15]:
print(plant_1_generation.head())
print()
print(plant_2_generation.head())

   DATE_TIME  PLANT_ID       SOURCE_KEY  DC_POWER  AC_POWER  DAILY_YIELD  \
0 2020-05-15   4135001  1BY6WEcLGh8j5v7       0.0       0.0          0.0   
1 2020-05-15   4135001  1IF53ai7Xc0U56Y       0.0       0.0          0.0   
2 2020-05-15   4135001  3PZuoBAID5Wc2HD       0.0       0.0          0.0   
3 2020-05-15   4135001  7JYdWkrLSPkdwr4       0.0       0.0          0.0   
4 2020-05-15   4135001  McdE0feGgRqW7Ca       0.0       0.0          0.0   

   TOTAL_YIELD  
0    6259559.0  
1    6183645.0  
2    6987759.0  
3    7602960.0  
4    7158964.0  

   DATE_TIME  PLANT_ID       SOURCE_KEY  DC_POWER  AC_POWER  DAILY_YIELD  \
0 2020-05-15   4136001  4UPUqMRk7TRMgml       0.0       0.0  9425.000000   
1 2020-05-15   4136001  81aHJ1q11NBPMrL       0.0       0.0     0.000000   
2 2020-05-15   4136001  9kRcWv60rDACzjR       0.0       0.0  3075.333333   
3 2020-05-15   4136001  Et9kgGMDl729KT4       0.0       0.0   269.933333   
4 2020-05-15   4136001  IQ2d7wF4YD8zU1Q       0.0       0.0 

In [16]:
plant_1_generation.dtypes

DATE_TIME      datetime64[ns]
PLANT_ID                int64
SOURCE_KEY             object
DC_POWER              float64
AC_POWER              float64
DAILY_YIELD           float64
TOTAL_YIELD           float64
dtype: object

In [17]:
plant_1_generation.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
DATE_TIME,68778.0,,,,2020-06-01 08:02:49.458256896,2020-05-15 00:00:00,2020-05-24 00:45:00,2020-06-01 14:30:00,2020-06-09 20:00:00,2020-06-17 23:45:00,
PLANT_ID,68778.0,,,,4135001.0,4135001.0,4135001.0,4135001.0,4135001.0,4135001.0,0.0
SOURCE_KEY,68778.0,22.0,bvBOhCH3iADSZry,3155.0,,,,,,,
DC_POWER,68778.0,,,,3147.426211,0.0,0.0,429.0,6366.964286,14471.125,4036.457169
AC_POWER,68778.0,,,,307.802752,0.0,0.0,41.49375,623.61875,1410.95,394.396439
DAILY_YIELD,68778.0,,,,3295.968737,0.0,0.0,2658.714286,6274.0,9163.0,3145.178309
TOTAL_YIELD,68778.0,,,,6978711.760671,6183645.0,6512002.53575,7146685.0,7268705.90625,7846821.0,416271.982856


In [18]:
plant_1_weather.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
DATE_TIME,3182.0,,,,2020-06-01 05:52:22.080452608,2020-05-15 00:00:00,2020-05-23 22:48:45,2020-06-01 09:52:30,2020-06-09 16:56:15,2020-06-17 23:45:00,
PLANT_ID,3182.0,,,,4135001.0,4135001.0,4135001.0,4135001.0,4135001.0,4135001.0,0.0
SOURCE_KEY,3182.0,1.0,HmiyD2TTLFNqkNe,3182.0,,,,,,,
AMBIENT_TEMPERATURE,3182.0,,,,25.531606,20.398505,22.705182,24.613814,27.920532,35.252486,3.354856
MODULE_TEMPERATURE,3182.0,,,,31.091015,18.140415,21.090553,24.61806,41.30784,65.545714,12.261222
IRRADIATION,3182.0,,,,0.228313,0.0,0.0,0.024653,0.449588,1.221652,0.300836


In [19]:
plant_2_generation.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
DATE_TIME,67698.0,,,,2020-06-01 10:44:33.650625024,2020-05-15 00:00:00,2020-05-23 21:00:00,2020-06-01 23:00:00,2020-06-09 23:30:00,2020-06-17 23:45:00,
PLANT_ID,67698.0,,,,4136001.0,4136001.0,4136001.0,4136001.0,4136001.0,4136001.0,0.0
SOURCE_KEY,67698.0,22.0,xoJJ8DcxJEcupym,3259.0,,,,,,,
DC_POWER,67698.0,,,,246.701961,0.0,0.0,0.0,446.591667,1420.933333,370.569597
AC_POWER,67698.0,,,,241.277825,0.0,0.0,0.0,438.215,1385.42,362.112118
DAILY_YIELD,67698.0,,,,3294.890295,0.0,272.75,2911.0,5534.0,9873.0,2919.448386
TOTAL_YIELD,67698.0,,,,658944788.423766,0.0,19964944.866667,282627587.0,1348495113.0,2247916295.0,729667771.073221


In [20]:
plant_2_weather.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
DATE_TIME,3259.0,,,,2020-06-01 00:04:35.053697536,2020-05-15 00:00:00,2020-05-23 12:07:30,2020-06-01 00:00:00,2020-06-09 12:07:30,2020-06-17 23:45:00,
PLANT_ID,3259.0,,,,4136001.0,4136001.0,4136001.0,4136001.0,4136001.0,4136001.0,0.0
SOURCE_KEY,3259.0,1.0,iq8k7ZNt4Mwm3w0,3259.0,,,,,,,
AMBIENT_TEMPERATURE,3259.0,,,,28.0694,20.942385,24.602135,26.981263,31.056757,39.181638,4.061556
MODULE_TEMPERATURE,3259.0,,,,32.772408,20.265123,23.716881,27.534606,40.480653,66.635953,11.344034
IRRADIATION,3259.0,,,,0.232737,0.0,0.0,0.01904,0.438717,1.098766,0.312693


In [21]:
# Change PLANT_ID to 1 or 2 
plant_1_weather['PLANT_ID'] = 1
plant_1_generation['PLANT_ID'] = 1
plant_2_weather['PLANT_ID'] = 2
plant_2_generation['PLANT_ID'] = 2

In [22]:
plant_1_weather = plant_1_weather.drop(['SOURCE_KEY'], axis=1)
plant_2_weather = plant_2_weather.drop(['SOURCE_KEY'], axis=1)

## Merging datasets

In [24]:
df_plant1 = pd.merge(plant_1_weather, plant_1_generation, on=['PLANT_ID', 'DATE_TIME'], how='inner')

print(df_plant1.head())

   DATE_TIME  PLANT_ID  AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  \
0 2020-05-15         1            25.184316           22.857507          0.0   
1 2020-05-15         1            25.184316           22.857507          0.0   
2 2020-05-15         1            25.184316           22.857507          0.0   
3 2020-05-15         1            25.184316           22.857507          0.0   
4 2020-05-15         1            25.184316           22.857507          0.0   

        SOURCE_KEY  DC_POWER  AC_POWER  DAILY_YIELD  TOTAL_YIELD  
0  1BY6WEcLGh8j5v7       0.0       0.0          0.0    6259559.0  
1  1IF53ai7Xc0U56Y       0.0       0.0          0.0    6183645.0  
2  3PZuoBAID5Wc2HD       0.0       0.0          0.0    6987759.0  
3  7JYdWkrLSPkdwr4       0.0       0.0          0.0    7602960.0  
4  McdE0feGgRqW7Ca       0.0       0.0          0.0    7158964.0  


In [27]:
df_plant2 = pd.merge(plant_2_weather, plant_2_generation, on=['PLANT_ID', 'DATE_TIME'], how='inner')

print(df_plant2.head())

   DATE_TIME  PLANT_ID  AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  \
0 2020-05-15         2            27.004764           25.060789          0.0   
1 2020-05-15         2            27.004764           25.060789          0.0   
2 2020-05-15         2            27.004764           25.060789          0.0   
3 2020-05-15         2            27.004764           25.060789          0.0   
4 2020-05-15         2            27.004764           25.060789          0.0   

        SOURCE_KEY  DC_POWER  AC_POWER  DAILY_YIELD   TOTAL_YIELD  
0  4UPUqMRk7TRMgml       0.0       0.0  9425.000000  2.429011e+06  
1  81aHJ1q11NBPMrL       0.0       0.0     0.000000  1.215279e+09  
2  9kRcWv60rDACzjR       0.0       0.0  3075.333333  2.247720e+09  
3  Et9kgGMDl729KT4       0.0       0.0   269.933333  1.704250e+06  
4  IQ2d7wF4YD8zU1Q       0.0       0.0  3177.000000  1.994153e+07  


In [34]:
df_plant1['SOURCE_KEY'].value_counts()

SOURCE_KEY
bvBOhCH3iADSZry    3155
1BY6WEcLGh8j5v7    3154
7JYdWkrLSPkdwr4    3133
VHMLBKoKgIrUVDU    3133
ZnxXDlPa8U1GXgE    3130
ih0vzX44oOqAx2f    3130
z9Y9gH1T5YWrNuG    3125
wCURE6d3bPkepu2    3125
uHbuxQJl8lW7ozc    3125
pkci93gMrogZuBj    3125
iCRJl6heRkivqQ3    3125
rGa61gmuvPhdLxV    3124
sjndEbLyjtCKgGv    3124
McdE0feGgRqW7Ca    3124
zVJPv84UY57bAof    3124
ZoEaEvLYb1n2sOq    3123
zBIq5rxdHJRwDNY    3119
1IF53ai7Xc0U56Y    3118
adLQvlD726eNBSB    3118
WRmjgnKYAwPKWDb    3118
3PZuoBAID5Wc2HD    3118
YxYtjZvoooNbGkE    3104
Name: count, dtype: int64

In [28]:
df_plant2['SOURCE_KEY'].value_counts()

SOURCE_KEY
xoJJ8DcxJEcupym    3259
WcxssY2VbP4hApt    3259
9kRcWv60rDACzjR    3259
vOuJvMaM2sgwLmb    3259
rrq4fwE8jgrTyWY    3259
LYwnQax7tkwH5Cb    3259
LlT2YUhhzqhg5Sw    3259
q49J1IKaHRwDQnt    3259
oZZkBaNadn6DNKz    3259
PeE6FRyGXUgsRhN    3259
81aHJ1q11NBPMrL    3259
V94E5Ben1TlhnDV    3259
oZ35aAeoifZaQzV    3195
4UPUqMRk7TRMgml    3195
Qf4GUc1pJu5T6c6    3195
Mx2yZCDsyf6DPfv    3195
Et9kgGMDl729KT4    3195
Quc1TzYxW2pYoWX    3195
mqwcsP2rE7J0TFp    2355
NgDl19wMapZy17u    2355
IQ2d7wF4YD8zU1Q    2355
xMbIugepa2P7lBB    2355
Name: count, dtype: int64

In [29]:
print(df_plant2.head())

   DATE_TIME  PLANT_ID  AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  \
0 2020-05-15         2            27.004764           25.060789          0.0   
1 2020-05-15         2            27.004764           25.060789          0.0   
2 2020-05-15         2            27.004764           25.060789          0.0   
3 2020-05-15         2            27.004764           25.060789          0.0   
4 2020-05-15         2            27.004764           25.060789          0.0   

        SOURCE_KEY  DC_POWER  AC_POWER  DAILY_YIELD   TOTAL_YIELD  
0  4UPUqMRk7TRMgml       0.0       0.0  9425.000000  2.429011e+06  
1  81aHJ1q11NBPMrL       0.0       0.0     0.000000  1.215279e+09  
2  9kRcWv60rDACzjR       0.0       0.0  3075.333333  2.247720e+09  
3  Et9kgGMDl729KT4       0.0       0.0   269.933333  1.704250e+06  
4  IQ2d7wF4YD8zU1Q       0.0       0.0  3177.000000  1.994153e+07  


In [30]:
print(df_plant1_cleaned.head())

             DATE_TIME  PLANT_ID  AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  \
0  2020-05-15 00:00:00         1            25.184316           22.857507   
21 2020-05-15 00:15:00         1            25.084589           22.761668   
42 2020-05-15 00:30:00         1            24.935753           22.592306   
63 2020-05-15 00:45:00         1            24.846130           22.360852   
84 2020-05-15 01:00:00         1            24.621525           22.165423   

    IRRADIATION       SOURCE_KEY  DC_POWER  AC_POWER  DAILY_YIELD  TOTAL_YIELD  
0           0.0  1BY6WEcLGh8j5v7       0.0       0.0          0.0    6259559.0  
21          0.0  1BY6WEcLGh8j5v7       0.0       0.0          0.0    6259559.0  
42          0.0  1BY6WEcLGh8j5v7       0.0       0.0          0.0    6259559.0  
63          0.0  1BY6WEcLGh8j5v7       0.0       0.0          0.0    6259559.0  
84          0.0  1BY6WEcLGh8j5v7       0.0       0.0          0.0    6259559.0  


In [36]:
df_combined = pd.concat([df_plant1, df_plant2], axis=0)

## Feature Engineering 


In [74]:
# Extract date, time, and hour
df_combined['DATE'] = df_combined['DATE_TIME'].dt.date
df_combined['TIME'] = df_combined['DATE_TIME'].dt.time
df_combined['HOUR'] = df_combined['DATE_TIME'].dt.hour

In [54]:
# Calculate cumulative yield per plant
df_combined['Cumulative_Yield'] = df_combined.groupby('PLANT_ID')['TOTAL_YIELD'].cumsum()

# Normalize yield across plants
df_combined['Normalized_Yield'] = df_combined['DAILY_YIELD'] / df_combined.groupby('PLANT_ID')['DAILY_YIELD'].transform('max')

In [58]:
df_combined['Inverter_Efficiency'] = (df_combined['AC_POWER'] / df_combined['DC_POWER']).replace([np.inf, -np.inf], 0).fillna(0)
# measures how well the inverter converts DC power to AC power

In [56]:
df_combined['Plant_Efficiency'] = df_combined.groupby('PLANT_ID')['Inverter_Efficiency'].transform('mean')

In [76]:
df_combined.head()

Unnamed: 0,DATE_TIME,PLANT_ID,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,Inverter_Efficiency,Cumulative_Yield,Normalized_Yield,Plant_Efficiency,DATE,TIME,HOUR
0,2020-05-15,1,25.184316,22.857507,0.0,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,0.0,6259559.0,0.0,0.052321,2020-05-15,00:00:00,0
1,2020-05-15,1,25.184316,22.857507,0.0,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0,0.0,12443204.0,0.0,0.052321,2020-05-15,00:00:00,0
2,2020-05-15,1,25.184316,22.857507,0.0,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0,0.0,19430963.0,0.0,0.052321,2020-05-15,00:00:00,0
3,2020-05-15,1,25.184316,22.857507,0.0,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7602960.0,0.0,27033923.0,0.0,0.052321,2020-05-15,00:00:00,0
4,2020-05-15,1,25.184316,22.857507,0.0,McdE0feGgRqW7Ca,0.0,0.0,0.0,7158964.0,0.0,34192887.0,0.0,0.052321,2020-05-15,00:00:00,0


In [78]:
print(df_combined.info())

<class 'pandas.core.frame.DataFrame'>
Index: 136472 entries, 0 to 67697
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   DATE_TIME            136472 non-null  datetime64[ns]
 1   PLANT_ID             136472 non-null  int64         
 2   AMBIENT_TEMPERATURE  136472 non-null  float64       
 3   MODULE_TEMPERATURE   136472 non-null  float64       
 4   IRRADIATION          136472 non-null  float64       
 5   SOURCE_KEY           136472 non-null  object        
 6   DC_POWER             136472 non-null  float64       
 7   AC_POWER             136472 non-null  float64       
 8   DAILY_YIELD          136472 non-null  float64       
 9   TOTAL_YIELD          136472 non-null  float64       
 10  Inverter_Efficiency  136472 non-null  float64       
 11  Cumulative_Yield     136472 non-null  float64       
 12  Normalized_Yield     136472 non-null  float64       
 13  Plant_Efficiency    

In [80]:
df_combined.isnull().sum()

DATE_TIME              0
PLANT_ID               0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
SOURCE_KEY             0
DC_POWER               0
AC_POWER               0
DAILY_YIELD            0
TOTAL_YIELD            0
Inverter_Efficiency    0
Cumulative_Yield       0
Normalized_Yield       0
Plant_Efficiency       0
DATE                   0
TIME                   0
HOUR                   0
dtype: int64

In [82]:
df_combined.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
DATE_TIME,136472.0,,,,2020-06-01 09:22:57.605662464,2020-05-15 00:00:00,2020-05-23 23:00:00,2020-06-01 18:45:00,2020-06-09 21:45:00,2020-06-17 23:45:00,
PLANT_ID,136472.0,,,,1.496058,1.0,1.0,1.0,2.0,2.0,0.499986
AMBIENT_TEMPERATURE,136472.0,,,,26.763066,20.398505,23.637604,25.908122,29.266583,39.181638,3.89734
MODULE_TEMPERATURE,136472.0,,,,31.920744,18.140415,22.411698,26.413755,40.778583,66.635953,11.803674
IRRADIATION,136472.0,,,,0.230767,0.0,0.0,0.026213,0.442961,1.221652,0.305652
SOURCE_KEY,136472.0,44.0,xoJJ8DcxJEcupym,3259.0,,,,,,,
DC_POWER,136472.0,,,,1708.373962,0.0,0.0,5.993333,1155.595,14471.125,3222.079306
AC_POWER,136472.0,,,,274.790259,0.0,0.0,3.493095,532.568571,1410.95,380.180214
DAILY_YIELD,136472.0,,,,3295.366192,0.0,28.285714,2834.642857,5992.0,9873.0,3035.313217
TOTAL_YIELD,136472.0,,,,330391576.972408,0.0,6520020.0,7269333.0,282609586.0,2247916295.0,608576923.107735


In [84]:
datapath = 'C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-3-Solar/Data'

df_cleaned_file = 'solar_cleaned.csv'

# Save Plant 1 Generation data
df_cleaned_path = os.path.join(datapath, df_cleaned_file)
df_combined.to_csv(df_cleaned_path, index=False)
print(f" data saved successfully to '{df_cleaned_path}'")



 data saved successfully to 'C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-3-Solar/Data\solar_cleaned.csv'
