<a href="https://colab.research.google.com/github/bhumikasharma01/Solar_Project_Week1/blob/main/Week_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -----------------------------------------
# Step 1: Import Libraries
# -----------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Show every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None


In [5]:
# -----------------------------------------
# Step 2: Data Collection (Load Dataset)
# -----------------------------------------
# The four Kaggle Solar Power Generation CSVs are read by bare filename,
# i.e. relative to the notebook's current working directory.

# Power generation data, one frame per plant
plant1_power = pd.read_csv("Plant_1_Generation_Data.csv")
plant2_power = pd.read_csv("Plant_2_Generation_Data.csv")

# Weather sensor data, one frame per plant
plant1_weather = pd.read_csv("Plant_1_Weather_Sensor_Data.csv")
plant2_weather = pd.read_csv("Plant_2_Weather_Sensor_Data.csv")

# Report (rows, columns) of each frame, in a fixed order
for shape_label, frame in (
    ("Plant 1 Power Shape:", plant1_power),
    ("Plant 2 Power Shape:", plant2_power),
    ("Plant 1 Weather Shape:", plant1_weather),
    ("Plant 2 Weather Shape:", plant2_weather),
):
    print(shape_label, frame.shape)

# Last expression of the cell: rich preview of the Plant 1 generation frame
plant1_power.head()


Plant 1 Power Shape: (68778, 7)
Plant 2 Power Shape: (67698, 7)
Plant 1 Weather Shape: (3182, 6)
Plant 2 Weather Shape: (3259, 6)


Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,15-05-2020 00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0
1,15-05-2020 00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0
2,15-05-2020 00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0
3,15-05-2020 00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7602960.0
4,15-05-2020 00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.0,7158964.0


In [16]:
# -----------------------------------------
# Step 3: DateTime Conversion
# -----------------------------------------
# The Plant 1 generation file stores DATE_TIME as "DD-MM-YYYY HH:MM"
# (e.g. "15-05-2020 00:00"); the other three frames are parsed with the
# "YYYY-MM-DD HH:MM:SS" layout. Explicit formats are used for every frame so
# day and month can never be swapped silently.
PLANT1_POWER_DATETIME_FORMAT = "%d-%m-%Y %H:%M"
ISO_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

# Previously missing: without this conversion plant1_power['DATE_TIME'] stays
# a string column on a fresh kernel and cannot be merged with the datetime
# column of plant1_weather.
plant1_power['DATE_TIME'] = pd.to_datetime(
    plant1_power['DATE_TIME'], format=PLANT1_POWER_DATETIME_FORMAT
)

plant1_weather['DATE_TIME'] = pd.to_datetime(
    plant1_weather['DATE_TIME'], format=ISO_DATETIME_FORMAT
)

plant2_power['DATE_TIME'] = pd.to_datetime(
    plant2_power['DATE_TIME'], format=ISO_DATETIME_FORMAT
)

plant2_weather['DATE_TIME'] = pd.to_datetime(
    plant2_weather['DATE_TIME'], format=ISO_DATETIME_FORMAT
)

# Sanity check: every DATE_TIME column must now be datetime64 (dtype kind "M")
assert all(
    frame['DATE_TIME'].dtype.kind == "M"
    for frame in (plant1_power, plant1_weather, plant2_power, plant2_weather)
), "DATE_TIME was not converted to datetime64 in every frame"

# Check few values to confirm
print("Plant 1 Power datetime sample:\n", plant1_power['DATE_TIME'].head())



Plant 1 Power datetime sample:
 0   2020-05-15
1   2020-05-15
2   2020-05-15
3   2020-05-15
4   2020-05-15
Name: DATE_TIME, dtype: datetime64[ns]


In [11]:
# Attach the weather reading with the same timestamp to each Plant 1
# generation row. DATE_TIME is the only join key, so columns present in both
# frames (PLANT_ID, SOURCE_KEY) come back with _x (power) / _y (weather)
# suffixes. The join is an inner join (the pandas default, spelled out here),
# so rows whose timestamp has no match in the other frame are dropped.
plant1_merged = pd.merge(
    left=plant1_power,
    right=plant1_weather,
    how="inner",
    on="DATE_TIME",
)

print("Merged Plant 1 Shape:", plant1_merged.shape)
print("Sample:\n", plant1_merged.head())


Merged Plant 1 Shape: (68774, 12)
Sample:
    DATE_TIME  PLANT_ID_x     SOURCE_KEY_x  DC_POWER  AC_POWER  DAILY_YIELD  \
0 2020-05-15     4135001  1BY6WEcLGh8j5v7       0.0       0.0          0.0   
1 2020-05-15     4135001  1IF53ai7Xc0U56Y       0.0       0.0          0.0   
2 2020-05-15     4135001  3PZuoBAID5Wc2HD       0.0       0.0          0.0   
3 2020-05-15     4135001  7JYdWkrLSPkdwr4       0.0       0.0          0.0   
4 2020-05-15     4135001  McdE0feGgRqW7Ca       0.0       0.0          0.0   

   TOTAL_YIELD  PLANT_ID_y     SOURCE_KEY_y  AMBIENT_TEMPERATURE  \
0    6259559.0     4135001  HmiyD2TTLFNqkNe            25.184316   
1    6183645.0     4135001  HmiyD2TTLFNqkNe            25.184316   
2    6987759.0     4135001  HmiyD2TTLFNqkNe            25.184316   
3    7602960.0     4135001  HmiyD2TTLFNqkNe            25.184316   
4    7158964.0     4135001  HmiyD2TTLFNqkNe            25.184316   

   MODULE_TEMPERATURE  IRRADIATION  
0           22.857507          0.0  
1    

In [13]:

# Keep the timestamp, power/yield and weather columns of the merged frame and
# derive calendar features from DATE_TIME. `.assign` returns a new frame, so
# plant1_merged itself is left untouched.
data = plant1_merged[[
    "DATE_TIME", "DC_POWER", "AC_POWER", "DAILY_YIELD", "TOTAL_YIELD",
    "AMBIENT_TEMPERATURE", "MODULE_TEMPERATURE", "IRRADIATION"
]].assign(
    hour=lambda frame: frame["DATE_TIME"].dt.hour,
    day=lambda frame: frame["DATE_TIME"].dt.day,
    month=lambda frame: frame["DATE_TIME"].dt.month,
)

print("Data shape after feature engineering:", data.shape)
data.head()




Data shape after feature engineering: (68774, 11)


Unnamed: 0,DATE_TIME,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,hour,day,month
0,2020-05-15,0.0,0.0,0.0,6259559.0,25.184316,22.857507,0.0,0,15,5
1,2020-05-15,0.0,0.0,0.0,6183645.0,25.184316,22.857507,0.0,0,15,5
2,2020-05-15,0.0,0.0,0.0,6987759.0,25.184316,22.857507,0.0,0,15,5
3,2020-05-15,0.0,0.0,0.0,7602960.0,25.184316,22.857507,0.0,0,15,5
4,2020-05-15,0.0,0.0,0.0,7158964.0,25.184316,22.857507,0.0,0,15,5


In [14]:
# Build the feature matrix / target vector and hold out 20% of rows for testing.
# train_test_split comes from the notebook's import cell; it is not
# re-imported here so all dependencies stay in one place.

# Features: every column of `data` except the raw timestamp and the target.
# NOTE(review): DC_POWER, DAILY_YIELD and TOTAL_YIELD remain in X -- confirm
# they would be available at prediction time for the intended use case.
X = data.drop(columns=["DATE_TIME", "AC_POWER"])
y = data["AC_POWER"]

print("X shape:", X.shape)
print("y shape:", y.shape)

# NOTE(review): rows are timestamped and the split is shuffled, so train and
# test rows are drawn from the same time periods -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# Invariant: the split must partition the rows without loss or duplication
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


X shape: (68774, 9)
y shape: (68774,)
Training set size: (55019, 9)
Testing set size: (13755, 9)
