In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the raw solar readings into a DataFrame. The path is relative to the
# current working directory; point it at your own copy of the file if needed.
source_csv = 'dated_solar_data.csv'
data = pd.read_csv(source_csv)

# Preview the top rows to get a feel for the columns and their values.
print("Original Data:")
preview_rows = data.head()
print(preview_rows)

# Data Preparation

# 1. 'DATE_TIME' is deliberately left in its existing form: this step only
# prints the column and performs no datetime conversion.
# NOTE: the label below still mentions a conversion; the text is kept as-is so
# the printed output does not change.
print("\nBefore DateTime Conversion:")
date_time_preview = data['DATE_TIME'].head()
print(date_time_preview)  # First rows of the untouched 'DATE_TIME' column

# 2. Report how many missing entries each column has before any filling.
print("\nMissing Values Before Handling:")
missing_before = data.isnull().sum()
print(missing_before)

# 3. Fill the gaps in each measurement column with that column's own median
# (one example strategy; other imputation choices are possible).
for measurement in ('DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD'):
    column_values = data[measurement]
    data[measurement] = column_values.fillna(column_values.median())

# Re-count to confirm the fill left no gaps in those columns.
print("\nMissing Values After Handling:")
missing_after = data.isnull().sum()
print(missing_after)

# 4. Count the rows that exactly repeat an earlier row across every column.
print("\nDuplicate Rows Before Removing:")
duplicates_before = data.duplicated().sum()
print(duplicates_before)

# 5. Drop the repeated rows; if there are none, the rows are left unchanged.
data = data.drop_duplicates()

# Count again to confirm no repeated rows remain.
print("\nDuplicate Rows After Removing:")
duplicates_after = data.duplicated().sum()
print(duplicates_after)

# 6. Look for obviously inconsistent readings: count the values below zero in
# the power and daily-yield columns.
non_negative_columns = ['DC_POWER', 'AC_POWER', 'DAILY_YIELD']
print("\nNegative Values Check Before Handling:")
print((data[non_negative_columns] < 0).sum())

# Raise every negative reading to 0; values already >= 0 are left untouched.
data[non_negative_columns] = data[non_negative_columns].clip(lower=0)

# Count again to confirm no negative values remain in those columns.
print("\nNegative Values Check After Handling:")
print((data[non_negative_columns] < 0).sum())

# 7. Make sure the measurement columns are stored as floats.
print("\nData Types Before Conversion:")
print(data.dtypes)

# Cast one column at a time so each is replaced by its float version.
for measurement in ('DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD'):
    data[measurement] = data[measurement].astype(float)

# Print the dtypes again to verify the casts took effect.
print("\nData Types After Conversion:")
print(data.dtypes)

# Final Data Review: show the top rows now that the cleaning steps have run.
print("\nCleaned Data:")
print(data.head())

# Optionally persist the cleaned data to a new CSV file; index=False keeps the
# DataFrame's row index out of the written file.
cleaned_csv = 'cleaned_data_solar1.csv'
data.to_csv(cleaned_csv, index=False)


Original Data:
  LOCATION            DATE_TIME  SENSOR_ID  DC_POWER  AC_POWER  DAILY_YIELD  \
0        A  2020-05-15 00:00:00   sensor 1       0.0       0.0          0.0   
1        A  2020-05-15 00:00:00   sensor 2       0.0       0.0          0.0   
2        A  2020-05-15 00:00:00   sensor 3       0.0       0.0          0.0   
3        A  2020-05-15 00:00:00   sensor 5       0.0       0.0          0.0   
4        A  2020-05-15 00:00:00  sensor 12       0.0       0.0          0.0   

   TOTAL_YIELD  
0    6259559.0  
1    6183645.0  
2    6987759.0  
3    7602960.0  
4    7158964.0  

Before DateTime Conversion:
0    2020-05-15 00:00:00
1    2020-05-15 00:00:00
2    2020-05-15 00:00:00
3    2020-05-15 00:00:00
4    2020-05-15 00:00:00
Name: DATE_TIME, dtype: object

Missing Values Before Handling:
LOCATION       0
DATE_TIME      0
SENSOR_ID      0
DC_POWER       0
AC_POWER       0
DAILY_YIELD    0
TOTAL_YIELD    0
dtype: int64

Missing Values After Handling:
LOCATION       0
DATE_TIME