## Step 1: Read and Combine Data
1. **Read Files**: Load the data from each file in the `data` folder.
2. **Combine Data**: Concatenate all the data files into a single DataFrame or CSV file for easier analysis.


In [None]:
import pandas as pd
import os
import glob

# Folder containing the raw data files (one CSV per file, name starts with ddmmyy)
folder_path = "C:/Users/HP/Desktop/Research Internship/Data2"

# Normalise lower-case ".csv" extensions to ".CSV" so the loader below only has
# to look for a single suffix. NOTE: this renames the files on disk.
for file in glob.glob(f"{folder_path}/*.csv"):
    new_name = file[:-4] + ".CSV"
    if new_name == file:
        # A case-insensitive glob match can return files that already end in
        # ".CSV"; there is nothing to rename (or report) for those.
        continue
    os.rename(file, new_name)
    print(f"Renamed: {file} -> {new_name}")

# One DataFrame per input file; concatenated after the loop
dataframes = []

# os.listdir returns entries in arbitrary order, so sort the listing to make
# the row order of the combined file reproducible between runs.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.CSV'):
        continue  # Not a data file
    print(f"Processing file: {file_name}")
    file_path = os.path.join(folder_path, file_name)

    # The first six characters of the file name are parsed as a ddmmyy date;
    # a name that does not start with a valid date raises here.
    date_part = file_name[:6]
    date = pd.to_datetime(date_part, format='%d%m%y').date()

    # Read the CSV file, skipping malformed lines instead of failing on them
    df = pd.read_csv(file_path, on_bad_lines='skip')
    # Drop rows where ALL values are missing
    df.dropna(how='all', inplace=True)
    # Tag every row with the date taken from the file name
    df['Date'] = date

    dataframes.append(df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail early with a message that names the folder instead.
if not dataframes:
    raise ValueError(f"No .CSV files found in {folder_path}")

# Combine all DataFrames into a single DataFrame with a fresh 0..n-1 index
data = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame
data.to_csv("C:/Users/HP/Desktop/Research Internship/combined_data.csv", index=False)
print("Data combined and saved as 'combined_data.csv'")

# Step 2: Explore and Preprocess Data
1. **Explore Data**: Analyze the combined data to understand its structure, distribution, and any patterns.
2. **Preprocess Data**: Clean and preprocess the data as needed (e.g., handle missing values, normalize features).
## 2.1 Data Analysis for Photovoltaic System

### 1. Attributes

- **Time**: The time at which the measurements were recorded (format: HH:MM).
- **Irradiance (W/m²)**: The power of solar radiation per unit area, measured in watts per square meter.
- **T1 to T8 (°C)**: Temperatures recorded at different points or sensors, measured in degrees Celsius.
- **TA (°C)**: Ambient temperature, representing the surrounding environmental temperature.
- **Humidity (%)**: The relative humidity of the air, given as a percentage.
- **V1 to V8 (V)**: Voltage readings from different points or sensors, measured in volts.
- **PV_Current (A)**: The current generated by the photovoltaic (PV) system, measured in amperes.
- **AC Voltage (V)**: The alternating current (AC) voltage, measured in volts.
- **AC Current (A)**: The current in the alternating-current (AC) circuit, measured in amperes.
- **AC Power (W)**: The power output of the AC system, measured in watts.

### 2. Check for Data Validity and Units

We'll check if the time values and other data entries are correct and ensure that each attribute has the proper unit label in the dataset.

### 3. Analyze Relationships Between Attributes

To explore how each attribute relates to others, we can use graphs to visualize these relationships. We will look at correlations such as:
- **Time vs. Irradiance**: To see if there's a pattern of solar radiation over time.
- **Irradiance vs. PV Current**: To understand how irradiance affects the current generated.
- **Temperature vs. Voltage**: To observe any dependencies of voltage on temperature.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the combined data
data = pd.read_csv('combined_data.csv')

# Summarise dtypes / non-null counts, then discard rows that are entirely empty
print("Data Info:")
print(data.info())
data = data.dropna(how='all')

# Parse the Time column as HH:MM and keep only the time-of-day part.
# Values that do not match the format are coerced to missing rather than raising;
# any other failure (e.g. no 'Time' column) is reported and the column is left as is.
try:
    parsed_times = pd.to_datetime(data['Time'], format='%H:%M', errors='coerce')
    data['Time'] = parsed_times.dt.time
except Exception as e:
    print(f"Error in time conversion: {e}")

# Numbered sensor channels: T1..T8 and V1..V8
SENSOR_IDS = range(1, 9)

# Map raw column names to names that carry their unit
columns_with_units = {
    'Irridance': 'Irradiance (W/m²)',
    'TA': 'TA (°C)',
    'Humidity': 'Humidity (%)',
    'PV_Current': 'PV_Current (A)',
    'AC_Voltage': 'AC Voltage (V)',
    'AC Current': 'AC Current (A)',
    'AC Power': 'AC Power (W)',
}
columns_with_units.update({f'T{n}': f'T{n} (°C)' for n in SENSOR_IDS})
columns_with_units.update({f'V{n}': f'V{n} (V)' for n in SENSOR_IDS})

data = data.rename(columns=columns_with_units)

# Print the first few rows to verify the renaming
print("Data Sample:")
print(data.head())

# Scale factor applied to each (renamed) column
multipliers = {"Irradiance (W/m²)": 1}  # Irradiance is left unchanged
multipliers.update({f"T{n} (°C)": 0.1 for n in SENSOR_IDS})
multipliers["TA (°C)"] = 0.1
multipliers["Humidity (%)"] = 0.1
multipliers.update({f"V{n} (V)": 0.01 for n in SENSOR_IDS})
multipliers.update({
    "PV_Current (A)": 0.01,
    "AC Voltage (V)": 0.01,
    "AC Current (A)": 0.001,
    "AC Power (W)": 0.1,
})

# Scale every listed column that is actually present in the data
for column in multipliers:
    if column not in data.columns:
        continue
    data[column] = data[column] * multipliers[column]
print(data.head())


# Write the scaled data to a new file
adjusted_output_file = "adjusted_combined_data.csv"
data.to_csv(adjusted_output_file, index=False)
print(f"Adjusted data saved to {adjusted_output_file}")


# **Step 4: Load and Explore the Dataset**
## Run this in Python to check data quality:

In [None]:
import pandas as pd

# Load the adjusted dataset
df = pd.read_csv("adjusted_combined_data.csv")

# Display first few rows
print(df.head())

# Count missing values per column
print(df.isnull().sum())

# Basic descriptive statistics for the numeric columns
print(df.describe())

# List the columns of the frame loaded in this cell (df), so the cell does not
# depend on a `data` variable left over from another cell.
print("Columns in DataFrame:", df.columns.tolist())

# **Plotting Environmental Parameters vs AC Power**

In [None]:
# Irradiance vs AC Power: scatter plot on fixed axis ranges
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(data['Irradiance (W/m²)'], data['AC Power (W)'], alpha=0.5)
ax.set(
    xlabel='Irradiance (W/m²)',
    ylabel='AC Power (W)',
    title='Irradiance vs AC Power',
    xlim=(0, 800),    # x-axis limit
    ylim=(0, 2690),   # y-axis limit
)
ax.grid(True)
plt.show()

# Humidity vs AC Power
import matplotlib.pyplot as plt
import numpy as np

# Keep only rows with humidity in [0, 100] and AC power in [0, 2000]
# (both bounds inclusive; rows with missing values in either column drop out).
humidity_ok = df["Humidity (%)"].between(0, 100)
power_ok = df["AC Power (W)"].between(0, 2000)
df_clean = df.loc[humidity_ok & power_ok]

# Columns to plot, taken from the filtered frame
humidity = df_clean["Humidity (%)"]
energy = df_clean["AC Power (W)"]

# Scatter plot with axis labels and title
plt.scatter(humidity, energy, color='red')
plt.xlabel("Relative Humidity (%)")
plt.ylabel("AC Power (W)")
plt.title("Energy vs RH")

plt.show()

# Temperature vs AC Power

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Keep only rows with ambient temperature in [0, 50] and AC power in [0, 2000]
# (comparisons are False for missing values, so those rows drop out as well).
df_clean = df[(df["TA (°C)"] >= 0) & (df["TA (°C)"] <= 50) &
              (df["AC Power (W)"] >= 0) & (df["AC Power (W)"] <= 2000)]

# Extract the clean columns for plotting
temperature = df_clean["TA (°C)"]
energy = df_clean["AC Power (W)"]

# Scatter plot
plt.scatter(temperature, energy, color='green')

# Fit and draw a linear trend line (least-squares line of best fit).
# np.polyfit raises on an empty input, so skip the fit when the filter leaves
# too few points instead of aborting the whole plot.
if len(temperature) >= 2:
    z = np.polyfit(temperature, energy, 1)
    p = np.poly1d(z)

    # A straight line is fully determined by its two end points, so draw it
    # between the smallest and largest temperature rather than through every
    # (unsorted) sample.
    trend_x = np.array([temperature.min(), temperature.max()])
    plt.plot(trend_x, p(trend_x), color='blue')
else:
    print("Not enough data points to fit a trend line.")

# Set labels and title
plt.xlabel("Ambient Temperature (°C)")
plt.ylabel("AC Power (W)")
plt.title("Energy vs TA")

# Show the plot
plt.show()


# **Classification Preprocessing** #

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the labelled dataset
file_path = r"C:\Users\HP\Desktop\Research Internship\labelled_adjusted_combined_data.csv"
df = pd.read_csv(file_path)

# Check the first few rows to confirm successful loading
print(df.head())

# Count occurrences of each class in the 'Status' column
# (0 is reported as the normal state and 1 as the faulty condition below)
class_counts = df['Status'].value_counts()

# Print counts and proportions
print("Class Distribution:")
print(class_counts)
print("\nProportion of Normal State:", class_counts.get(0, 0) / len(df) * 100, "%")
print("Proportion of Faulty Condition:", class_counts.get(1, 0) / len(df) * 100, "%")

# value_counts() orders classes by frequency, not by label, so the first bar is
# not guaranteed to be class 0. Reindex to a fixed [0, 1] order (missing classes
# become 0) so the green/"Normal" and red/"Faulty" styling always lands on the
# right bar.
# NOTE(review): assumes 'Status' only takes the values 0 and 1, as the prints
# above do; any other label would be left out of the chart.
plot_counts = class_counts.reindex([0, 1], fill_value=0)

# Plot a bar chart
plt.figure(figsize=(6, 4))
plot_counts.plot(kind='bar', color=['green', 'red'])
plt.xlabel("Condition")
plt.ylabel("Count")
plt.title("Distribution of Normal vs Faulty Conditions")
plt.xticks(ticks=[0, 1], labels=["Normal", "Faulty"], rotation=0)
plt.show()
