In [None]:
# Solar Power Generation
import pandas as pd 
import numpy as np 

# Location of the raw dataset, named once so it is easy to retarget.
DATA_PATH = "solarpower.csv"

df = pd.read_csv(DATA_PATH)
# Preview only the first rows instead of echoing the whole frame.
df.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, non-null counts, dtypes and memory usage of the frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame.
df.describe()

In [None]:
# Checking Missing Values
# Number of null entries in each column.
df.isna().sum()

In [None]:
# Per-column missing-value summary: absolute count and share of rows (in %).
null_mask = df.isnull()
missing = null_mask.sum()
missing_percentage = (null_mask.mean() * 100).round(2)

# Put both Series side by side; the keys become the column headers.
missing_df = pd.concat(
    [missing, missing_percentage],
    axis=1,
    keys=["Missing Values", "Missing %"],
)
missing_df

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Data Visualization

# Univariate Analysis
# Histograms for the DataFrame's columns, 30 bins each, on one 14x10 figure.
import matplotlib.pyplot as plt

hist_options = {"bins": 30, "figsize": (14, 10)}
df.hist(**hist_options)

# Tighten the spacing of the figure that df.hist just drew, then render it.
plt.gcf().tight_layout()
plt.show()

In [None]:
# Boxplot
# One horizontal boxplot per numeric column, laid out on a 3-column grid.
import seaborn as sns

numeric_cols = df.select_dtypes(include=np.number).columns

# Size the grid from the number of numeric columns: a fixed 4x3 grid raises
# as soon as there are more than 12 of them. At least 4 rows are kept so the
# layout is unchanged for 12 columns or fewer.
n_grid_cols = 3
n_grid_rows = max(4, int(np.ceil(len(numeric_cols) / n_grid_cols)))

# 2.5 inches per row reproduces the original 14x10 figure at 4 rows.
plt.figure(figsize=(14, 2.5 * n_grid_rows))
for i, col in enumerate(numeric_cols):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(x=df[col])
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
# OUTLIER DETECTION
# For every numeric column, count the values lying outside the fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
def _count_iqr_outliers(values):
    """Return how many entries of ``values`` fall outside the 1.5*IQR fences."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    too_low = values < q1 - 1.5 * spread
    too_high = values > q3 + 1.5 * spread
    return (too_low | too_high).sum()


outlier_summary = {
    name: _count_iqr_outliers(df[name])
    for name in df.select_dtypes(include=np.number).columns
}

pd.DataFrame.from_dict(outlier_summary, orient='index', columns=["Outlier Count"])

In [None]:
# Skewness Analysis

# numeric_only=True restricts the computation to numeric columns; without it
# recent pandas versions raise a TypeError if the frame holds any non-numeric
# column.
df.skew(numeric_only=True).sort_values(ascending=False)

In [None]:
# Add a log-transformed companion column ("<name>_log") for every column whose
# absolute skewness exceeds 0.75.
# numeric_only=True keeps skew() from raising on non-numeric columns.
skewed_cols = df.skew(numeric_only=True).loc[lambda x: abs(x) > 0.75].index

# Skip columns that are themselves a product of this cell, so that re-running
# it recomputes the same "*_log" columns instead of stacking "*_log_log" ones.
skewed_cols = skewed_cols[~skewed_cols.astype(str).str.endswith("_log")]

for col in skewed_cols:
    # Shift so the minimum maps to 1 before taking the log.
    # NOTE(review): log1p already adds 1, so this evaluates log(x - min + 2);
    # the extra "+ 1" looks redundant but is kept so downstream values do not change.
    df[col+"_log"] = np.log1p(df[col] - df[col].min() + 1)

df.head()

In [None]:
# Bivariate Analysis

# Scatterplot: the target on the y-axis against every other numeric column,
# one small figure per column.
target = "power-generated"

feature_names = [
    name for name in df.select_dtypes(include=np.number).columns
    if name != target
]

for feature in feature_names:
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.scatterplot(x=df[feature], y=df[target], ax=ax)
    ax.set_title(f"{target} vs {feature}")
    plt.show()

In [None]:
# Multivariate Analysis

# Pairplot
numeric_df = df.select_dtypes(include='number')

# sns.pairplot builds its own figure, so no plt.figure() call is made before
# it: a preceding one would only leave an extra, empty figure in the output.
sns.pairplot(numeric_df)
# The pairplot figure is the current one, so the title lands on it;
# y=1.02 lifts it slightly above the top row of panels.
plt.suptitle("Pairplot of Numeric Variables", y=1.02)
plt.show()

In [None]:
# List the current column labels of the frame.
print(df.columns)

In [None]:
# Correlation & HeatMap Analysis

plt.figure(figsize=(12, 8))
# numeric_only=True limits the correlation matrix to numeric columns; without
# it recent pandas versions raise if the frame holds any non-numeric column.
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# numeric_only=True keeps corr() from raising on non-numeric columns.
df.corr(numeric_only=True)["power-generated"].sort_values(ascending=False)

In [None]:
# Data Scaling

from sklearn.preprocessing import StandardScaler

# Select numeric columns the same way the other cells do (np.number), so
# numeric dtypes other than int64/float64 are not silently left out.
num_cols = df.select_dtypes(include=np.number).columns
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[num_cols])
# Reuse df's index so the scaled rows stay aligned with the original frame.
scaled_df = pd.DataFrame(scaled_data, columns=num_cols, index=df.index)
print("\nScaled Data (first 5 rows):")
scaled_df.head()

In [None]:
# VIF Multicollinearity

import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Candidate predictors: non-object columns, excluding the derived "*_log"
# columns and the target itself.
vif_cols = [
    col for col in df.columns
    if df[col].dtype != "O" and "_log" not in col and col != "power-generated"
]

# Clean the predictors: infinities become NaN, then NaNs are filled with the
# column median. NOTE: this writes back into df, so later cells that read df
# see the imputed values.
df[vif_cols] = df[vif_cols].replace([np.inf, -np.inf], np.nan)
df[vif_cols] = df[vif_cols].fillna(df[vif_cols].median())

# Constant columns carry no information and are dropped before computing VIF.
vif_cols = [col for col in vif_cols if df[col].nunique() > 1]

X = df[vif_cols]

# variance_inflation_factor regresses each column on the others exactly as
# given, so the design matrix needs an explicit intercept column; without it
# the reported VIFs are uncentered and inflated. add_constant prepends "const".
design = add_constant(X.astype(float)).values

vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
# Offset by one to skip the intercept column at position 0.
vif_data["VIF"] = [variance_inflation_factor(design, i + 1)
                   for i in range(X.shape[1])]

print(vif_data)

In [None]:
# ---------------------
# Model Development 
# ---------------------
# Task 1: Define Features (X) and Target (y)
# Replace with your exact target column name
target = "power-generated_log"

# The "*_log" target is derived from the raw column of the same name, so the
# raw column must not remain among the features: keeping it leaks the target
# into X and makes every model score look near-perfect.
raw_target = target[:-len("_log")] if target.endswith("_log") else target
leakage_cols = [target]
if raw_target != target and raw_target in df.columns:
    leakage_cols.append(raw_target)

# Keep numeric predictors only; StandardScaler and the regressors used below
# cannot consume non-numeric columns.
X = df.drop(columns=leakage_cols).select_dtypes(include="number")
y = df[target]

In [None]:
# Task 2: Train–Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Task 3: Feature Scaling (Very Important for Regression Models)

from sklearn.preprocessing import StandardScaler

# Learn mean/std from the training rows only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# MODEL 1: Linear Regression
# --------------------------
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Fit on the standardized training features, then predict the held-out rows.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Evaluation on the test split: R2, MSE and its square root (RMSE).
lr_r2 = r2_score(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)

for label, value in (
    ("Linear Regression R2 Score      :", lr_r2),
    ("Linear Regression MSE           :", lr_mse),
    ("Linear Regression RMSE          :", lr_rmse),
):
    print(label, value)


In [None]:
# Linear Regression – Scatter Plot
# Predicted vs actual test values; the dashed red diagonal marks perfect
# agreement (prediction == actual).
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred_lr, color="blue", s=15)

diag_min, diag_max = y_test.min(), y_test.max()
ax.plot([diag_min, diag_max], [diag_min, diag_max], 'r--', linewidth=2)

ax.set_title("Linear Regression: Actual vs Predicted")
ax.set_xlabel("Actual Power")
ax.set_ylabel("Predicted Power")
fig.tight_layout()
plt.show()

In [None]:
# MODEL 2: Decision Tree Regressor
# --------------------------------
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Trained on the unscaled features; the seed fixes the tree's random choices.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Evaluation on the test split: R2, MSE and its square root (RMSE).
dt_r2 = r2_score(y_test, y_pred_dt)
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_rmse = np.sqrt(dt_mse)

for label, value in (
    ("Decision Tree R2 Score      :", dt_r2),
    ("Decision Tree MSE           :", dt_mse),
    ("Decision Tree RMSE          :", dt_rmse),
):
    print(label, value)


In [None]:
# Decision Tree - Scatter plot
# Predicted vs actual test values; the dashed red diagonal marks perfect
# agreement (prediction == actual).
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred_dt, color="purple", s=15)

diag_min, diag_max = y_test.min(), y_test.max()
ax.plot([diag_min, diag_max], [diag_min, diag_max], 'r--', linewidth=2)

ax.set_title("Decision Tree: Actual vs Predicted")
ax.set_xlabel("Actual Power")
ax.set_ylabel("Predicted Power")
fig.tight_layout()
plt.show()

In [None]:
# MODEL 3: Random Forest Regressor
# --------------------------------
# Train & evaluate (simple)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# 100 unbounded-depth trees, seeded, built using all available cores.
rf_params = {
    "n_estimators": 100,
    "max_depth": None,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation on the test split: R2, MSE and its square root (RMSE).
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)

for label, value in (
    ("Random Forest R2 Score      :", rf_r2),
    ("Random Forest MSE           :", rf_mse),
    ("Random Forest RMSE          :", rf_rmse),
):
    print(label, value)


In [None]:
# Random Forest Regressor – Scatter Plot
# Predicted vs actual test values; the dashed red diagonal marks perfect
# agreement (prediction == actual).
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred_rf, color="green", s=15)

diag_min, diag_max = y_test.min(), y_test.max()
ax.plot([diag_min, diag_max], [diag_min, diag_max], 'r--', linewidth=2)

ax.set_title("Random Forest: Actual vs Predicted")
ax.set_xlabel("Actual Power")
ax.set_ylabel("Predicted Power")
fig.tight_layout()
plt.show()

In [None]:
# MODEL 4: Support Vector Regressor (SVM/SVR)

from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# RBF-kernel SVR trained on the standardized features.
svr = SVR(kernel="rbf", C=1.0, epsilon=0.1).fit(X_train_scaled, y_train)
y_pred_svr = svr.predict(X_test_scaled)

# Evaluation on the test split: R2, MSE and its square root (RMSE).
svr_r2 = r2_score(y_test, y_pred_svr)
svr_mse = mean_squared_error(y_test, y_pred_svr)
svr_rmse = np.sqrt(svr_mse)

for label, value in (
    ("SVR R2 Score      :", svr_r2),
    ("SVR MSE           :", svr_mse),
    ("SVR RMSE          :", svr_rmse),
):
    print(label, value)


In [None]:
# Support Vector Regressor (SVR) — Scatter Plot
# Predicted vs actual test values; the dashed red diagonal marks perfect
# agreement (prediction == actual).
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred_svr, color="orange", s=18)

diag_min, diag_max = y_test.min(), y_test.max()
ax.plot([diag_min, diag_max], [diag_min, diag_max], 'r--', linewidth=2)

ax.set_title("SVR: Actual vs Predicted")
ax.set_xlabel("Actual Power")
ax.set_ylabel("Predicted Power")
fig.tight_layout()
plt.show()