# DATASET PREVIEW

In [1]:
import numpy as np
import  pandas as pd

# Load the three competition files in one pass; the unpacking order matches the
# order of the paths below.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/mdutt/Desktop/TeamElytra_Solar_Eff_Prediction_Model/dataset/train.csv",
        "C:/Users/mdutt/Desktop/TeamElytra_Solar_Eff_Prediction_Model/dataset/test.csv",
        "C:/Users/mdutt/Desktop/TeamElytra_Solar_Eff_Prediction_Model/dataset/sample_submission.csv",
    )
)

# Report (rows, columns) for each frame, in the order: train, test, sample_submission.
for frame in (train, test, sample_submission):
    print(frame.shape)

(20000, 17)
(12000, 16)
(5, 2)


In [None]:
# Inspect column names, non-null counts and dtypes of the training frame.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare -- wrapping it in print() would append a stray "None" line.
train.info()

# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
print(train.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  20000 non-null  int64  
 1   temperature         18999 non-null  float64
 2   irradiance          19013 non-null  float64
 3   humidity            20000 non-null  object 
 4   panel_age           18989 non-null  float64
 5   maintenance_count   18973 non-null  float64
 6   soiling_ratio       18990 non-null  float64
 7   voltage             19007 non-null  float64
 8   current             19023 non-null  float64
 9   module_temperature  19022 non-null  float64
 10  cloud_coverage      18990 non-null  float64
 11  wind_speed          20000 non-null  object 
 12  pressure            20000 non-null  object 
 13  string_id           20000 non-null  object 
 14  error_code          14088 non-null  object 
 15  installation_type   14972 non-null  object 
 16  effi

In [4]:
# Per-column count of missing values, printed for train first and then test.
for frame in (train, test):
    print(frame.isnull().sum())


id                       0
temperature           1001
irradiance             987
humidity                 0
panel_age             1011
maintenance_count     1027
soiling_ratio         1010
voltage                993
current                977
module_temperature     978
cloud_coverage        1010
wind_speed               0
pressure                 0
string_id                0
error_code            5912
installation_type     5028
efficiency               0
dtype: int64
id                       0
temperature            582
irradiance             615
humidity                 0
panel_age              607
maintenance_count      609
soiling_ratio          610
voltage                547
current                587
module_temperature     580
cloud_coverage         582
wind_speed               0
pressure                 0
string_id                0
error_code            3611
installation_type     2979
dtype: int64


# EDA for data processing

## Numerical Data Plots

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# These three columns hold numeric measurements but were loaded with dtype
# `object` (see the train.info() output above), so they are coerced to numeric
# here; any value that cannot be parsed becomes NaN.
# The coercion must happen BEFORE the numeric columns are selected below;
# otherwise these columns are missing from `numerical_cols` and are skipped by
# both the plots in this cell and any later step that iterates `numerical_cols`.
coerced_cols = ['humidity', 'wind_speed', 'pressure']
for frame in (train, test):
    for col in coerced_cols:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

# Every numeric column except the row identifier. The target column
# ('efficiency') is intentionally kept so its distribution is plotted too.
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols.drop('id')

# One figure per numeric column: histogram with KDE on the left, boxplot on the right.
for col in numerical_cols:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(5, 4))

    sns.histplot(train[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f'{col} distribution')

    sns.boxplot(x=train[col], color='salmon', ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    fig.tight_layout()
    plt.show()


## Categorical Data Plots

In [None]:
sns.set_theme(style="whitegrid")

# Categorical fields to visualise. The variable name is referenced by later
# cells, so it is kept exactly as spelled.
categorial_cols = ['string_id', 'error_code', 'installation_type']

# One count plot per categorical field, drawn on an explicitly created axes.
for col in categorial_cols:
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.countplot(data=train, x=col, hue=col, palette='viridis', legend=False, ax=ax)
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.set_title(f'Count Plot of {col}')
    # Tilt the category labels so longer names do not overlap.
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()



## Handle missing values

In [None]:
# Impute missing values using statistics learned from the TRAIN set only and
# apply the same fill values to test, so both frames are transformed identically
# and no test-set statistics enter preprocessing.
#
# Assignment form (`df[cols] = df[cols].fillna(...)`) is used instead of
# `df[col].fillna(..., inplace=True)`: the latter is chained assignment, which
# pandas deprecates and which silently does nothing under Copy-on-Write.

# Numeric feature columns are re-derived from the current dtypes so that columns
# coerced to numeric after loading (humidity, wind_speed, pressure) are covered.
# `id` and the target `efficiency` are not features and are left untouched.
feature_num_cols = [
    c for c in train.select_dtypes(include=['float64', 'int64']).columns
    if c not in ('id', 'efficiency')
]
train_medians = train[feature_num_cols].median()
train[feature_num_cols] = train[feature_num_cols].fillna(train_medians)
test[feature_num_cols] = test[feature_num_cols].fillna(train_medians)

# Categorical columns: fill with the most frequent train value.
for col in categorial_cols:
    train_modes = train[col].mode()
    if train_modes.empty:
        # Column has no observed values at all; nothing sensible to fill with.
        continue
    most_frequent = train_modes[0]
    train[col] = train[col].fillna(most_frequent)
    test[col] = test[col].fillna(most_frequent)

## LightGBM cannot consume raw string columns, so the categorical fields ['string_id', 'error_code', 'installation_type'] are one-hot encoded into numeric columns

In [28]:
# Give each categorical field the SAME ordered category list in train and test
# (taken from train). get_dummies then emits identical dummy columns for both
# frames and `drop_first=True` drops the same reference level in each. Encoding
# the frames independently could drop a different level in test whenever its
# set of observed categories differs from train, silently mis-encoding rows.
# Test values never seen in train become NaN here and encode as all-zero dummies.
for col in categorial_cols:
    levels = sorted(train[col].dropna().unique())
    shared_dtype = pd.CategoricalDtype(categories=levels)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

train = pd.get_dummies(train, columns=categorial_cols, drop_first=True)
test = pd.get_dummies(test, columns=categorial_cols, drop_first=True)

# Align test to train's column set and order, excluding the target. Any column
# absent from test is created and filled with 0; any extra column is dropped.
test = test.reindex(columns=train.columns.drop('efficiency'), fill_value=0)


## Handling Outliers (Robust Scaling)

In [None]:
from sklearn.preprocessing import RobustScaler

# Columns to scale: every float64/int64 column except the identifier and the target.
numeric_frame = train.select_dtypes(include=['float64', 'int64'])
numeric_cols = numeric_frame.columns.drop(['id', 'efficiency'])

# Learn the scaling parameters from train only, then apply that same fitted
# scaler to both frames.
scaler = RobustScaler()
scaler.fit(train[numeric_cols])
train[numeric_cols] = scaler.transform(train[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])

# Persist the cleaned frames (row index is not written).
for frame, out_path in (
    (train, "C:/Users/mdutt/Desktop/TeamElytra_Solar_Eff_Prediction_Model/dataset/Clean_X_Train.csv"),
    (test, "C:/Users/mdutt/Desktop/TeamElytra_Solar_Eff_Prediction_Model/dataset/Clean_Test_Data.csv"),
):
    frame.to_csv(out_path, index=False)

## Final Preprocessed Data

In [34]:
# Keep the row identifiers aside; they are not model features.
train_ids = train['id']
test_ids = test['id']

# Target vector and feature matrices: `id` is removed from both frames, and the
# target is additionally removed from the training features.
y_train = train['efficiency']
X_train = train.drop(columns=['id', 'efficiency'])
X_test = test.drop(columns='id')

# Persist the training target (row index is not written).
y_train.to_csv(
    "C:/Users/mdutt/Desktop/TeamElytra_Solar_Eff_Prediction_Model/dataset/Efficiency_y_train.csv",
    index=False,
)
