# Texas Wind Turbine Dataset - XGBoost r2: 0.9998
This notebook shows XGBoost's performance on the given dataset after the unnecessary features are removed and the remaining features are scaled.

In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import datetime as dt
import time

%matplotlib inline

!pip install windrose
from windrose import WindroseAxes

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

import os
# Collect the full path of every file found under /kaggle/input, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

In [None]:
# Read the turbine CSV into a DataFrame and display it
csv_path = "../input/texas-wind-turbine-dataset-simulated/TexasTurbine.csv"
df = pd.read_csv(csv_path)
df

# Structure of the Dataset

In [None]:
# Descriptive statistics of the dataset's columns
df.describe()

In [None]:
# Column names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

We have no null values to handle. We can directly proceed to analyze the dataset in detail.

In [None]:
# Pairwise scatter plots across all columns of the dataset
attributes = df.columns.tolist()
pairwise_data = df[attributes]

scatter_matrix(pairwise_data, figsize=(12, 8))
plt.show()

In [None]:
# Scatter plot of generated power against wind speed
fig, power_ax = plt.subplots(figsize=(17, 8))
sns.scatterplot(
    data=df,
    x="Wind speed | (m/s)",
    y="System power generated | (kW)",
    ax=power_ax,
)
power_ax.set_title("Wind speed and Power Relation")
plt.show()

In [None]:
# Wind rose: wind speed binned by wind direction
wind_direction = df['Wind direction | (deg)']
wind_speed = df['Wind speed | (m/s)']

ax = WindroseAxes.from_ax()
ax.bar(wind_direction, wind_speed, normed=True, opening=0.8, edgecolor='white')
ax.set_legend()
ax.set_title("Wind direction | (deg) VS Wind speed | (m/s)")
plt.show()

In [None]:
# Move the "Time stamp" column into the index (set_index removes it from the columns)
df = df.set_index("Time stamp")
df

In [None]:
# Correlation of every column with the generated power, strongest positive first
corr_matrix = df.corr()
power_corr = corr_matrix["System power generated | (kW)"]
power_corr.sort_values(ascending=False)

In [None]:
# Remove the two features that are not used for modelling
df = df.drop(columns=["Air temperature | ('C)", "Wind direction | (deg)"])

Air temperature | ('C) and Wind direction | (deg) show very low correlation with System power generated | (kW) in our dataset, so both columns were dropped above.

# Creating a test set and cross validation on training set

In [None]:
# Split a copy of the frame: 80% for training, 20% held out for testing
wind_df = df.copy()

train_set, test_set = train_test_split(
    wind_df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Separate the training target (generated power) from the training features
train_y = train_set["System power generated | (kW)"].copy()
train_x = train_set.drop(columns=["System power generated | (kW)"])

In [None]:
# Fit the scaler on the training features only, then standardise them
scaler = StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)

In [None]:
# 5-fold cross-validation of each candidate model on the scaled training data.
# Both metrics are scored from the same fitted folds via cross_validate, so each
# model is fitted 5 times instead of 10 (one cross_val_score run per metric).

def cross_validate_model(model):
    """Run 5-fold CV for ``model`` on the scaled training data.

    Returns
    -------
    (neg_mse_scores, r2_scores) : tuple of arrays
        Per-fold negated mean squared error and per-fold r2.
    """
    cv_results = cross_validate(
        model,
        train_x_scaled,
        train_y,
        scoring=("neg_mean_squared_error", "r2"),
        cv=5,
    )
    return cv_results["test_neg_mean_squared_error"], cv_results["test_r2"]


#Linear Regression

lin_reg = LinearRegression()
lin_scores, lin_r2_scores = cross_validate_model(lin_reg)
# The scorer reports negated MSE, so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(-lin_scores)

#Decision Tree Regression

tree_reg = DecisionTreeRegressor(random_state=42)
tree_scores, tree_r2_scores = cross_validate_model(tree_reg)
tree_rmse_scores = np.sqrt(-tree_scores)

#XGBoost Regression

xgb_reg = XGBRegressor(random_state=42)
xgb_scores, xgb_r2_scores = cross_validate_model(xgb_reg)
xgb_rmse_scores = np.sqrt(-xgb_scores)

In [None]:
def display_scores(scores, r2_scores, name):
    """Print the model name followed by the mean of ``scores`` (labelled RMSE)
    and the mean of ``r2_scores``, each formatted to four decimal places."""
    mean_rmse = scores.mean()
    mean_r2 = r2_scores.mean()
    report_lines = (
        f"{name}:",
        f"  RMSE: {mean_rmse:.4f}",
        f"  r2: {mean_r2:.4f}",
    )
    print("\n".join(report_lines))

# Report the mean cross-validation RMSE and r2 of each model
cv_summaries = (
    ("Linear Regression", lin_rmse_scores, lin_r2_scores),
    ("Decision Tree Regressor", tree_rmse_scores, tree_r2_scores),
    ("XGBRegressor", xgb_rmse_scores, xgb_r2_scores),
)
for model_name, rmse_folds, r2_folds in cv_summaries:
    display_scores(rmse_folds, r2_folds, model_name)

XGBoost seems promising after cross-validation. At this point, we could continue with XGBoost alone. However, I will evaluate all three models on the test set as well.

# Evaluate on Test Set

In [None]:
# Split the held-out set into target and features, scaling the features with
# the scaler that was already fitted on the training data
test_y = test_set["System power generated | (kW)"]
test_features = test_set.drop(columns=["System power generated | (kW)"])
test_x_scaled = scaler.transform(test_features)

In [None]:
def fit_and_evaluate(model):
    """Fit ``model`` on the scaled training data and score it on the test set.

    Returns a ``(predictions, rmse, r2)`` tuple for the test set.
    """
    model.fit(train_x_scaled, train_y)
    predictions = model.predict(test_x_scaled)
    rmse = np.sqrt(mean_squared_error(test_y, predictions))
    r2 = r2_score(test_y, predictions)
    return predictions, rmse, r2


#Linear Regression
lin_reg = LinearRegression()
lin_pred, lin_rmse, lin_r2 = fit_and_evaluate(lin_reg)

#Decision Tree Regression
tree_reg = DecisionTreeRegressor(random_state=42)
tree_pred, tree_rmse, tree_r2 = fit_and_evaluate(tree_reg)

#XGBoost Regression
xgb_reg = XGBRegressor(random_state=42)
xgb_pred, xgb_rmse, xgb_r2 = fit_and_evaluate(xgb_reg)

In [None]:
#Calculate a confidence interval (95% by default) for the mean of a set of scores
def calculate_confidence_interval(scores, confidence=0.95):
    """Return a two-sided confidence interval for the mean of ``scores``.

    The interval is ``mean +/- t * s / sqrt(n)``, where ``s`` is the sample
    standard deviation (``ddof=1``) and ``t`` is the Student's t critical value
    with ``n - 1`` degrees of freedom. For a handful of scores (such as 5
    cross-validation folds) the normal constant 1.96 combined with the
    population standard deviation yields an interval that is too narrow.

    Parameters
    ----------
    scores : array-like of float
        The observed scores (numpy array, pandas Series or plain sequence).
    confidence : float, default 0.95
        Confidence level, strictly between 0 and 1.

    Returns
    -------
    (lower_bound, upper_bound) : tuple of float
        ``(nan, nan)`` when fewer than two scores are given, because the
        sample standard deviation is undefined in that case.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
    """
    # Imported here so the function works without touching the notebook's import cell.
    from scipy import stats

    if not 0.0 < confidence < 1.0:
        raise ValueError("confidence must be strictly between 0 and 1")

    values = np.asarray(scores, dtype=float)
    n = values.size
    if n < 2:
        return float("nan"), float("nan")

    mean = values.mean()
    # ddof=1: unbiased sample estimate, regardless of the input container type.
    std_error = values.std(ddof=1) / np.sqrt(n)
    t_critical = stats.t.ppf((1.0 + confidence) / 2.0, df=n - 1)
    margin_error = t_critical * std_error
    lower_bound = mean - margin_error
    upper_bound = mean + margin_error
    return lower_bound, upper_bound

# Confidence bounds of the cross-validation RMSE for each model
(
    (lin_lower_rmse, lin_upper_rmse),
    (tree_lower_rmse, tree_upper_rmse),
    (xgb_lower_rmse, xgb_upper_rmse),
) = [
    calculate_confidence_interval(fold_rmse)
    for fold_rmse in (lin_rmse_scores, tree_rmse_scores, xgb_rmse_scores)
]

In [None]:
#Display results
# Each RMSE below is the single hold-out (test set) value. The interval printed
# next to it is computed from the cross-validation fold RMSEs, so it is
# labelled as a cross-validation interval rather than as a CI of the test RMSE.
print("Test Set Evaluation:")

rmse_rows = (
    ("Linear Regression", lin_rmse, lin_lower_rmse, lin_upper_rmse),
    ("Decision Tree Regressor", tree_rmse, tree_lower_rmse, tree_upper_rmse),
    ("XGBRegressor", xgb_rmse, xgb_lower_rmse, xgb_upper_rmse),
)
for model_name, test_rmse, cv_lower, cv_upper in rmse_rows:
    print(
        f"\n{model_name} test RMSE: {test_rmse:.4f}, "
        f"cross-validation RMSE 95% CI: ({cv_lower:.4f}, {cv_upper:.4f})"
    )

# Blank line separating the RMSE section from the r2 section.
print()
r2_rows = (
    ("Linear Regression", lin_r2),
    ("Decision Tree Regressor", tree_r2),
    ("XGBRegressor", xgb_r2),
)
for model_name, test_r2 in r2_rows:
    print(f"\n{model_name} r2: {test_r2:.4f}")