In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Data Preprocessing**

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_excel('/content/drive/MyDrive/termfrequencyidfoutputmain.xlsx')

# Now you can work with the DataFrame as needed
print(df.head())  # Print the first few rows of the DataFrame

    fitting      time     floor   setting    result   support     water  \
0  0.003381  0.005527  0.001845  0.003685  0.002859  0.001842  0.003116   
1  0.000000  0.006298  0.004203  0.004198  0.003258  0.002099  0.003551   
2  0.003806  0.006222  0.002077  0.004148  0.003219  0.002074  0.003508   
3  0.003642  0.005955  0.001987  0.003970  0.003080  0.001985  0.000000   
4  0.003426  0.005602  0.001869  0.003734  0.002898  0.001867  0.000000   

     number      test    listed  ...   coating   shading  reflective  \
0  0.003689  0.002848  0.001845  ...  0.000000  0.000000    0.000000   
1  0.002102  0.003245  0.002102  ...  0.003673  0.003673    0.003673   
2  0.004153  0.003206  0.002077  ...  0.000000  0.000000    0.000000   
3  0.003975  0.003068  0.001987  ...  0.000000  0.000000    0.000000   
4  0.003739  0.002886  0.001869  ...  0.003267  0.003267    0.003267   

    measure     local      loft    source   turbine  emission rate  \
0  0.000000  0.000000  0.000000  0.000000  0.0

In [None]:

df_imputed = df.fillna(df.mean())

# Check if missing values have been imputed
print("Number of missing values in each column after imputation:")
print(df_imputed.isnull().sum())

Number of missing values in each column after imputation:
fitting             0
time                0
floor               0
setting             0
result              0
                   ..
loft                0
source              0
turbine             0
emission rate       0
consumption rate    0
Length: 143, dtype: int64


In [None]:
df_imputed.head

<bound method NDFrame.head of        fitting      time     floor   setting    result   support     water  \
0     0.003381  0.005527  0.001845  0.003685  0.002859  0.001842  0.003116   
1     0.000000  0.006298  0.004203  0.004198  0.003258  0.002099  0.003551   
2     0.003806  0.006222  0.002077  0.004148  0.003219  0.002074  0.003508   
3     0.003642  0.005955  0.001987  0.003970  0.003080  0.001985  0.000000   
4     0.003426  0.005602  0.001869  0.003734  0.002898  0.001867  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
1180  0.000000  0.008280  0.002763  0.005520  0.000000  0.002760  0.000000   
1181  0.000000  0.008280  0.002763  0.005520  0.000000  0.002760  0.000000   
1182  0.000000  0.008280  0.002763  0.005520  0.000000  0.002760  0.000000   
1183  0.000000  0.008280  0.002763  0.005520  0.000000  0.002760  0.000000   
1184  0.000000  0.008280  0.002763  0.005520  0.000000  0.002760  0.000000   

        number      test    liste

# **Random Forest**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


# Load your dataset into a Pandas DataFrame
# Replace 'data.csv' with your actual data file
df_imputed = pd.read_excel('/content/drive/MyDrive/termfrequencyidfoutputmain.xlsx')

# Assuming 'target' is the column you want to predict
X = df_imputed.drop('consumption rate', axis=1)  # Features
y = df_imputed['consumption rate']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
# You can customize the parameters based on your data and requirements
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Root Mean Squared Error:", rmse)

Mean Squared Error: 37194.652287086494
R-squared: 0.5294557683739962
Root Mean Squared Error: 192.85915142166962


# **Random Forest with Applied HyperParameter Optimisation**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load your dataset
df_imputed = pd.read_excel('/content/drive/MyDrive/termfrequencyidfoutputmain.xlsx')

# Assuming 'consumption rate' is the column you want to predict
target_column = 'consumption rate'

# Prepare the data
X = df_imputed.drop(target_column, axis=1)
y = df_imputed[target_column]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize Random Forest Regressor
rf_regressor = RandomForestRegressor()

# Initialize Grid Search with Cross-Validation
grid_search = GridSearchCV(rf_regressor, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Perform Grid Search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)


print("Best Hyperparameters:", best_params)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)


  warn(


Best Hyperparameters: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Mean Squared Error: 40367.93181542719
Root Mean Squared Error: 200.91772399523938


# **Linear Regression**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Assuming you have your dataset in a CSV file named 'your_dataset.csv'
# Replace 'your_dataset.csv' with the actual file path

# Load the dataset from CSV file
df_imputed = pd.read_excel('/content/drive/MyDrive/termfrequencyidfoutputmain.xlsx')

# Define the features (input variables) and target variable
features = ['time', 'floor', 'analytics', 'high', 'change', 'consider', 'recommendation', 'low', 'medium', 'performance', 'year', 'heating', 'efficiency', 'emission rate', 'impact']
target = 'consumption rate'

# Split the data into features (X) and target (y)
X = df_imputed[features]
y = df_imputed[target]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
linear_reg = LinearRegression()

# Train the model
linear_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = linear_reg.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Root Mean Squared Error:", rmse)






Mean Squared Error: 37321.90500762667
R-squared: 0.5278459123886055
Root Mean Squared Error: 193.1887807498838


# **Multilayer Processing (MLP)**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the dataset from Excel file
df_imputed = pd.read_excel('/content/drive/MyDrive/termfrequencyidfoutputmain.xlsx')

# Define the features (input variables) and target variable
features = ['time', 'floor', 'analytics', 'high', 'change', 'consider', 'recommendation', 'low', 'medium', 'performance', 'year', 'heating', 'efficiency', 'emission rate', 'impact']
target = 'consumption rate'

# Split the data into features (X) and target (y)
X = df_imputed[features]
y = df_imputed[target]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Multi-Layer Perceptron (MLP) Regressor
mlp_reg = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=42)

# Train the model
mlp_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = mlp_reg.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Root Mean Squared Error:", rmse)


Mean Squared Error: 39509.37581531185
R-squared: 0.5001725317514676
Root Mean Squared Error: 198.769655167261


