Random Forest Regression

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

# Load the datasets
book_i4_data = pd.read_csv('Booki4.csv')
book_i48_data = pd.read_csv('Booki48.csv')
book_1_data = pd.read_csv('Book1.csv')
book_3_data = pd.read_csv('Book3.csv')
book_5_data = pd.read_csv('Book5.csv')
testing_data = pd.read_csv('testing_set.csv')
# Replace blank spaces with NaN values
book_i4_data.replace(' ', float('nan'), inplace=True)
book_i48_data.replace(' ', float('nan'), inplace=True)
book_1_data.replace(' ', float('nan'), inplace=True)
book_3_data.replace(' ', float('nan'), inplace=True)
book_5_data.replace(' ', float('nan'), inplace=True)
testing_data.replace(' ', float('nan'), inplace=True)

# Merge the datasets
merged_data = pd.concat([book_1_data[['R5 (MOhm)', 'R6 (MOhm)', 'R7 (MOhm)', 'R8 (MOhm)', 'R13 (MOhm)', 'R14 (MOhm)']],
                        book_3_data[['R9 (MOhm)', 'R10 (MOhm)', 'R11 (MOhm)', 'R12 (MOhm)']],
                        book_5_data[['R13 (MOhm)', 'R14 (MOhm)', 'Flow rate (mL/min)', 'Heater voltage (V)']],
                        book_i4_data[['Time (s)', 'CO (ppm)', 'Humidity (%r.h.)', 'Temperature (C)']],
                        book_i48_data[['Flow rate (mL/min)', 'Heater voltage (V)', 'R1 (MOhm)', 'R2 (MOhm)', 'R3 (MOhm)', 'R4 (MOhm)']]], axis=1)

# Prepare the training and testing data
X_train = merged_data.drop("CO (ppm)", axis=1)
y_train = merged_data["CO (ppm)"]
X_test = testing_data[X_train.columns]

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Remove rows with NaN values in y_train
valid_indices = ~pd.isnull(y_train)
X_train_imputed = X_train_imputed[valid_indices]
y_train = y_train[valid_indices]

# Train the model
model = RandomForestRegressor()
model.fit(X_train_imputed, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test_imputed)

# Create a DataFrame with the predictions
prediction_data = pd.DataFrame({'CO(ppm)': predictions})

# Export the predicted data to a CSV file
prediction_data.to_csv('predicted_data.csv', index=False)


In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load the datasets
book_i4_data = pd.read_csv('Booki4.csv')
book_i48_data = pd.read_csv('Booki48.csv')
book_1_data = pd.read_csv('Book1.csv')
book_3_data = pd.read_csv('Book3.csv')
book_5_data = pd.read_csv('Book5.csv')
testing_data = pd.read_csv('testing_set.csv')

# Replace blank spaces with NaN values
book_i4_data.replace(' ', float('nan'), inplace=True)
book_i48_data.replace(' ', float('nan'), inplace=True)
book_1_data.replace(' ', float('nan'), inplace=True)
book_3_data.replace(' ', float('nan'), inplace=True)
book_5_data.replace(' ', float('nan'), inplace=True)
testing_data.replace(' ', float('nan'), inplace=True)

# Merge the datasets
merged_data = pd.concat([book_1_data[['R5 (MOhm)', 'R6 (MOhm)', 'R7 (MOhm)', 'R8 (MOhm)', 'R13 (MOhm)', 'R14 (MOhm)']],
                        book_3_data[['R9 (MOhm)', 'R10 (MOhm)', 'R11 (MOhm)', 'R12 (MOhm)']],
                        book_5_data[['R13 (MOhm)', 'R14 (MOhm)', 'Flow rate (mL/min)', 'Heater voltage (V)']],
                        book_i4_data[['Time (s)', 'CO (ppm)', 'Humidity (%r.h.)', 'Temperature (C)']],
                        book_i48_data[['Flow rate (mL/min)', 'Heater voltage (V)', 'R1 (MOhm)', 'R2 (MOhm)', 'R3 (MOhm)', 'R4 (MOhm)']]], axis=1)

# Prepare the training and testing data
X = merged_data.drop("CO (ppm)", axis=1)
y = merged_data["CO (ppm)"]
X_test = testing_data[X.columns]

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

# Split the data into training and testing sets
X_train, X_test_split, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Identify indices with missing values in y_train
missing_indices = y_train.isnull()

# Drop corresponding rows from X_train and y_train
X_train = X_train[~missing_indices]
y_train = y_train[~missing_indices]

# Train the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate model accuracy on training set
y_train_pred = model.predict(X_train)
train_r2_score = r2_score(y_train, y_train_pred)
print("Training r2 score:", train_r2_score)


Training r2 score: 0.9915313150554025
