In [9]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.sql import SparkSession

# Initialize a Spark session

spark = SparkSession.builder.appName("SparkStandaloneTest").getOrCreate()

spark = SparkSession.builder \
    .appName("GCSFilesRead") \
    .config("spark.executor.cores", "120") \
    .config("spark.num.executors", "6") \
    .getOrCreate() 

# file path 
file_location = "/home/bx2051/dataset1.csv"

def read_csv_with_inferred_schema(file_path):
    """
    Reads a CSV file with inferred schema.
    transfer string data type to factor

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pyspark.sql.DataFrame: The DataFrame with inferred schema.
    """
    # Read the CSV file with inferred schema
    data = spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv(file_path)
    
    
    return data


df = read_csv_with_inferred_schema(file_location)

In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("Feature Selection with Random Forest").getOrCreate()

# Function to train Random Forest model
def train_random_forest_model(data, feature_cols, label_col='glucose'):
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    data_transformed = assembler.transform(data)
    
    # Split data into training and test sets
    train_data, test_data = data_transformed.randomSplit([0.66, 0.34], seed=1)
    
    rf = RandomForestRegressor(featuresCol="features", labelCol=label_col)
    model = rf.fit(train_data)
    return model, train_data, test_data  

# Function to get sorted feature importance
def get_feature_importance(rf_model, feature_cols):
    importances = rf_model.featureImportances
    feature_importance_list = [(feature, importances[idx]) for idx, feature in enumerate(feature_cols)]
    feature_importance_list.sort(key=lambda x: x[1], reverse=True)
    return feature_importance_list 

# Function to find the best feature set by RMSE
def find_best_feature_set(data, label_col='glucose'):
    feature_cols = data.columns[:-1]
    initial_features = feature_cols.copy()
    best_rmse = float('inf')
    best_data = None

    rmse_list = []

    # Initial model to get feature importances
    initial_model, _, _ = train_random_forest_model(data, initial_features, label_col)
    feature_importances = get_feature_importance(initial_model, initial_features)

    for iteration in range(len(feature_cols) - 2):
        model, train_data, test_data = train_random_forest_model(data, initial_features, label_col)
        predictions = model.transform(test_data)
        
        evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
        current_rmse = evaluator.evaluate(predictions)
        rmse_list.append((len(initial_features), current_rmse))

        if current_rmse < best_rmse:
            best_rmse = current_rmse
            best_features = initial_features.copy()
            best_data = train_data.select(initial_features)

        # Remove the least important feature based on initial importances
        initial_features.remove(feature_importances[-1][0])
        feature_importances.pop(-1)  # Update the importance list
        data = data.select(*initial_features, label_col)  # Update the data to exclude the least important feature
    
    return best_data, feature_importances, rmse_list



In [11]:

# # Example usage:
# myData = df 
# best_data, feature_importances, rmses = find_best_feature_set(myData)

# # Print results
# print("Feature Importances:")
# for importance in feature_importances:
#     print(importance)

# print("\nRMSE by Iteration:")
# for rmse in rmses:
#     print(rmse)

# best_data.show()

In [12]:
# initial_model, _, _ = train_random_forest_model(myData, myData.columns[:-1])
# feature_importances = get_feature_importance(initial_model, myData.columns[:-1])


In [13]:
# print("Feature Importances:")
# for importance in feature_importances:
#     print(importance)

In [14]:
# import pandas as pd

# # Convert feature importances and RMSE list to DataFrame
# feature_importance_df = pd.DataFrame(feature_importances, columns=['Feature', 'Importance'])
# rmse_df = pd.DataFrame(rmses, columns=['Number of Features', 'RMSE'])

# # Save DataFrames to CSV for use in papers
# feature_importance_df.to_csv("feature_importances.csv", index=False)
# rmse_df.to_csv("rmse_by_iteration.csv", index=False)

# # Optional: Save to LaTeX (if you're using LaTeX for your paper)
# feature_importance_df.to_latex("feature_importances.tex", index=False)
# rmse_df.to_latex("rmse_by_iteration.tex", index=False)

# # Print DataFrames to screen (optional, for quick verification)
# print("Feature Importances:\n", feature_importance_df)
# print("\nRMSE by Iteration:\n", rmse_df)


In [15]:
# best_feature_columns = best_data.columns

# # Convert column names to a DataFrame
# best_features_df = pd.DataFrame(best_feature_columns, columns=['Best Features'])

# # Save the DataFrame to CSV
# best_features_df.to_csv("best_features.csv", index=False)

In [16]:
def separate_datasets_by_glucose(dataset, glucose_threshold=5.5):
    """
    Separates the dataset into two datasets based on the second year glucose number using PySpark.

    Args:
        dataset (pyspark.sql.DataFrame): The original dataset.
        glucose_threshold (float): The threshold value for separating the datasets
            based on the second year glucose number. Default is 6.1.

    Returns:
        tuple: A tuple containing two datasets:
            - normal_dataset (pyspark.sql.DataFrame): Dataset with glucose_year2 < glucose_threshold.
            - abnormal_dataset (pyspark.sql.DataFrame): Dataset with glucose_year2 >= glucose_threshold.
    """
    normal_dataset = dataset.filter(dataset['sy_glucose'] < glucose_threshold)
    abnormal_dataset = dataset.filter(dataset['sy_glucose'] >= glucose_threshold)
    return normal_dataset, abnormal_dataset

In [17]:
normal_people, abnormal_people = separate_datasets_by_glucose(df)


## Normal People Feature Selection
best_data, feature_importances, rmses = find_best_feature_set(normal_people)


print("\nRMSE by Iteration:")
for rmse in rmses:
    print(rmse)

best_data.show()
initial_model, _, _ = train_random_forest_model(normal_people, normal_people.columns[:-1])
feature_importances = get_feature_importance(initial_model, normal_people.columns[:-1])

print("Feature Importances:")
for importance in feature_importances:
    print(importance)


RMSE by Iteration:
(159, 0.32452126299610096)
(158, 0.3246438279471127)
(157, 0.3219097692089593)
(156, 0.3181887567882844)
(155, 0.3199457893314053)
(154, 0.327116450451998)
(153, 0.32289424413421863)
(152, 0.3229180206843173)
(151, 0.324522980062626)
(150, 0.3217675501701615)
(149, 0.32276705821512114)
(148, 0.3213822182768603)
(147, 0.3196356072171749)
(146, 0.32505463095733744)
(145, 0.32109491548246016)
(144, 0.319315482713642)
(143, 0.31864555112166)
(142, 0.3202630283668023)
(141, 0.32602819090989893)
(140, 0.326439907607703)
(139, 0.32745715903377176)
(138, 0.3244240909078629)
(137, 0.32301491785759795)
(136, 0.3251329591174945)
(135, 0.3225575224860505)
(134, 0.3184169646121533)
(133, 0.31664853061536835)
(132, 0.32063448262302463)
(131, 0.3144129773743319)
(130, 0.31531202276652176)
(129, 0.3147808813403811)
(128, 0.31773000176758254)
(127, 0.31960510186046553)
(126, 0.321097041795312)
(125, 0.3174686104342564)
(124, 0.32149315610349366)
(123, 0.31695785311398994)
(122, 0.31

24/05/15 10:25:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---+----------+-------------+---------------+---------------------------------------+------------+------------+------------------------------+----------+---------------+---------------+------------+-------------+
|age|fy_glucose|fy_creatinine|fy_trioxypurine|fy_high_density_lipoprotein_cholesterol|fy_3000_left|fy_4000_left|sy_absolute_value_of_basophils|sy_glucose|sy_trioxypurine|sy_triglyceride|sy_4000_left|sy_2000_right|
+---+----------+-------------+---------------+---------------------------------------+------------+------------+------------------------------+----------+---------------+---------------+------------+-------------+
| 24|      0.15|  0.429357798|    0.410060976|                            0.384297521| 0.117647059| 0.052631579|                   0.176470588|       4.9|    0.317230274|    0.029863481| 0.111111111|        0.125|
| 24|      0.16|  0.422018349|    0.410060976|                            0.483471074| 0.058823529| 0.052631579|                   0.176470588| 

In [18]:
import pandas as pd

# Convert feature importances and RMSE list to DataFrame
feature_importance_df = pd.DataFrame(feature_importances, columns=['Feature', 'Importance'])
rmse_df = pd.DataFrame(rmses, columns=['Number of Features', 'RMSE'])

# Save DataFrames to CSV for use in papers
feature_importance_df.to_csv("feature_importances.csv", index=False)
rmse_df.to_csv("rmse_by_iteration.csv", index=False)

# Optional: Save to LaTeX (if you're using LaTeX for your paper)
feature_importance_df.to_latex("feature_importances.tex", index=False)
rmse_df.to_latex("rmse_by_iteration.tex", index=False)

# Print DataFrames to screen (optional, for quick verification)
print("Feature Importances:\n", feature_importance_df)
print("\nRMSE by Iteration:\n", rmse_df)
best_feature_columns = best_data.columns

# Convert column names to a DataFrame
best_features_df = pd.DataFrame(best_feature_columns, columns=['Best Features'])

# Save the DataFrame to CSV
best_features_df.to_csv("best_features.csv", index=False)

Feature Importances:
                                  Feature  Importance
0                             fy_glucose    0.287576
1                             sy_glucose    0.185353
2                                    age    0.027424
3                           fy_3000_left    0.014273
4                           sy_4000_left    0.014254
..                                   ...         ...
154                        fy_2000_right    0.000000
155              sy_large_platelet_ratio    0.000000
156  sy_glutamic_oxalacetic_transaminase    0.000000
157                         sy_3000_left    0.000000
158                        sy_6000_right    0.000000

[159 rows x 2 columns]

RMSE by Iteration:
      Number of Features      RMSE
0                   159  0.324521
1                   158  0.324644
2                   157  0.321910
3                   156  0.318189
4                   155  0.319946
..                  ...       ...
152                   7  0.304522
153                   6  

In [19]:
## Abnormal People feature selection 
# Example usage:
best_data, feature_importances, rmses = find_best_feature_set(abnormal_people)

# Print results
print("Feature Importances:")
for importance in feature_importances:
    print(importance)

print("\nRMSE by Iteration:")
for rmse in rmses:
    print(rmse)

best_data.show()
initial_model, _, _ = train_random_forest_model(abnormal_people, abnormal_people.columns[:-1])
feature_importances = get_feature_importance(initial_model, abnormal_people.columns[:-1])

print("Feature Importances:")
for importance in feature_importances:
    print(importance)

Feature Importances:
('fy_glucose', 0.2986515227498077)
('sy_glucose', 0.1593994611398996)

RMSE by Iteration:
(159, 0.9955741080607478)
(158, 0.9903305882331017)
(157, 0.9893400812222163)
(156, 0.9634036693998633)
(155, 0.9854409908292406)
(154, 0.9604134051560624)
(153, 0.9627342989245797)
(152, 0.9728766366764358)
(151, 0.981579993713069)
(150, 1.0033754280804585)
(149, 0.9581916096170493)
(148, 1.0036881790082117)
(147, 0.9882688302807698)
(146, 0.9705135882504436)
(145, 0.9643586887745659)
(144, 0.9832310256686964)
(143, 0.9840750447636393)
(142, 0.9930056694892901)
(141, 0.9509899139004121)
(140, 0.9371858704517798)
(139, 0.9985207005472866)
(138, 0.9859488454306121)
(137, 0.9984032765618044)
(136, 1.002664582490392)
(135, 1.0085871880185986)
(134, 0.9920431347077772)
(133, 0.9953691089585951)
(132, 0.974143952765751)
(131, 0.9519895917709076)
(130, 0.95322275095418)
(129, 1.0225799106532751)
(128, 0.9849514932720395)
(127, 0.9687256034425996)
(126, 0.9758392461430682)
(125, 0.97

In [20]:

# Convert feature importances and RMSE list to DataFrame
feature_importance_df = pd.DataFrame(feature_importances, columns=['Feature', 'Importance'])
rmse_df = pd.DataFrame(rmses, columns=['Number of Features', 'RMSE'])

# Save DataFrames to CSV for use in papers
feature_importance_df.to_csv("feature_importances_ab.csv", index=False)
rmse_df.to_csv("rmse_by_iteration_ab.csv", index=False)

# Print DataFrames to screen (optional, for quick verification)
print("Feature Importances:\n", feature_importance_df)
print("\nRMSE by Iteration:\n", rmse_df)
best_feature_columns = best_data.columns

# Convert column names to a DataFrame
best_features_df = pd.DataFrame(best_feature_columns, columns=['Best Features'])

# Save the DataFrame to CSV
best_features_df.to_csv("best_features_ab.csv", index=False)

Feature Importances:
                                                Feature  Importance
0                                           fy_glucose    0.298652
1                                           sy_glucose    0.159399
2    fy_whole_blood_viscosity_values_2Pas_shear_rat...    0.036674
3                                         fy_carbamide    0.025523
4    fy_whole_blood_viscosity_values_1pas_the_shear...    0.021411
..                                                 ...         ...
154                                      sy_proportion    0.000000
155                                        sy_ph_value    0.000000
156                                       sy_2000_left    0.000000
157                                       sy_4000_left    0.000000
158                                      sy_6000_right    0.000000

[159 rows x 2 columns]

RMSE by Iteration:
      Number of Features      RMSE
0                   159  0.995574
1                   158  0.990331
2                   157  0.