In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

# Create a Spark session
spark = SparkSession.builder.appName("ColumnFiltering").getOrCreate()

# Adjust the file path
raw_file_path = "C:/Users/LLR User/Downloads/Dementia/Dementia/JanBDRcount.raw"

# List of columns to keep
columns_to_keep = [
    "FID", "PAT", "IID", "MAT", "SEX", "PHENOTYPE",
    "rs6656401", "rs6733839", "rs35349669", "rs190982", "rs9271192",
    "rs10948363", "rs1476679", "rs11771145", "rs2718058", "rs28834970",
    "rs9331896", "rs11218343", "rs10838725", "rs983392", "rs10792832",
    "rs17125944", "rs10498633", "rs4147929", "rs7274581", "rs429358", "rs7412"
]

# Read the text file and split each line into individual columns based on space delimiter
df = spark.read.text(raw_file_path)
df = df.withColumn("columns", split(df["value"], " "))

# Select only the columns to keep
df = df.selectExpr([f"columns[{i}] as {col}" for i, col in enumerate(columns_to_keep)])

# Convert columns to StringType
for col in columns_to_keep:
    df = df.withColumn(col, df[col].cast("string"))

# Define indexers for categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid="keep") for col in columns_to_keep]

# Define encoder
encoder = OneHotEncoder(inputCols=[f"{col}_index" for col in columns_to_keep], 
                        outputCols=[f"{col}_encoded" for col in columns_to_keep])

# Create a Pipelineb
pipeline = Pipeline(stages=indexers + [encoder])

# Fit the pipeline to the data
pipeline_model = pipeline.fit(df) 

# Apply the transformation
df_transformed = pipeline_model.transform(df)

# Show the DataFrame
df_transformed.show()


+-------------------+-----+---+---+---+---------+-----------+------------+-----------+-----------+------------+------------+-----------+-----------+-----------+-------------+-----------+-----------+-----------+-----------+-------------+-----------+-----------+-----------+-----------+------------+-----------+---------+---------+---------+---------+---------+---------------+---------------+---------------+----------------+--------------+---------------+----------------+---------------+----------------+---------------+----------------+---------------+----------------+----------------+--------------+----------------+----------------+----------------+---------------+---------------+--------------+------------+-----------------+-----------------+-------------+-------------+-------------+-----------------+-----------------+-----------------+------------------+----------------+-----------------+------------------+-----------------+------------------+-----------------+------------------+-------

In [2]:
pip install setuptools

Collecting setuptools
  Using cached setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Using cached setuptools-69.5.1-py3-none-any.whl (894 kB)
Installing collected packages: setuptools
Successfully installed setuptools-69.5.1
Note: you may need to restart the kernel to use updated packages.


ONE 

In [26]:
from pyspark.ml.feature import VectorAssembler

# Define input columns (excluding non-feature columns like FID, PAT, IID, etc.)
input_cols1 = [
    "SEX_encoded", "rs6656401_encoded", "rs6733839_encoded", "rs35349669_encoded", 
    "rs190982_encoded", "rs9271192_encoded", "rs10948363_encoded", "rs1476679_encoded", 
    "rs11771145_encoded", "rs2718058_encoded", "rs28834970_encoded", "rs9331896_encoded", 
    "rs11218343_encoded", "rs10838725_encoded", "rs983392_encoded", "rs10792832_encoded", 
    "rs17125944_encoded", "rs10498633_encoded", "rs4147929_encoded", "rs7274581_encoded", 
    "rs429358_encoded", "rs7412_encoded"
]

# Assemble the input features into a single vector column with a new name
assembler = VectorAssembler(inputCols=input_cols1, outputCol="features_new2")

# Apply the vector assembler to the transformed DataFrame
df_transformed = assembler.transform(df_transformed)

# Now split the transformed DataFrame into train and test sets
train_data, test_data = df_transformed.randomSplit([0.60, 0.40], seed=12345)


In [27]:
print("Train data count:", train_data.count())
print("Test data count:", test_data.count())

Train data count: 325
Test data count: 210


In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define the RandomForestClassifier with the featuresCol as "features_new" and labelCol as "PHENOTYPE_index"
rf = RandomForestClassifier(featuresCol="features_new", labelCol="PHENOTYPE_index")

# Train the model
rf_model = rf.fit(train_data)

# Make predictions
predictions = rf_model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="PHENOTYPE_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = {:.2f}%".format(accuracy * 100))


Test Accuracy = 65.71%


In [19]:
from pyspark.ml.classification import LogisticRegression

# Define the LogisticRegression model
lr = LogisticRegression(featuresCol="features_new", labelCol="PHENOTYPE_index")

# Train the model
lr_model = lr.fit(train_data)

# Make predictions
predictions_lr = lr_model.transform(test_data)

# Evaluate the model
accuracy_lr = evaluator.evaluate(predictions_lr)
print("Logistic Regression Test Accuracy = {:.2f}%".format(accuracy_lr * 100))


Logistic Regression Test Accuracy = 62.09%


In [20]:
from pyspark.ml.classification import DecisionTreeClassifier

# Define the DecisionTreeClassifier model
dt = DecisionTreeClassifier(featuresCol="features_new", labelCol="PHENOTYPE_index")

# Train the model
dt_model = dt.fit(train_data)

# Make predictions
predictions_dt = dt_model.transform(test_data)

# Evaluate the model
accuracy_dt = evaluator.evaluate(predictions_dt)
print("Decision Tree Classifier Test Accuracy = {:.2f}%".format(accuracy_dt * 100))

Decision Tree Classifier Test Accuracy = 68.13%
