# Using PySpark to predict if a compound is an active inhibitor


This notebook is available on Google Colab:
https://colab.research.google.com/drive/11lfOdUjGPZ4jMcPjkbwCb512sRpwpyo8?usp=sharing



In [None]:
# pip install pyspark

In [None]:
import pyspark

In [None]:
import pandas as pd
# Preview the CSV with pandas; index_col=[0] makes the first CSV column the row index.
# NOTE(review): `df` is not referenced again in the cells shown here; the Spark
# DataFrames below are loaded separately.
df = pd.read_csv('/content/drive/MyDrive/data_classification_smote.csv', index_col=[0])
# Last expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) the Spark session for this notebook under the app name 'G9a_clsf'.
spark = (
    SparkSession.builder
    .appName('G9a_clsf')
    .getOrCreate()
)

In [None]:
# Evaluate the session object as the cell's last expression so the notebook renders it.
spark

In [None]:
# Read the CSV into a Spark DataFrame (header row as column names, column types
# inferred), then discard the '_c0' column.
# NOTE(review): '_c0' is presumably the unnamed index column saved with the CSV - confirm.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('/content/drive/MyDrive/data_classification_smote.csv', inferSchema=True)
    .drop('_c0')
)

In [None]:
# Print the first 5 rows as a sanity check of the load.
df_pyspark.show(5)

In [None]:
# Print column names and the types produced by inferSchema.
df_pyspark.printSchema()

In [None]:
# Load the "noSolub" variant of the dataset, which is the one used for modelling below.
# Options: first row is the header, column types are inferred, fields are comma-separated.
# The '_c0' column is dropped right away.
# NOTE(review): '_c0' is presumably the unnamed index column saved with the CSV - confirm.
df_g9a = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("sep", ",")
    .csv('/content/drive/MyDrive/data_classification_smote_noSolub.csv')
    .drop('_c0')
)

In [None]:
# Report the dataset shape; count() triggers a Spark job, the column count does not.
row_total = df_g9a.count()
column_total = len(df_g9a.columns)
print("There are", row_total, "rows", column_total, "columns in the data.")

In [None]:
# Label column, and every remaining column treated as an independent feature.
target_feature = 'target'
indep_features = [name for name in df_g9a.columns if name != target_feature]

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

In [None]:
# Empty list of pipeline stages; stages are appended below and the list is
# then handed to Pipeline(stages=...).
stages_list = []

In [None]:
# Assemble all independent feature columns into a single vector column named
# "features", which the classifier below reads via featuresCol="features".
assembler = VectorAssembler(inputCols=indep_features, outputCol="features")

In [None]:
# Register the assembler as a pipeline stage (in-place update of the list).
stages_list.append(assembler)

In [None]:
# Create a pipeline from the collected stages. As the cells are written, the
# only stage at this point is the VectorAssembler.
pipeline = Pipeline(stages=stages_list)

In [None]:
# Fit the pipeline on the full dataset (df_g9a); the train/test split is done
# afterwards on the transformed data, not before fitting.
df_pipeline= pipeline.fit(df_g9a)

In [None]:
# Apply the fitted pipeline to the full dataset (df_g9a), adding the assembled
# "features" vector column.
df_g9a_transformed = df_pipeline.transform(df_g9a)

In [None]:
# Split the transformed data into training and testing sets with weights
# 0.8 / 0.2; the split is random, so the proportions are approximate. The seed
# is fixed (1234) for the split.
train_data, test_data = df_g9a_transformed.randomSplit([0.8, 0.2], seed=1234)

In [None]:
# Print the first 5 training rows to check the assembled "features" column.
train_data.show(n=5)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Configure a small random forest (3 trees, max depth 2) and fit it on the
# training split. Note that `rfc` holds the *fitted* model, not the estimator.
rfc = (
    RandomForestClassifier(
        featuresCol="features",
        labelCol=target_feature,
        numTrees=3,
        maxDepth=2,
    )
    .fit(train_data)
)

# Score the held-out test split with the fitted model.
rfc_predictions = rfc.transform(test_data)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the model on the test-set predictions.
# Only labelCol is set; every other evaluator parameter is left at its default.
# NOTE(review): the printed label assumes the default metric is area under the
# ROC curve - confirm against the PySpark version in use.
evaluator = BinaryClassificationEvaluator(labelCol=target_feature)
area_under_curve = evaluator.evaluate(rfc_predictions)
print(f"Area under ROC curve: {area_under_curve}")

In [None]:
import matplotlib.pyplot as plt

# ROC curve taken from the fitted model's training summary, i.e. it describes
# the training data rather than the held-out test split.
trainingSummary = rfc.summary
lrROC = trainingSummary.roc.toPandas()

# FPR is plotted on the x-axis and TPR on the y-axis, so the axis labels must
# follow the same order (they were previously swapped).
plt.plot(lrROC['FPR'], lrROC['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))


In [None]:
# Precision-recall curve from the training summary created in the ROC cell:
# recall on the x-axis, precision on the y-axis.
pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'], pr['precision'])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()