In [None]:
import os

# Connection settings for the Azure Blob Storage account that holds the CSV.
# Both values are taken from the environment when present so that a real
# access key never has to be pasted into (and saved with) the notebook.
# The string literals are placeholders only and are used as fallbacks.
storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "containername")
storage_account_access_key = os.environ.get("AZURE_STORAGE_ACCESS_KEY", "accesskey")

# wasbs URL layout: wasbs://<container>@<account>.blob.core.windows.net/<blob path>
# Here the container is "mlstorage" and the blob is "PreppedMLData.csv".
blob_host = f"{storage_account_name}.blob.core.windows.net"
file_location = f"wasbs://mlstorage@{blob_host}/PreppedMLData.csv"
file_type = "csv"

# Register the account key on the Spark session under the per-account
# "fs.azure.account.key.<host>" setting before reading from the wasbs:// path.
spark.conf.set(f"fs.azure.account.key.{blob_host}", storage_account_access_key)

# Read the CSV with the first row as the header and with schema inference on.
remote_table = (
    spark.read.format(file_type)
    .option("inferSchema", "true")
    .option("header", "true")
    .load(file_location)
)




In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd

# Preprocessing cell: expects `remote_table` (a Spark DataFrame) to already
# exist in the session.

# Remove the unused columns, then discard every row that still has a null.
columns_to_drop = ['Satisfactionscore', 'CardType', 'PointEarned']
remote_table = remote_table.drop(*columns_to_drop).na.drop()

# Input columns, grouped by how they are prepared.
categorical_cols = ['Geography', 'Gender']
numeric_cols = ["Age", "Tenure", "Balance", "NumOfProducts", "HasCrCard", "EstimatedSalary"]

# One StringIndexer per categorical column; each writes to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed")
    for name in categorical_cols
]

# Assemble the indexed categorical columns followed by the numeric columns
# into a single "features" vector, then min-max scale it into "scaledFeatures".
indexed_cols = [stage.getOutputCol() for stage in indexers]
assembler = VectorAssembler(inputCols=indexed_cols + numeric_cols, outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Stage order: all indexers, then the assembler, then the scaler.
pipeline = Pipeline(stages=[*indexers, assembler, scaler])

# Fit on the cleaned table and apply the fitted pipeline to that same table.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below — confirm this is acceptable for the intended evaluation.
pipeline_model = pipeline.fit(remote_table)
transformed_df = pipeline_model.transform(remote_table)

# Keep only the scaled feature vector and the target column.
final_df = transformed_df.select("scaledFeatures", "CreditScore")

# Arrow-based conversion is switched off for the session before the frame
# is collected into pandas for scikit-learn.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
pandas_df = final_df.toPandas()

# Features become one column per vector element; the target is CreditScore.
# Hold out 20% of the rows for testing, with a fixed seed.
X = pd.DataFrame(pandas_df["scaledFeatures"].to_list())
y = pandas_df["CreditScore"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# X_train / X_test / y_train / y_test are now ready for scikit-learn models.

Downloading artifacts:   0%|          | 0/25 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Requires `final_df`: the Spark DataFrame holding 'scaledFeatures' and
# 'CreditScore'. Collect it as a pandas DataFrame.
pandas_df = final_df.toPandas()

# Replace each feature vector with a plain Python list of its values.
pandas_df['scaledFeatures'] = pandas_df['scaledFeatures'].apply(
    lambda vec: vec.toArray().tolist()
)

# One column per feature for X; CreditScore is the regression target.
# 80/20 train/test split with a fixed seed.
X = pd.DataFrame(pandas_df['scaledFeatures'].tolist())
y = pandas_df['CreditScore']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build a 100-tree random forest regressor (seeded) and fit it on the
# training split; `fit` returns the estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
predictions = rf_model.predict(X_test)

# Regression metrics on the test split. RMSE is the square root of MSE;
# an R2 below zero means the fit is worse than always predicting the mean.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2 ): {r2}")

# Actual vs. predicted scores side by side, re-indexed from 0 so the rows
# no longer carry the shuffled index inherited from y_test.
results_df = pd.DataFrame(
    {
        'Actual Credit Score': y_test,
        'Predicted Credit Score': predictions,
    }
).reset_index(drop=True)

# Show the comparison table.
print(results_df)

# Convert the comparison table into a Spark DataFrame.
results_spark_df = spark.createDataFrame(results_df)



Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mean Squared Error (MSE): 10391.90274
Root Mean Squared Error (RMSE): 101.94068245798631
R-squared (R2 ): -0.25139004881514504
    Actual Credit Score  Predicted Credit Score
0                   758                  700.92
1                   493                  561.29
2                   479                  655.16
3                   813                  633.50
4                   738                  555.15
5                   603                  601.62
6                   722                  655.30
7                   625                  553.92
8                   653                  595.69
9                   619                  560.01
10                  574                  609.93
11                  637                  644.92
12                  668                  669.00
13                  601                  580.76
14                  663                  773.44
15                  822                  596.79
16                  699                  703.36
17       