In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error


# 1. Data Loading and Preprocessing


In [4]:

# Load the registered 'smartcreditDW' dataset from the Azure ML workspace into pandas.
#
# Requirements:
#   - azureml-core of version 1.0.72 or higher
#   - azureml-dataprep[pandas] of version 1.1.34 or higher
#   - azureml-dataset-runtime (to_pandas_dataframe() raises ImportError without it)
from azureml.core import Workspace, Dataset

subscription_id = '40cb260c-04c0-4f19-90c8-a708b6e6b8f1'
resource_group = 'smartcredit'
workspace_name = 'smartcreditml'

# Constructing the Workspace triggers interactive authentication on first use.
workspace = Workspace(subscription_id, resource_group, workspace_name)

# Dataset.get_by_name returns a dataset handle, not a DataFrame. The later cells
# index columns and call pd.get_dummies on `merged_data`, so the materialised
# DataFrame (not the handle) must be bound to that name.
smartcredit_dataset = Dataset.get_by_name(workspace, name='smartcreditDW')
merged_data = smartcredit_dataset.to_pandas_dataframe()

# Preview only the first rows instead of dumping the whole frame into the output.
merged_data.head()

Performing interactive authentication. Please follow the instructions on the terminal.


The default web browser has been opened at https://login.microsoftonline.com/organizations/oauth2/v2.0/authorize. Please continue the login in the web browser. If no web browser is available or if the web browser fails to open, use device code flow with `az login --use-device-code`.


Interactive authentication successfully completed.


2023-11-13 14:24:17.979449 | ActivityCompleted: Activity=_dataflow, HowEnded=Failure, Duration=0.2 [ms], Info = {'activity_id': '10ee88e7-752f-4a37-aa04-8a9339e76b26', 'activity_name': '_dataflow', 'activity_type': 'InternalCall', 'app_name': 'dataset', 'source': 'azureml.dataset', 'version': '1.54.0', 'dataprepVersion': '', 'sparkVersion': '', 'subscription': '', 'run_id': '', 'resource_group': '', 'workspace_name': '', 'experiment_id': '', 'location': '', 'completionStatus': 'Success', 'durationMs': 0.08}, Exception=ImportError; Missing required package "azureml-dataset-runtime", which can be installed by running: "/usr/local/bin/python3" -m pip install azureml-dataset-runtime --upgrade
Failed to fetch RSLex YAML representation for dataset=unregistered-73779a88-9150-49dd-8adb-9900df90a420 from workspace=Workspace.create(name='smartcreditml', subscription_id='40cb260c-04c0-4f19-90c8-a708b6e6b8f1', resource_group='smartcredit')=, got error: 'Missing required package "azureml-dataset-ru

ImportError: Missing required package "azureml-dataset-runtime", which can be installed by running: "/usr/local/bin/python3" -m pip install azureml-dataset-runtime --upgrade

# 2. Feature Engineering

In [None]:
# One-hot encode the categorical columns so the model receives numeric inputs.
encoded_data = pd.get_dummies(merged_data, columns=['home_ownership', 'emp_length'])

# Separate the features from the target column.
X = encoded_data.drop(columns=['label'])
y = merged_data['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply the same transform to the
# test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Impute missing values with the per-column mean of the *training* split.
# StandardScaler leaves NaNs in place, so they must be filled before modelling.
# Every column is imputed in both splits: a NaN that appears only in the test
# split would otherwise survive and make predict() fail.
train_nan_mask = np.isnan(X_train_scaled)
observed_counts = (~train_nan_mask).sum(axis=0)
observed_sums = np.where(train_nan_mask, 0.0, X_train_scaled).sum(axis=0)
# Columns with no observed training value have no mean; fall back to 0.0 for them
# (the centre of a standardised feature) instead of propagating NaN.
train_col_means = np.divide(
    observed_sums,
    observed_counts,
    out=np.zeros(X_train_scaled.shape[1]),
    where=observed_counts > 0,
)
X_train_scaled = np.where(train_nan_mask, train_col_means, X_train_scaled)
X_test_scaled = np.where(np.isnan(X_test_scaled), train_col_means, X_test_scaled)

# Sanity checks: the model input must be free of missing values.
assert not np.isnan(X_train_scaled).any(), "NaNs remain in the training features"
assert not np.isnan(X_test_scaled).any(), "NaNs remain in the test features"

# 3. Model Training and Evaluation

In [None]:
# Train a logistic regression classifier on the scaled training features.
# max_iter is raised to 1000 to give the solver more iterations to converge;
# random_state is fixed for reproducibility.
classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split.
y_pred = classifier.predict(X_test_scaled)

# MSE is kept for continuity with earlier results, but it is a regression metric
# and requires numeric labels.
# NOTE(review): if `label` is coded 0/1, this value equals the misclassification
# rate - confirm the label encoding.
mse = mean_squared_error(y_test, y_pred)

# Mean accuracy is the natural headline metric for a classifier.
accuracy = classifier.score(X_test_scaled, y_test)

# Display the metrics as the cell output instead of leaving them unseen.
pd.Series({'mse': mse, 'accuracy': accuracy}, name='test_metrics')