In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -qq /content/drive/MyDrive/DS_Assignment_internship.zip

In [3]:
!pip install striprtf

Collecting striprtf
  Downloading striprtf-0.0.27-py3-none-any.whl.metadata (2.3 kB)
Downloading striprtf-0.0.27-py3-none-any.whl (7.6 kB)
Installing collected packages: striprtf
Successfully installed striprtf-0.0.27


In [4]:
# the file given is in rtf format
# we need json data
# hence need to convert the rtf to json and extract the data
# into python dictonary format

import json
from striprtf.striprtf import rtf_to_text
import logging

# Initializing the logger
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Here I define a fucntion to parse json from rtf

def parse_json_from_rtf(rtf_file_path):
  try:
    # read the rtf file
    with open(file_path, 'r', encoding = 'utf-8') as rtf_file:
      rtf_content = rtf_file.read()

    # converting the rtf to plain text
    plain_text = rtf_to_text(rtf_content)

    # load json from plain text
    json_data = json.loads(plain_text)

    # returning the json data
    return json_data
  except json.JSONDecodeError as e :
    print(f"Error parsing JSON from RTF: {e}")
    return None
  except Exception as e:
    print(f"An error occured: {e}")
    return None

# giving file path to the function and calling the function
file_path = '/content/Screening_Test_DS/algoparams_from_ui.json.rtf'
json_data = parse_json_from_rtf(file_path)

if json_data:
  print("Parsed JSON data:")
  print(json.dumps(json_data, indent = 4))
else:
  print("Failed to parse JSON")

Parsed JSON data:
{
    "session_name": "test",
    "session_description": "test",
    "design_state_data": {
        "session_info": {
            "project_id": "1",
            "experiment_id": "kkkk-11",
            "dataset": "iris_modified.csv",
            "session_name": "test",
            "session_description": "test"
        },
        "target": {
            "prediction_type": "Regression",
            "target": "petal_width",
            "type": "regression",
            "partitioning": true
        },
        "train": {
            "policy": "Split the dataset",
            "time_variable": "sepal_length",
            "sampling_method": "No sampling(whole data)",
            "split": "Randomly",
            "k_fold": false,
            "train_ratio": 0,
            "random_seed": 0
        },
        "metrics": {
            "optomize_model_hyperparameters_for": "AUC",
            "optimize_threshold_for": "F1 Score",
            "compute_lift_at": 0,
            "cost_mat

In [5]:
# Step 1: To read target and regression type
# For this step I have already parsed the json data
# Now I only need to read the data
# I can do that by creating a fucntion for it

# Here I have defined the function and given input as the json data
# that I extracted before

# Also the target and type are nested under the design state data
# hence I will have to consider that

def parse_target_and_regression(json_data):
  try:
    if "design_state_data" not in json_data and "target" not in json_data["design_state_data"]:
      raise ValueError("Json data is missing with keys 'design_state_data' and 'target'")

    # Now I need to navigate to the target in the dictionary
    target_data = json_data["design_state_data"]["target"]

    target = target_data.get("target")
    regression_type = target_data.get("type")

    if target and regression_type:
      logger.info(f"Target: {target}")
      logger.info(f"Regression Type: {regression_type}")
    else:
      logging.error("Failed to parse target and type")

    # lets validate both target and type
    if not target:
      raise ValueError(" 'target' filed missing in json data")
    if not regression_type:
      raise ValueError(" 'prediction_type' field missing in json data")

    return target, regression_type
  except Exception as e:
    logger.error(f"Error parsing target and type : {e}")
    return None, None

# Here I call the function
target, regression_type = parse_target_and_regression(json_data)


INFO:__main__:Target: petal_width
INFO:__main__:Regression Type: regression


In [6]:
# Step 2:
# Here I have to read the csv as well hence I will be importing pandas
# also from the parsed json data will look at the feature handling to
# determine the imputation strategy for each feature and apply it to the dataframe

import pandas as pd

# for this function I have given inputs as json data which is parsed data and dataframe

def parse_and_impute_data(df, json_data):
  try:
    # I will have to extract the feature handling details
    feature_handling = json_data["design_state_data"]["feature_handling"]

    # Here I will apply the imputation strategies
    for feature_name, feature_info in feature_handling.items():
      if feature_name in df.columns:
        # getting the feature name
        feature_details = feature_info.get("feature_details", {})

        # getting the impation method and imputation value
        imputation_method = feature_details.get("impute_with")
        imputation_value = feature_details.get("impute_value")

        # validate the extracted values
        print(f"processing feature : {feature_name}")
        print(f"Imputation method : {imputation_method}")
        print(f"Imputation value : {imputation_value}")

        # The json data shows that there are two imputation that is average of values
        # and custom. Hence lets apply those two methods of imputation
        if imputation_method == "Average of values":
            logger.info(f"Imputing {feature_name} with mean value. ")
            # applying mean imputation to dataframe
            df[feature_name] = df[feature_name].fillna(df[feature_name].mean())
        elif imputation_method == "custom":
            logger.info(f"Imputing {feature_name} with custom value: {imputation_value}")
            # applying custom imputation to dataframe
            df[feature_name] = df[feature_name].fillna(imputation_value)
        else:
            logger.warning(f"No valid imputation strategy for {feature_name}")
      else:
        logger.warning(f"Feature {feature_name} not found in dataset")

    return df
  except Exception as e:
    logger.error(f"Error occured during the process: {e}")
    return None

# Now lets specify the csv path
# The path can be changed based on the current working directory
csv_path = '/content/Screening_Test_DS/iris.csv'
df = pd.read_csv(csv_path)

# lets call the function
processed_df = parse_and_impute_data(df, json_data)

if processed_df is not None:
  logger.info("Data processing completed successfulyy")
  print(processed_df.head(20))
else:
  logger.error("Data processing failed")


INFO:__main__:Imputing sepal_length with mean value. 
INFO:__main__:Imputing sepal_width with custom value: -1
INFO:__main__:Imputing petal_length with mean value. 
INFO:__main__:Imputing petal_width with custom value: -2
INFO:__main__:Data processing completed successfulyy


processing feature : sepal_length
Imputation method : Average of values
Imputation value : 0
processing feature : sepal_width
Imputation method : custom
Imputation value : -1
processing feature : petal_length
Imputation method : Average of values
Imputation value : 0
processing feature : petal_width
Imputation method : custom
Imputation value : -2
processing feature : species
Imputation method : None
Imputation value : None
    sepal_length  sepal_width  petal_length  petal_width      species
0            5.1          3.5           1.4          0.2  Iris-setosa
1            4.9          3.0           1.4          0.2  Iris-setosa
2            4.7          3.2           1.3          0.2  Iris-setosa
3            4.6          3.1           1.5          0.2  Iris-setosa
4            5.0          3.6           1.4          0.2  Iris-setosa
5            5.4          3.9           1.7          0.4  Iris-setosa
6            4.6          3.4           1.4          0.3  Iris-setosa
7           

In [7]:
# Step 3:
# Here I have to compute feture reduction based on the input
# Currently tree based feature reduction is used
# I have to make sure to keep options for No reduction, corr with Target and PCA as well

# lets import the necessary libraries for all the above options to work
# pandas is already imported so no need to import again

import numpy as np
from sklearn.decomposition import PCA  # library for PCA
from sklearn.ensemble import RandomForestRegressor # library for Tree based
from sklearn.feature_selection import mutual_info_regression # library for corr with target
from sklearn.preprocessing import LabelEncoder


# if we check the csv the last column in the dataset is a non-numerical value
# hence the reduction wont work
# I will convert those non-numerical values using label encoding

def label_encode_target(df):
  # initializing the empty dict
  label_encoders = {}
  for col in df.select_dtypes(include =['object', 'category']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    return df, label_encoders

# considering the json data feature reduction falls under
# design state data --> feature reduction -->  feature reduction method

# here I have to apply feature reduction on the dataframe so
# giving input as dataframe and json data
def feature_reduction(df, json_data):
  try:
    reduction_method = json_data["design_state_data"]["feature_reduction"]["feature_reduction_method"]
    target_data = json_data["design_state_data"]["target"]["target"]

    # Encode the non-numerical columns
    df, _ = label_encode_target(df)

    # lets check each reduction method and apply it to the data
    # first is no reduction method
    if reduction_method == "No Reduction":
      logger.info("No feature reduction applied")
      return df

    # second is corr with target
    elif reduction_method == "Corr with Target":
      # check if target data is present
      if target_data not in df.columns:
        raise ValueError(f"Target {target_data} not found in dataset")

      # if target data is found compute correlation
      correlations = df.corrwith(df[target_data])

      # Keep top 5 correlation features
      top_features = correlations.abs().nlargest(5).index.tolist()
      logger.info(f"Selected top features : {top_features}")

      return df[top_features]

    # third is tree-based
    elif reduction_method == "Tree-based":
      # convert the num of features, num of trees and depth of trees to integer
      num_features = int(json_data["design_state_data"]["feature_reduction"]["num_of_features_to_keep"])
      num_trees = int(json_data["design_state_data"]["feature_reduction"]["num_of_trees"])
      max_depth = int(json_data["design_state_data"]["feature_reduction"]["depth_of_trees"])

      # check if target data is present
      if target_data not in df.columns:
        raise ValueError(f"Target {target_data} not present in dataset")

      # now we initialize the X, y for random forest regressor
      X = df.drop(columns = [target_data])
      y = df[target_data]

      # initialize the model
      model = RandomForestRegressor(n_estimators= num_trees, max_depth = max_depth)

      # fit the model
      model.fit(X,y)

      # get the important features from tree based
      importances = model.feature_importances_

      # get the feature importance dataframe
      feature_importance_df = pd.DataFrame({"feature": X.columns, "importance": importances})

      # get the top features
      top_features = feature_importance_df.nlargest(num_features, "importance")["feature"].tolist()
      logger.info(f"Selected top features: {top_features}")

      # return the dataframe
      return df[top_features + [target_data]]

    elif reduction_method == "PCA":
      # convert the num features to integer
      num_features = int(json_data["design_state_data"]["feature_reduction"]["num_of_features_to_keep"])

      # select the data for PCA
      X = df.select_dtypes(include = [np.number]).drop(columns = [target_data], errors = "ignore")

      # Initialize PCA
      pca = PCA(n_components= num_features)

      # fit PCA model
      pca_result = pca.fit_transform(X)

      # create dataframe for PCA components
      pca_df = pd.DataFrame(data = pca_result, columns = [f"PC{i + 1}" for i in range(num_features)])

      logger.info(f"PCA applied, number of components : {num_features}")

      if target_data in df.columns:
        pca_df[target_data] = df[target_data].values
        return pca_df

      else:
        raise ValueError(f"Invalid feature reduction method: {reduction_method}")

  except Exception as e:
    logger.error(f"Error occured during feature reduction: {e}")
    return None

# csv file path
csv_path = '/content/Screening_Test_DS/iris.csv'

df = pd.read_csv(csv_path)

reduced_df = feature_reduction(df, json_data)

if reduced_df is not None:
  logger.info("Feature reduction completed successfully")
  print(reduced_df.head(10))
else:
  logger.error("Feature reduction failed")






INFO:__main__:Selected top features: ['petal_length', 'species', 'sepal_width', 'sepal_length']
INFO:__main__:Feature reduction completed successfully


   petal_length  species  sepal_width  sepal_length  petal_width
0           1.4        0          3.5           5.1          0.2
1           1.4        0          3.0           4.9          0.2
2           1.3        0          3.2           4.7          0.2
3           1.5        0          3.1           4.6          0.2
4           1.4        0          3.6           5.0          0.2
5           1.7        0          3.9           5.4          0.4
6           1.4        0          3.4           4.6          0.3
7           1.5        0          3.4           5.0          0.2
8           1.4        0          2.9           4.4          0.2
9           1.5        0          3.1           4.9          0.1


In [35]:
# Step 4:
# Here the prediction type is given as regression
# hence I will have to create the model objects that can only handle regression algorithms
# The json data shows the regression algorithms used which are as follows:
# RandomForestRegressor, GBTRegressor, LinearRegression, LogisticRegression,
# RidgeRegression, LassoRegression, ElasticNetRegression, DecisionTreeRegressor.

# Lets import the necessary libraries
import inspect
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.tree import DecisionTreeRegressor

# Now I will define a fucntion to parse the JSON and create models

# Here we only require json data as input
def create_models(json_data):
  # We need to look at the prediction type and algorithms in the json data
  # prediction type is located at design_state_data --> target --> prediction_type
  # algorithms is located at design_state_data --> algorithms

  prediction_type = json_data["design_state_data"]["target"]["prediction_type"]
  algorithms = json_data["design_state_data"]["algorithms"]

  # Now lets define the mappings for each model and create a dict
  regression_model_mapping = {
      "RandomForestRegressor" : RandomForestRegressor,
      "GBTRegressor" : GradientBoostingRegressor,
      "LinearRegression" : LinearRegression,
      #"LogisticRegression" : LogisticRegression,
      "RidgeRegression" : Ridge,
      "LassoRegression" : Lasso,
      "ElasticNetRegression" : ElasticNet,
      "DecisionTreeRegressor" : DecisionTreeRegressor
  }

  # now lets initialize empty model dict
  models = {}
  if prediction_type == "Regression":

    for algo_name, algo_details in algorithms.items():

        # getting the model names
        model_class = regression_model_mapping.get(algo_name)

        # if model is not present in the model class then it is not a regression model
        if not model_class:
          logger.error(f" Algorithm {algo_name} is not a regression model")
          continue

        # creating valid params
        valid_params = inspect.signature(model_class).parameters

        # extracting the models
        model_params = {
                key : value for key, value in algorithms.items()
                if key in valid_params
            }
        # Lets map JSON filed to scikit-learn models
        if algo_name in regression_model_mapping:


          # lets ininitialize the models
          try :
            models[algo_name] = model_class(**model_params)
            logger.info(f"Model {algo_name} created successfully : {models[algo_name]}")
          except TypeError as e:
            print(f"Error creating model {algo_name} : {e}")
            models[algo_name] = None

        else:
          logger.error(f"Algorithm {algo_name} is not a regression model")
  else :
    logger.error(f"Invalid prediction type : {prediction_type}")
  return models

# lets call the function
models = create_models(json_data)

print("\nSummary of all models created : ")
# displaying the created models
for model_name, model_obj in models.items():
  if model_obj is not None:
    print(f"Created model : {model_name}")
  else:
    print(f"Failed to create model : {model_name}")



ERROR:__main__: Algorithm RandomForestClassifier is not a regression model
INFO:__main__:Model RandomForestRegressor created successfully : RandomForestRegressor()
ERROR:__main__: Algorithm GBTClassifier is not a regression model
INFO:__main__:Model GBTRegressor created successfully : GradientBoostingRegressor()
INFO:__main__:Model LinearRegression created successfully : LinearRegression()
ERROR:__main__: Algorithm LogisticRegression is not a regression model
INFO:__main__:Model RidgeRegression created successfully : Ridge()
INFO:__main__:Model LassoRegression created successfully : Lasso()
INFO:__main__:Model ElasticNetRegression created successfully : ElasticNet()
ERROR:__main__: Algorithm xg_boost is not a regression model
INFO:__main__:Model DecisionTreeRegressor created successfully : DecisionTreeRegressor()
ERROR:__main__: Algorithm DecisionTreeClassifier is not a regression model
ERROR:__main__: Algorithm SVM is not a regression model
ERROR:__main__: Algorithm SGD is not a regre


Summary of all models created : 
Created model : RandomForestRegressor
Created model : GBTRegressor
Created model : LinearRegression
Created model : RidgeRegression
Created model : LassoRegression
Created model : ElasticNetRegression
Created model : DecisionTreeRegressor


In [37]:
# Step 5:
# Now I have to run fit and predict on all the models in regression model mapping
# first I need to create the param grid that is relevant to gridSearchCV

# Let's create the param grid

def create_param_grid(json_data):
  # Here I will define json parameter names mapping to scikit learn parameter names
  param_name_mapping = {
      "min_trees" : "n_estimators",
      "max_trees" : "n_estimators",
      "min_depth" : "max_depth",
      "max_depth" : "max_depth",
      "min_samples_per_leaf_min_value" : "min_samples_leaf",
      "min_samples_per_leaf_max_value" : "min_samples_leaf",
      "min_regparam" : "alpha",
      "max_regparam" : "alpha",
      "min_elasticnet" : "l1_ratio",
      "max_elasticnet" : "l1_ratio",
  }

  # Initilializing empty parameter grid
  param_grid = {}

  # access algorithms in JSON data
  algorithms = json_data["design_state_data"]["algorithms"]

  for algo_name, algo_details in algorithms.items():

    # initialize dict for this model params
    model_params = {}

    for json_param, sklearn_param in param_name_mapping.items():
      if json_param in algo_details:
        # create range of values if applicable
        value = algo_details[json_param]
        if isinstance(value,list) and len(value) == 2:
          model_params[sklearn_param] = range(value[0] , value[1] + 1 )
        else:
          model_params[sklearn_param] = [value]

    if model_params:
      param_grid[algo_name] = model_params

    # condition to remove 'alpha' from linear regression
    if algo_name == "LinearRegression" and "alpha" in param_grid[algo_name]:
      del param_grid[algo_name]["alpha"]

    if algo_name == "LinearRegression" and "l1_ratio" in param_grid[algo_name]:
      del param_grid[algo_name]["l1_ratio"]

  return param_grid


# now lets call the function and create param grid

param_grid = create_param_grid(json_data)

# display param grid
print("\n Param Grid: ")
for model_name, grid in param_grid.items():
  print(f"{model_name} : {grid}")



 Param Grid: 
RandomForestClassifier : {'n_estimators': [30], 'max_depth': [30], 'min_samples_leaf': [50]}
RandomForestRegressor : {'n_estimators': [20], 'max_depth': [25], 'min_samples_leaf': [10]}
GBTClassifier : {'max_depth': [7]}
GBTRegressor : {'max_depth': [7]}
LinearRegression : {}
LogisticRegression : {'alpha': [0.8], 'l1_ratio': [0.8]}
RidgeRegression : {'alpha': [0.8]}
LassoRegression : {'alpha': [0.8]}
ElasticNetRegression : {'alpha': [0.8], 'l1_ratio': [0.8]}
DecisionTreeRegressor : {'max_depth': [7]}
DecisionTreeClassifier : {'max_depth': [7]}
extra_random_trees : {'max_depth': range(12, 46)}


In [38]:
# Step 6:
# Running the grid search CV and getting all the metrics such mean squared error and R2 score
# as we are using regression models only

# importing libraries
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# defining the fit and predict function
def fit_and_predict(models, param_grid, X_train, y_train, X_test, y_test):
  best_model = None
  best_score = float("inf") # this as we want the lowest mse score for best model
  # empty dict to store best metrics
  best_metrics = {}
  # empty dict to store results
  results = {}

  # iterating thorugh all modles in regression model mapping in step 4
  for model_name, model in models.items():
    if model is None:
      continue

    # get the parameter grid for the model
    grid = param_grid.get(model_name, {})

    # perform grid search Cv
    grid_search = GridSearchCV(model, grid,cv=5, scoring = "neg_mean_squared_error", n_jobs = -1)

    # fit the model to training data
    grid_search.fit(X_train, y_train)

    # best model
    best_model_fit = grid_search.best_estimator_

    # best params
    best_params = grid_search.best_params_

    # predict with the best model
    y_pred = best_model_fit.predict(X_test)

    # calculate mean squared error
    mse = mean_squared_error(y_test, y_pred)

    # best r2 score
    best_r2 = best_model_fit.score(X_test, y_test)


    # save results for all the models
    results[model_name] = {
        "Best model" : best_model,
        "Best hyperparameters" : best_params,
        "Mean square error" : mse,
        "R2 score" : best_r2,
    }

    # track best model based on mse
    if mse < best_score:
      best_score = mse
      best_metrics = results[model_name]
      best_model = best_model_fit

  return results, best_metrics, best_model

# now we need to give the data and convert it using train test split

csv_path = '/content/Screening_Test_DS/iris.csv'

# loading the data frame
df = pd.read_csv(csv_path)

# converting the non-numerical columns
# not imported library as it is already imported above
le = LabelEncoder()
df["species_encoded"] = le.fit_transform(df["species"])

target_data = json_data["design_state_data"]["target"]["target"]
# splitting the data frame into X and y using target
X = df[['sepal_length', 'sepal_width', 'petal_length', 'species_encoded']]
y = df['petal_width']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# call the fit and predict function
results, best_metrics, best_model = fit_and_predict(models, param_grid, X_train, y_train, X_test, y_test)

# display results

print("\n Results for each model: ")

for model_name, metrics in results.items():
  print(f"\n{model_name}")
  for metric, value in metrics.items():
    print(f"{metric} : {value}")

print(f"\n Best model and performance metrics : ")
print(f"Best model : {best_model}")
print(f"Best hyperparameters : {best_metrics['Best hyperparameters']}")
print(f"Mean squared error : {best_metrics['Mean square error']}")
print(f"Best R-squared : {best_metrics['R2 score']}")



 Results for each model: 

RandomForestRegressor
Best model : None
Best hyperparameters : {'max_depth': 25, 'min_samples_leaf': 10, 'n_estimators': 20}
Mean square error : 0.024283591141603122
R2 score : 0.9617975632724872

GBTRegressor
Best model : RandomForestRegressor(max_depth=25, min_samples_leaf=10, n_estimators=20)
Best hyperparameters : {'max_depth': 7}
Mean square error : 0.042704815139070366
R2 score : 0.932817679691721

LinearRegression
Best model : RandomForestRegressor(max_depth=25, min_samples_leaf=10, n_estimators=20)
Best hyperparameters : {}
Mean square error : 0.03011497295063442
R2 score : 0.9526237556056373

RidgeRegression
Best model : RandomForestRegressor(max_depth=25, min_samples_leaf=10, n_estimators=20)
Best hyperparameters : {'alpha': 0.8}
Mean square error : 0.02962541534185668
R2 score : 0.9533939173772116

LassoRegression
Best model : RandomForestRegressor(max_depth=25, min_samples_leaf=10, n_estimators=20)
Best hyperparameters : {'alpha': 0.8}
Mean squar