<h1 style = "color : navy;"> QSAR Model Building & Evaluation_MF</h1>

## Suppress any warnings for all the upcoming cells
- use this function with the "ignore" argument to ignore all warnings

In [1]:
import warnings
warnings.filterwarnings('ignore')

# If there is a specific type of warning that you want to ignore, you can specify this type in the filterwarnings function
warnings.filterwarnings('ignore', category=UserWarning) # You would replace UserWarning with the specific warning class
                                                        # you wish to ignore.

# For the case of ignoring warnings from specific libraries, you can add the module parameter:
warnings.filterwarnings('ignore', module='numpy')  # Ignore warnings from numpy


<h1 style = " color : red ;" > Required library</h1>

In [1]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

<h1 style="color:red;"> Machine Learning Model Building and Testing  </h1>

<h2 style="color:red;">Data Manipulation for machine learning model building</h2>

<h4 style="color:blue;">Read the "_MLMB_df_no_zeros_no_nans4x4.csv" data frame</h4>

In [2]:
# read the ""unique_SMILES_abs_max_zscore_rat.csv"" data frame 
MLMB_df_no_zeros_no_nans4x4 = pd.read_csv("MLMB_df_no_zeros_no_nans4x4.csv")

# print the data frame
print(MLMB_df_no_zeros_no_nans4x4.shape)
MLMB_df_no_zeros_no_nans4x4.head(5)

(2030, 194)


Unnamed: 0,SMILES,zscore,Target,TSSPECIES,SEX,nAtom,nHeavyAtom,nHetero,ATS0dv,ATS1dv,...,SRW06,SRW08,SRW10,TSRW10,MW,AMW,WPath,Zagreb1,Zagreb2,mZagreb2
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,-44.513706,toxic,DOG,M,47,28,8,383.604938,425.111111,...,7.058758,8.686261,10.35917,78.040268,394.130887,8.385764,1873.0,156,189,6.027778
1,COc1cc(N(C)CCN(C)C)c(NC(=O)C=C)cc1Nc2nccc(n2)-...,-40.997577,toxic,MOUSE,M,70,37,9,496.0,522.0,...,7.236339,8.843471,10.503724,87.974766,499.269573,7.132422,4557.0,192,226,8.277778
2,CNC(=O)c1cn(C[C@H](F)CCc2ccc(NC(=O)Cc3cc(OC4CC...,-28.290163,toxic,DOG,F,65,38,14,670.0,600.0,...,7.296413,8.886686,10.515235,88.468481,532.215821,8.187936,6289.0,198,225,8.194444
3,Cn1nc(nc1Nc2ccc3[nH]ncc3c2C4CC4)-c5ccc(cc5)C(=...,-18.0,toxic,DOG,M,53,32,10,520.0,508.0,...,7.167809,8.782783,10.447642,88.51931,437.177565,8.248633,3319.0,176,211,6.861111
4,Cc1cc(Nc2cc(N)ncn2)c(=O)n3c1C(=O)NC34CCCCC4,-15.928106,toxic,DOG,F,45,25,8,364.0,386.0,...,6.99851,8.683555,10.425757,75.161421,340.164774,7.559217,1390.0,140,171,5.347222


In [6]:
# count the unique values in the column TSSPECIES

unique_counts = MLMB_df_no_zeros_no_nans4x4['TSSPECIES'].value_counts()
print("Count of each unique value:")
print(unique_counts)



Count of each unique value:
RAT       871
DOG       546
MOUSE     269
MONKEY    223
PIG        76
RABBIT     41
Monkey      2
Rat         2
Name: TSSPECIES, dtype: int64


In [4]:
# check the data type of a column
print(MLMB_df_no_zeros_no_nans4x4['ATS0dv'].dtype)

float64


In [171]:
# Count the unique values in the "Target" column
target_counts = MLMB_df_no_zeros_no_nans4x4['Target'].value_counts()

# Print the unique value counts
print(target_counts)


toxic         878
mild_toxic    725
non_toxic     427
Name: Target, dtype: int64


<h3 style="color:blue;"> Create new data frame by dropping 'SMILES' & 'zscore' column  </h3>

In [6]:
# Define a new data frame 
df1 = MLMB_df_no_zeros_no_nans4x4 

# Create a new DataFrame without the first two columns
df = df1.iloc[:, 2:] 

print ("The shape of the df:", df.shape)
print(df.shape)
#print(df.head(1))
print(df.iloc[:3, :5])

The shape of the df: (2030, 192)
(2030, 192)
  Target TSSPECIES SEX  nAtom  nHeavyAtom
0  toxic       DOG   M     47          28
1  toxic     MOUSE   M     70          37
2  toxic       DOG   F     65          38


<h3 style="color:magenta;"> 'Target' column encoding by Ordinal encoding </h3>

In [7]:
import category_encoders as ce

# Define an ordinal mapping for your classes
ordinal_mapping = {'non_toxic': 0, 'mild_toxic': 1, 'toxic': 2}

# Initialize encoder
encoder = ce.OrdinalEncoder(mapping=[{'col': 'Target', 'mapping': ordinal_mapping}])

# Apply encoder to 'target' column
df['Target'] = encoder.fit_transform(df['Target'])

# Now 'target' column is ordinal encoded
#print(df['target'])
print ("The shape of the df:", df.shape)
#print (df_no_species.head(3))
print(df.iloc[:3, :3])


The shape of the df: (2030, 192)
   Target TSSPECIES SEX
0       2       DOG   M
1       2     MOUSE   M
2       2       DOG   F


<h3 style="color:magenta;"> 'TSSPECIES' &  'SEX'column encoding by onehot encoding </h3>

In [8]:
# Perform one-hot encoding for 'TSSPECIES' and 'SEX'
df_encoded = pd.get_dummies(df, columns=['TSSPECIES', 'SEX'])

# Print the updated dataframe
print(df_encoded.shape)
df_encoded.head(3)

(2030, 200)


Unnamed: 0,Target,nAtom,nHeavyAtom,nHetero,ATS0dv,ATS1dv,ATS0d,ATS1d,ATS0Z,ATS1Z,...,TSSPECIES_DOG,TSSPECIES_MONKEY,TSSPECIES_MOUSE,TSSPECIES_Monkey,TSSPECIES_PIG,TSSPECIES_RABBIT,TSSPECIES_RAT,TSSPECIES_Rat,SEX_F,SEX_M
0,2,47,28,8,383.604938,425.111111,175,224,1386,1437,...,1,0,0,0,0,0,0,0,0,1
1,2,70,37,9,496.0,522.0,225,275,1512,1778,...,0,0,1,0,0,0,0,0,0,1
2,2,65,38,14,670.0,600.0,225,275,1718,1847,...,1,0,0,0,0,0,0,0,1,0


<h3 style="color:blue;"> "Model  Data frame" and "Validation Data frame" generation </h3>
    <ul style = " color : magenta;">
        <li> Select few rows from each "unique target class"</li>
        <li>DataFrame.sample() function to randomly sample rows from the DataFrame</li>
    </ul>

In [172]:
# Validation data separation from the data frame
# Create a DataFrame of 'non_toxic' rows

df_non_toxic = df_encoded[df_encoded['Target'] == 0]

# Create a DataFrame of 'toxic' rows
df_mild_toxic = df_encoded[df_encoded['Target'] == 1]

# Create a DataFrame of 'toxic' rows
df_toxic = df_encoded[df_encoded['Target'] == 2]

# Randomly sample 9 'non_toxic' rows and 1 'toxic' row
df2_validation = pd.concat([df_non_toxic.sample(7, random_state=42), df_mild_toxic.sample(15, random_state=42),df_toxic.sample(28, random_state=42)])

# Create df2 as all rows from df not in df1
df1_model = df_encoded.drop(df2_validation.index)

print("Model DataFrame:", df1_model.shape )
#print(df1_model)
print(df1_model.iloc[:3, :6])

print("Validation DataFrame:", df2_validation.shape )
#print(df1_model)
print(df2_validation.iloc[:3, :6])

Model DataFrame: (1980, 200)
   Target  nAtom  nHeavyAtom  nHetero      ATS0dv      ATS1dv
0       2     47          28        8  383.604938  425.111111
1       2     70          37        9  496.000000  522.000000
2       2     65          38       14  670.000000  600.000000
Validation DataFrame: (50, 200)
      Target  nAtom  nHeavyAtom  nHetero      ATS0dv      ATS1dv
2022       0     58          27        4  254.790123  266.000000
1678       0     66          43       15  693.209876  689.222222
1780       0     51          33       10  521.604938  537.111111


In [166]:
# if 'TSSPECIES_MOUSE' in df1_model.columns:
#     print("Column 'TSSPECIES_MOUSE' is present in df_model.")
# else:
#     print("Column 'TSSPECIES_MOUSE' is not present in df_model.")


<h1 style="color:red;"> Different Machine Learning Model building and testing :  </h1>

<h3 style="color:blue;">List of Several Mchine Learning Model Built and Tested:</h3>
<ol style="color:magenta;">
    <li>DummyClassifier</li>
    <li>DummyClassifier</li>
    <li>RandomForestClassifier</li>
    <li>GradientBoostingClassifier</li>
    <li>SVC</li>
    <li>KNeighborsClassifier</li>
    <li>DecisionTreeClassifier(</li>
    <li>LogisticRegression</li>
    <li>xgb.XGBClassifier</li>
    <li>AdaBoostClassifier</li>
    <li>GaussianNB</li>
    <li>MLPClassifier</li>
    <li>LGBMClassifier</li>
    <li>CatBoostClassifier</li>
    <li>ExtraTreesClassifier</li>    
</ol>

<h2 style="color:blue;">......................"Code" for Combined Machine Learning Models .........</h2>

<h3 style="color:brown;"> Multiple Models built and tested Combinedly :  </h3>

In [101]:

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models
models = [
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    SVC(random_state=42),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(random_state=42),
    LogisticRegression(random_state=42),
    xgb.XGBClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    MLPClassifier(random_state=42),
    LGBMClassifier(random_state=42),
    CatBoostClassifier(random_state=42, verbose=False),
    ExtraTreesClassifier(random_state=42)
]

# Initialize empty lists for storing the results
model_names = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

# Iterate over the models
for model in models:
    # Create and train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1_score = report['macro avg']['f1-score']
    confusion_mat = confusion_matrix(y_test, y_pred)

    # Append the results to the lists
    model_names.append(type(model).__name__)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1_score)
    confusion_matrices.append(confusion_mat)

# Create a DataFrame from the results
results_df = pd.DataFrame({'Model': model_names, 'Precision': precisions,
                           'Recall': recalls, 'F1-score': f1_scores,
                           'Confusion Matrix': confusion_matrices})

# Print the results DataFrame
#print(results_df)


In [174]:
results_df

Unnamed: 0,Model,Precision,Recall,F1-score,Confusion Matrix
0,RandomForestClassifier,0.488688,0.487683,0.488064,"[[30, 32, 21], [34, 65, 44], [15, 45, 110]]"
1,GradientBoostingClassifier,0.452532,0.447546,0.447236,"[[23, 28, 32], [30, 59, 54], [12, 47, 111]]"
2,SVC,0.28853,0.345382,0.224965,"[[3, 0, 80], [4, 0, 139], [0, 0, 170]]"
3,KNeighborsClassifier,0.399293,0.401439,0.398865,"[[28, 24, 31], [41, 55, 47], [34, 54, 82]]"
4,DecisionTreeClassifier,0.415574,0.415142,0.415183,"[[25, 30, 28], [36, 61, 46], [24, 58, 88]]"
5,LogisticRegression,0.253605,0.333428,0.206686,"[[1, 0, 82], [0, 0, 143], [2, 0, 168]]"
6,XGBClassifier,0.501956,0.500188,0.50083,"[[30, 33, 20], [34, 67, 42], [13, 43, 114]]"
7,AdaBoostClassifier,0.438481,0.430545,0.431706,"[[22, 29, 32], [25, 61, 57], [15, 53, 102]]"
8,GaussianNB,0.327189,0.335355,0.240681,"[[5, 69, 9], [8, 126, 9], [8, 151, 11]]"
9,MLPClassifier,0.384153,0.359126,0.32427,"[[9, 9, 65], [13, 25, 105], [8, 27, 135]]"


<h2 style="color:red;"> Combined Model validation </h2>
<ul style="color:blue;"> 
    <li>Prediction on the selected validation data for all Models </LI>
   </ul>

In [408]:
# Combined Model validation code 

# Detailed Format

# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# List of models
models = [
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    SVC(random_state=42),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(random_state=42),
    LogisticRegression(random_state=42),
    xgb.XGBClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    MLPClassifier(random_state=42),
    LGBMClassifier(random_state=42),
    CatBoostClassifier(random_state=42, verbose=False),
    ExtraTreesClassifier(random_state=42)
]

# Initialize an empty DataFrame for storing the results
results_dfs = []

# Iterate over the models
for model in models:
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_val_pred = model.predict(X_val)

    # Convert the arrays to 1-dimensional if needed
    y_val = y_val.ravel()
    y_val_pred = y_val_pred.ravel()

    # Create a dataframe with 'y_val' and 'y_val_pred' as columns
    model_results = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})
    
    # Add 'comment' column based on the condition
    model_results['Prediction'] = ['No' if y_true != y_pred else 'Yes' for y_true, y_pred in zip(y_val, y_val_pred)]
    
    # Set the model name as the index of the model_results DataFrame
    model_results.index = [type(model).__name__] * len(model_results)

    # Append the results DataFrame to the list
    results_dfs.append(model_results) # concatenates  multiple dataf frmaes into a single data frame
    
# pd.concat(results_dfs) is concatenating the list of DataFrames 'results_dfs' into a single DataFrame 'results_df_val'   
# The concat() method takes a list of DataFrames as its input and concatenates them into a single DataFrame.

# Concatenate the results DataFrames into a single DataFrame
results_df_val = pd.concat(results_dfs)


# Reset index and rename the column as "Models_name"
results_df_val.reset_index(inplace=True)
results_df_val.rename(columns={'index': 'Models_name'}, inplace=True)


# Print the combined results DataFrame
print('results_df_val :')
print()
print(results_df_val)
print()


# Create a new variabe 

results_df_val1 = results_df_val
#print(results_df_val1)


# creation of the Summary data frame [Method-1]

# value_counts provides more flexibility and allows additional operations such as renaming columns and resetting the index.
new_df_summary = results_df_val1.groupby(['Models_name', 'y_val'])['Prediction'].value_counts().unstack(fill_value=0)

# Rename the columns
#new_df_summary.rename(columns={'No': 'y_val_pred_NO', 'Yes': 'y_val_pred_Yes'}, inplace=True)

# Reset the index
#new_df_summary.reset_index(inplace=True)

# Print the data frame
print('new_df_summary : ')
print()
print(new_df_summary)
print()

## Grouping for No/Yes Summing
# Group by Models_name and sum the values in No and Yes columns
new_df_grouped = new_df_summary.groupby('Models_name')['No', 'Yes'].sum()

# Rename the columns
#new_df_grouped.rename(columns={' y_val_pred_NO': 'Sum_NO', 'y_val_pred_Yes': 'Sum_Yes'}, inplace=True)

# Reset the index to make 'Models_name' a regular column
#new_df_grouped.reset_index(inplace=True)

# Print the new DataFrame
print('new_df_grouped :')
print()
print(new_df_grouped)
print()

# # merging data frames
# final_results = pd.merge(new_df_summary, new_df_grouped, on='Models_name', how = 'inner')
# final_results


results_df_val :

                Models_name  y_val  y_val_pred Prediction
0    RandomForestClassifier      0           1         No
1    RandomForestClassifier      0           1         No
2    RandomForestClassifier      0           0        Yes
3    RandomForestClassifier      0           2         No
4    RandomForestClassifier      0           0        Yes
..                      ...    ...         ...        ...
645    ExtraTreesClassifier      2           2        Yes
646    ExtraTreesClassifier      2           2        Yes
647    ExtraTreesClassifier      2           2        Yes
648    ExtraTreesClassifier      2           2        Yes
649    ExtraTreesClassifier      2           2        Yes

[650 rows x 4 columns]

new_df_summary : 

Prediction                        No  Yes
Models_name                y_val         
AdaBoostClassifier         0       5    2
                           1      10    5
                           2       9   19
CatBoostClassifier         0    

In [128]:
# # Set the display option to show all rows
# pd.set_option('display.max_rows', None)

# # Print the DataFrame
# print(results_df_val)

# # Reset the display option to the default value
# pd.reset_option('display.max_rows')



<h2 style="color:blue;">......................Individual Machine Learning Models Building and Testing.........</h2>

<h2 style="color:red;"> Model-DummyClassifier : (strategy='most_frequent')  </h2>

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection 
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the DummyClassifier
dummy_model = DummyClassifier(strategy='most_frequent')

# Fit the model to your data
dummy_model.fit(X_train, y_train)

# Predict on the test set
y_pred = dummy_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        83
           1       0.00      0.00      0.00       143
           2       0.43      1.00      0.60       170

    accuracy                           0.43       396
   macro avg       0.14      0.33      0.20       396
weighted avg       0.18      0.43      0.26       396

Confusion Matrix:
[[  0   0  83]
 [  0   0 143]
 [  0   0 170]]


<h2 style="color:red;"> Model-DummyClassifier : strategies = ['stratified', 'most_frequent', 'prior', 'uniform'] </h2>

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Create an instance of the DummyClassifier with different strategies
strategies = ['stratified', 'most_frequent', 'prior', 'uniform']
for strategy in strategies:
    dummy_model = DummyClassifier(strategy=strategy)
    dummy_model.fit(X_train, y_train)
    y_pred = dummy_model.predict(X_test)
    print(f"Strategy: {strategy}")
    print(classification_report(y_test, y_pred))
    print()
    print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Strategy: stratified
              precision    recall  f1-score   support

           0       0.19      0.20      0.20        83
           1       0.33      0.31      0.32       143
           2       0.40      0.41      0.41       170

    accuracy                           0.33       396
   macro avg       0.31      0.31      0.31       396
weighted avg       0.33      0.33      0.33       396


Confusion Matrix:
Strategy: most_frequent
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        83
           1       0.00      0.00      0.00       143
           2       0.43      1.00      0.60       170

    accuracy                           0.43       396
   macro avg       0.14      0.33      0.20       396
weighted avg       0.18      0.43      0.26       396


Confusion Matrix:
Strategy: prior
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        83
           1       0.00      0

<h1 style="color:red;"> Model-1 (RandomForestClassifier) :  </h1>

In [18]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection 
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_RandomForestClassifier = RandomForestClassifier(random_state=42)
model_RandomForestClassifier .fit(X_train, y_train)

# Now you can use the model to predict on your test set
y_pred = model_RandomForestClassifier.predict(X_test)

#  assess the performance of your model  using classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)

              precision    recall  f1-score   support

           0       0.38      0.36      0.37        83
           1       0.46      0.45      0.46       143
           2       0.63      0.65      0.64       170

    accuracy                           0.52       396
   macro avg       0.49      0.49      0.49       396
weighted avg       0.51      0.52      0.52       396

Confusion Matrix:
[[ 30  32  21]
 [ 34  65  44]
 [ 15  45 110]]


<h3 style="color:navy;"> Prediction on the selected validation data (Model-RandomForestClassifier) :  </h3>

In [20]:
import pandas as pd
from sklearn.metrics import classification_report

# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_RandomForestClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)


              precision    recall  f1-score   support

           0       0.29      0.29      0.29         7
           1       0.33      0.33      0.33        15
           2       0.61      0.61      0.61        28

    accuracy                           0.48        50
   macro avg       0.41      0.41      0.41        50
weighted avg       0.48      0.48      0.48        50

      y_val  y_val_pred
2022      0           1
1678      0           1
1780      0           0
1633      0           2
1961      0           0
1874      0           2
1758      0           2
1204      1           2
1396      1           2
1432      1           2
911       1           1
1253      1           1
1601      1           0
943       1           2
987       1           2
1132      1           2
1365      1           0
1239      1           1
950       1           1
1274      1           2
941       1           2
1505      1           1
331       2           0
247       2           1
789       2        

<h3 style="color: blue;"> StratifiedKFold for cross-validation (Model-RandomForestClassifier) :  </h3>

In [22]:
from sklearn.model_selection import StratifiedKFold

# Define the number of folds for cross-validation
n_splits = 10

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=n_splits)

# Perform cross-validation
for train_index, test_index in skf.split(X, y):
    # Split the data into train and test sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the performance of the model
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-----------------------")
    

              precision    recall  f1-score   support

           0       0.35      0.21      0.26        42
           1       0.41      0.38      0.39        71
           2       0.58      0.72      0.64        85

    accuracy                           0.49       198
   macro avg       0.44      0.44      0.43       198
weighted avg       0.47      0.49      0.47       198

Confusion Matrix:
[[ 9 21 12]
 [11 27 33]
 [ 6 18 61]]
-----------------------
              precision    recall  f1-score   support

           0       0.38      0.31      0.34        42
           1       0.32      0.30      0.31        71
           2       0.58      0.67      0.62        85

    accuracy                           0.46       198
   macro avg       0.43      0.43      0.42       198
weighted avg       0.44      0.46      0.45       198

Confusion Matrix:
[[13 23  6]
 [15 21 35]
 [ 6 22 57]]
-----------------------
              precision    recall  f1-score   support

           0       0.47  

<h1 style="color:red;"> Model-2 : GradientBoostingClassifier  </h1>

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Gradient Boosting model
model_GradientBoostingClassifier = GradientBoostingClassifier(random_state=42)
model_GradientBoostingClassifier.fit(X_train, y_train)

# Now you can use the model to predict on your test set
y_pred = model_GradientBoostingClassifier.predict(X_test)

# Assess the performance of your model using classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)


              precision    recall  f1-score   support

           0       0.35      0.28      0.31        83
           1       0.44      0.41      0.43       143
           2       0.56      0.65      0.60       170

    accuracy                           0.49       396
   macro avg       0.45      0.45      0.45       396
weighted avg       0.48      0.49      0.48       396

Confusion Matrix:
[[ 23  28  32]
 [ 30  59  54]
 [ 12  47 111]]


# Prediction on the selected validation data 

In [25]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_GradientBoostingClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)



              precision    recall  f1-score   support

           0       0.25      0.29      0.27         7
           1       0.46      0.40      0.43        15
           2       0.66      0.68      0.67        28

    accuracy                           0.54        50
   macro avg       0.46      0.45      0.45        50
weighted avg       0.54      0.54      0.54        50

      y_val  y_val_pred
2022      0           2
1678      0           1
1780      0           0
1633      0           2
1961      0           0
1874      0           2
1758      0           1
1204      1           2
1396      1           2
1432      1           2
911       1           1
1253      1           1
1601      1           0
943       1           2
987       1           2
1132      1           2
1365      1           0
1239      1           1
950       1           1
1274      1           1
941       1           2
1505      1           1
331       2           0
247       2           1
789       2        

<h1 style="color:red;"> Model-3 : Support Vector Machine (SVM)   </h1>

In [26]:
from sklearn.svm import SVC

# Target and feature selection 
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM model
model_SVM = SVC(random_state=42)

# Fit the model to the training data
model_SVM.fit(X_train, y_train)

# Now you can use the model to predict on your test set
y_pred = model_SVM.predict(X_test)

#  Assess the performance of your model using classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)


              precision    recall  f1-score   support

           0       0.43      0.04      0.07        83
           1       0.00      0.00      0.00       143
           2       0.44      1.00      0.61       170

    accuracy                           0.44       396
   macro avg       0.29      0.35      0.22       396
weighted avg       0.28      0.44      0.28       396

Confusion Matrix:
[[  3   0  80]
 [  4   0 139]
 [  0   0 170]]


# Prediction on the selected validation data

In [27]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred =  model_SVM.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.00      0.00      0.00        15
           2       0.56      1.00      0.72        28

    accuracy                           0.56        50
   macro avg       0.19      0.33      0.24        50
weighted avg       0.31      0.56      0.40        50

      y_val  y_val_pred
2022      0           2
1678      0           2
1780      0           2
1633      0           2
1961      0           2
1874      0           2
1758      0           2
1204      1           2
1396      1           2
1432      1           2
911       1           2
1253      1           2
1601      1           2
943       1           2
987       1           2
1132      1           2
1365      1           2
1239      1           2
950       1           2
1274      1           2
941       1           2
1505      1           2
331       2           2
247       2           2
789       2        

<h1 style="color:red;"> Model-4 : KNeighborsClassifier  </h1>

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_KNeighborsClassifier = KNeighborsClassifier(n_neighbors=5)
model_KNeighborsClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_KNeighborsClassifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.27      0.34      0.30        83
           1       0.41      0.38      0.40       143
           2       0.51      0.48      0.50       170

    accuracy                           0.42       396
   macro avg       0.40      0.40      0.40       396
weighted avg       0.43      0.42      0.42       396

Confusion Matrix:
[[28 24 31]
 [41 55 47]
 [34 54 82]]


# Prediction on the selected validation data

In [29]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred =  model_KNeighborsClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)


              precision    recall  f1-score   support

           0       0.07      0.14      0.10         7
           1       0.33      0.27      0.30        15
           2       0.58      0.50      0.54        28

    accuracy                           0.38        50
   macro avg       0.33      0.30      0.31        50
weighted avg       0.44      0.38      0.40        50

      y_val  y_val_pred
2022      0           1
1678      0           1
1780      0           0
1633      0           2
1961      0           2
1874      0           2
1758      0           2
1204      1           2
1396      1           2
1432      1           2
911       1           2
1253      1           0
1601      1           0
943       1           2
987       1           2
1132      1           0
1365      1           0
1239      1           1
950       1           1
1274      1           0
941       1           1
1505      1           1
331       2           2
247       2           0
789       2        

<h1 style="color:red;"> Model-5 : DecisionTreeClassifier </h1>

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_DecisionTreeClassifier = DecisionTreeClassifier(random_state=42)
model_DecisionTreeClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_DecisionTreeClassifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.29      0.30      0.30        83
           1       0.41      0.43      0.42       143
           2       0.54      0.52      0.53       170

    accuracy                           0.44       396
   macro avg       0.42      0.42      0.42       396
weighted avg       0.44      0.44      0.44       396

Confusion Matrix:
[[25 30 28]
 [36 61 46]
 [24 58 88]]


# Prediction on the selected validation data

In [31]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_DecisionTreeClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.20      0.29      0.24         7
           1       0.35      0.40      0.38        15
           2       0.65      0.54      0.59        28

    accuracy                           0.46        50
   macro avg       0.40      0.41      0.40        50
weighted avg       0.50      0.46      0.47        50

      y_val  y_val_pred
2022      0           1
1678      0           2
1780      0           1
1633      0           2
1961      0           0
1874      0           0
1758      0           2
1204      1           2
1396      1           2
1432      1           1
911       1           1
1253      1           1
1601      1           1
943       1           2
987       1           1
1132      1           1
1365      1           0
1239      1           2
950       1           2
1274      1           0
941       1           0
1505      1           0
331       2           1
247       2           0
789       2        

<h1 style="color:red;"> Model-6 : LogisticRegression  </h1>

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_LogisticRegression = LogisticRegression(random_state=42)
model_LogisticRegression.fit(X_train, y_train)

# Predict on the test set
y_pred = model_LogisticRegression.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.33      0.01      0.02        83
           1       0.00      0.00      0.00       143
           2       0.43      0.99      0.60       170

    accuracy                           0.43       396
   macro avg       0.25      0.33      0.21       396
weighted avg       0.25      0.43      0.26       396

Confusion Matrix:
[[  1   0  82]
 [  0   0 143]
 [  2   0 168]]


# Prediction on the selected validation data

In [33]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_LogisticRegression.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.00      0.00      0.00        15
           2       0.56      1.00      0.72        28

    accuracy                           0.56        50
   macro avg       0.19      0.33      0.24        50
weighted avg       0.31      0.56      0.40        50

      y_val  y_val_pred
2022      0           2
1678      0           2
1780      0           2
1633      0           2
1961      0           2
1874      0           2
1758      0           2
1204      1           2
1396      1           2
1432      1           2
911       1           2
1253      1           2
1601      1           2
943       1           2
987       1           2
1132      1           2
1365      1           2
1239      1           2
950       1           2
1274      1           2
941       1           2
1505      1           2
331       2           2
247       2           2
789       2        

<h1 style="color:red;"> Model-7 : XGBoost (Extreme Gradient Boosting)  </h1>

In [34]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_XGBoost = xgb.XGBClassifier(random_state=42)
model_XGBoost.fit(X_train, y_train)

# Predict on the test set
y_pred = model_XGBoost.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.39      0.36      0.38        83
           1       0.47      0.47      0.47       143
           2       0.65      0.67      0.66       170

    accuracy                           0.53       396
   macro avg       0.50      0.50      0.50       396
weighted avg       0.53      0.53      0.53       396

Confusion Matrix:
[[ 30  33  20]
 [ 34  67  42]
 [ 13  43 114]]


# Prediction on the selected validation data

In [36]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_XGBoost.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.17      0.14      0.15         7
           1       0.32      0.40      0.35        15
           2       0.60      0.54      0.57        28

    accuracy                           0.44        50
   macro avg       0.36      0.36      0.36        50
weighted avg       0.45      0.44      0.44        50

      y_val  y_val_pred
2022      0           1
1678      0           1
1780      0           1
1633      0           2
1961      0           0
1874      0           2
1758      0           2
1204      1           2
1396      1           2
1432      1           2
911       1           1
1253      1           1
1601      1           0
943       1           2
987       1           2
1132      1           2
1365      1           0
1239      1           1
950       1           1
1274      1           1
941       1           2
1505      1           1
331       2           0
247       2           1
789       2        

<h1 style="color:red;"> Model-8 : AdaBoostClassifier </h1>

In [37]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_AdaBoostClassifier = AdaBoostClassifier(random_state=42)
model_AdaBoostClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_AdaBoostClassifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.35      0.27      0.30        83
           1       0.43      0.43      0.43       143
           2       0.53      0.60      0.57       170

    accuracy                           0.47       396
   macro avg       0.44      0.43      0.43       396
weighted avg       0.46      0.47      0.46       396

Confusion Matrix:
[[ 22  29  32]
 [ 25  61  57]
 [ 15  53 102]]


# Prediction on the selected validation data

In [38]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_AdaBoostClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.22      0.29      0.25         7
           1       0.45      0.33      0.38        15
           2       0.63      0.68      0.66        28

    accuracy                           0.52        50
   macro avg       0.44      0.43      0.43        50
weighted avg       0.52      0.52      0.52        50

      y_val  y_val_pred
2022      0           2
1678      0           2
1780      0           0
1633      0           2
1961      0           0
1874      0           1
1758      0           2
1204      1           2
1396      1           1
1432      1           1
911       1           0
1253      1           2
1601      1           1
943       1           0
987       1           2
1132      1           2
1365      1           0
1239      1           1
950       1           2
1274      1           1
941       1           2
1505      1           2
331       2           0
247       2           1
789       2        

<h1 style="color:red;"> Model-9 : Gaussian Naive Bayes </h1>

In [39]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_GaussianNB = GaussianNB()
model_GaussianNB.fit(X_train, y_train)

# Predict on the test set
y_pred = model_GaussianNB.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.24      0.06      0.10        83
           1       0.36      0.88      0.52       143
           2       0.38      0.06      0.11       170

    accuracy                           0.36       396
   macro avg       0.33      0.34      0.24       396
weighted avg       0.34      0.36      0.25       396

Confusion Matrix:
[[  5  69   9]
 [  8 126   9]
 [  8 151  11]]


# Prediction on the selected validation data

In [41]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_GaussianNB.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.28      0.87      0.43        15
           2       0.67      0.07      0.13        28

    accuracy                           0.30        50
   macro avg       0.32      0.31      0.19        50
weighted avg       0.46      0.30      0.20        50

      y_val  y_val_pred
2022      0           1
1678      0           1
1780      0           1
1633      0           1
1961      0           1
1874      0           1
1758      0           1
1204      1           1
1396      1           1
1432      1           2
911       1           1
1253      1           1
1601      1           1
943       1           1
987       1           1
1132      1           1
1365      1           1
1239      1           1
950       1           1
1274      1           1
941       1           1
1505      1           0
331       2           2
247       2           1
789       2        

<h1 style="color:red;"> Model-10 : MLPClassifier </h1>

In [409]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model_MLPClassifier = MLPClassifier(random_state=42)
model_MLPClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_MLPClassifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.30      0.11      0.16        83
           1       0.41      0.17      0.25       143
           2       0.44      0.79      0.57       170

    accuracy                           0.43       396
   macro avg       0.38      0.36      0.32       396
weighted avg       0.40      0.43      0.37       396

Confusion Matrix:
[[  9   9  65]
 [ 13  25 105]
 [  8  27 135]]


# Prediction on the selected validation data

In [410]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_MLPClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.40      0.13      0.20        15
           2       0.56      0.86      0.68        28

    accuracy                           0.52        50
   macro avg       0.32      0.33      0.29        50
weighted avg       0.43      0.52      0.44        50

      y_val  y_val_pred
2022      0           1
1678      0           2
1780      0           2
1633      0           2
1961      0           2
1874      0           2
1758      0           2
1204      1           2
1396      1           2
1432      1           2
911       1           2
1253      1           2
1601      1           2
943       1           2
987       1           2
1132      1           2
1365      1           2
1239      1           2
950       1           1
1274      1           2
941       1           2
1505      1           1
331       2           0
247       2           2
789       2        

<h2 style="color:red;"> Model-11 : LightGBM Classifier: LightGBM is a gradient boosting framework that uses tree-based learning algorithms. </h2>

In [44]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_LGBMClassifier = LGBMClassifier(random_state=42)
model_LGBMClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_LGBMClassifier.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.39      0.37      0.38        83
           1       0.44      0.41      0.42       143
           2       0.62      0.66      0.64       170

    accuracy                           0.51       396
   macro avg       0.48      0.48      0.48       396
weighted avg       0.51      0.51      0.51       396

Confusion Matrix:
[[ 31  33  19]
 [ 35  59  49]
 [ 14  43 113]]


# Prediction on the selected validation data

In [45]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_LGBMClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.17      0.14      0.15         7
           1       0.37      0.47      0.41        15
           2       0.64      0.57      0.60        28

    accuracy                           0.48        50
   macro avg       0.39      0.39      0.39        50
weighted avg       0.49      0.48      0.48        50

      y_val  y_val_pred
2022      0           1
1678      0           1
1780      0           1
1633      0           2
1961      0           0
1874      0           2
1758      0           2
1204      1           2
1396      1           2
1432      1           2
911       1           1
1253      1           1
1601      1           0
943       1           2
987       1           2
1132      1           2
1365      1           0
1239      1           1
950       1           1
1274      1           1
941       1           1
1505      1           1
331       2           0
247       2           1
789       2        

<h2 style="color:red;"> Model-12 : CatBoost Classifier: CatBoost is a gradient boosting algorithm that handles categorical features efficiently. </h2>

In [46]:
#!pip install catboost
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_CatBoostClassifier = CatBoostClassifier(random_state=42)
model_CatBoostClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_CatBoostClassifier.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Learning rate set to 0.081005
0:	learn: 1.0861618	total: 175ms	remaining: 2m 54s
1:	learn: 1.0764879	total: 210ms	remaining: 1m 45s
2:	learn: 1.0652338	total: 249ms	remaining: 1m 22s
3:	learn: 1.0569402	total: 284ms	remaining: 1m 10s
4:	learn: 1.0487255	total: 320ms	remaining: 1m 3s
5:	learn: 1.0400183	total: 356ms	remaining: 59s
6:	learn: 1.0345802	total: 392ms	remaining: 55.7s
7:	learn: 1.0304727	total: 428ms	remaining: 53.1s
8:	learn: 1.0231798	total: 465ms	remaining: 51.2s
9:	learn: 1.0192997	total: 499ms	remaining: 49.4s
10:	learn: 1.0155998	total: 535ms	remaining: 48.1s
11:	learn: 1.0092958	total: 569ms	remaining: 46.9s
12:	learn: 1.0042684	total: 604ms	remaining: 45.9s
13:	learn: 0.9983388	total: 639ms	remaining: 45s
14:	learn: 0.9927882	total: 676ms	remaining: 44.4s
15:	learn: 0.9897743	total: 710ms	remaining: 43.7s
16:	learn: 0.9853474	total: 744ms	remaining: 43s
17:	learn: 0.9828347	total: 779ms	remaining: 42.5s
18:	learn: 0.9804721	total: 813ms	remaining: 42s
19:	learn: 0.97

160:	learn: 0.7183129	total: 5.99s	remaining: 31.2s
161:	learn: 0.7170219	total: 6.02s	remaining: 31.2s
162:	learn: 0.7153932	total: 6.06s	remaining: 31.1s
163:	learn: 0.7147523	total: 6.09s	remaining: 31s
164:	learn: 0.7135832	total: 6.12s	remaining: 31s
165:	learn: 0.7129657	total: 6.16s	remaining: 30.9s
166:	learn: 0.7117811	total: 6.2s	remaining: 30.9s
167:	learn: 0.7103035	total: 6.23s	remaining: 30.9s
168:	learn: 0.7087288	total: 6.27s	remaining: 30.8s
169:	learn: 0.7067649	total: 6.31s	remaining: 30.8s
170:	learn: 0.7051721	total: 6.34s	remaining: 30.8s
171:	learn: 0.7042717	total: 6.38s	remaining: 30.7s
172:	learn: 0.7023815	total: 6.42s	remaining: 30.7s
173:	learn: 0.7011137	total: 6.45s	remaining: 30.6s
174:	learn: 0.6994115	total: 6.49s	remaining: 30.6s
175:	learn: 0.6984096	total: 6.53s	remaining: 30.6s
176:	learn: 0.6968316	total: 6.56s	remaining: 30.5s
177:	learn: 0.6953114	total: 6.6s	remaining: 30.5s
178:	learn: 0.6937538	total: 6.64s	remaining: 30.4s
179:	learn: 0.6920

320:	learn: 0.5409535	total: 11.6s	remaining: 24.5s
321:	learn: 0.5401629	total: 11.6s	remaining: 24.5s
322:	learn: 0.5386890	total: 11.6s	remaining: 24.4s
323:	learn: 0.5381836	total: 11.7s	remaining: 24.4s
324:	learn: 0.5374836	total: 11.7s	remaining: 24.3s
325:	learn: 0.5368025	total: 11.8s	remaining: 24.3s
326:	learn: 0.5356891	total: 11.8s	remaining: 24.3s
327:	learn: 0.5349100	total: 11.8s	remaining: 24.2s
328:	learn: 0.5343611	total: 11.9s	remaining: 24.2s
329:	learn: 0.5333845	total: 11.9s	remaining: 24.1s
330:	learn: 0.5327783	total: 11.9s	remaining: 24.1s
331:	learn: 0.5322757	total: 12s	remaining: 24.1s
332:	learn: 0.5311456	total: 12s	remaining: 24s
333:	learn: 0.5301208	total: 12s	remaining: 24s
334:	learn: 0.5291172	total: 12.1s	remaining: 24s
335:	learn: 0.5283950	total: 12.1s	remaining: 23.9s
336:	learn: 0.5274624	total: 12.1s	remaining: 23.9s
337:	learn: 0.5270191	total: 12.2s	remaining: 23.9s
338:	learn: 0.5255212	total: 12.2s	remaining: 23.8s
339:	learn: 0.5251478	to

484:	learn: 0.4162913	total: 17.9s	remaining: 19s
485:	learn: 0.4157114	total: 18s	remaining: 19s
486:	learn: 0.4150043	total: 18s	remaining: 19s
487:	learn: 0.4145747	total: 18s	remaining: 18.9s
488:	learn: 0.4141538	total: 18.1s	remaining: 18.9s
489:	learn: 0.4138340	total: 18.1s	remaining: 18.9s
490:	learn: 0.4133025	total: 18.2s	remaining: 18.8s
491:	learn: 0.4128393	total: 18.2s	remaining: 18.8s
492:	learn: 0.4121155	total: 18.2s	remaining: 18.7s
493:	learn: 0.4115168	total: 18.3s	remaining: 18.7s
494:	learn: 0.4112527	total: 18.3s	remaining: 18.7s
495:	learn: 0.4107155	total: 18.3s	remaining: 18.6s
496:	learn: 0.4099963	total: 18.4s	remaining: 18.6s
497:	learn: 0.4097265	total: 18.4s	remaining: 18.5s
498:	learn: 0.4094560	total: 18.4s	remaining: 18.5s
499:	learn: 0.4092081	total: 18.5s	remaining: 18.5s
500:	learn: 0.4081869	total: 18.5s	remaining: 18.4s
501:	learn: 0.4075945	total: 18.5s	remaining: 18.4s
502:	learn: 0.4070388	total: 18.6s	remaining: 18.3s
503:	learn: 0.4067943	to

648:	learn: 0.3389095	total: 24.3s	remaining: 13.1s
649:	learn: 0.3384103	total: 24.3s	remaining: 13.1s
650:	learn: 0.3380769	total: 24.4s	remaining: 13.1s
651:	learn: 0.3379897	total: 24.4s	remaining: 13s
652:	learn: 0.3377802	total: 24.5s	remaining: 13s
653:	learn: 0.3371769	total: 24.5s	remaining: 13s
654:	learn: 0.3368958	total: 24.5s	remaining: 12.9s
655:	learn: 0.3364468	total: 24.6s	remaining: 12.9s
656:	learn: 0.3362238	total: 24.6s	remaining: 12.9s
657:	learn: 0.3361471	total: 24.7s	remaining: 12.8s
658:	learn: 0.3357532	total: 24.7s	remaining: 12.8s
659:	learn: 0.3352620	total: 24.7s	remaining: 12.7s
660:	learn: 0.3348582	total: 24.8s	remaining: 12.7s
661:	learn: 0.3345724	total: 24.8s	remaining: 12.7s
662:	learn: 0.3341576	total: 24.9s	remaining: 12.6s
663:	learn: 0.3340580	total: 24.9s	remaining: 12.6s
664:	learn: 0.3334460	total: 24.9s	remaining: 12.6s
665:	learn: 0.3330144	total: 25s	remaining: 12.5s
666:	learn: 0.3326675	total: 25s	remaining: 12.5s
667:	learn: 0.3325699	

811:	learn: 0.2849202	total: 30.4s	remaining: 7.04s
812:	learn: 0.2845629	total: 30.5s	remaining: 7s
813:	learn: 0.2844164	total: 30.5s	remaining: 6.97s
814:	learn: 0.2840921	total: 30.5s	remaining: 6.93s
815:	learn: 0.2837122	total: 30.6s	remaining: 6.9s
816:	learn: 0.2833338	total: 30.6s	remaining: 6.86s
817:	learn: 0.2831194	total: 30.7s	remaining: 6.83s
818:	learn: 0.2827167	total: 30.7s	remaining: 6.79s
819:	learn: 0.2823571	total: 30.8s	remaining: 6.75s
820:	learn: 0.2821000	total: 30.8s	remaining: 6.72s
821:	learn: 0.2816595	total: 30.9s	remaining: 6.68s
822:	learn: 0.2814081	total: 30.9s	remaining: 6.64s
823:	learn: 0.2808027	total: 30.9s	remaining: 6.61s
824:	learn: 0.2804732	total: 31s	remaining: 6.57s
825:	learn: 0.2800281	total: 31s	remaining: 6.53s
826:	learn: 0.2796664	total: 31.1s	remaining: 6.5s
827:	learn: 0.2792931	total: 31.1s	remaining: 6.46s
828:	learn: 0.2789629	total: 31.1s	remaining: 6.42s
829:	learn: 0.2788743	total: 31.2s	remaining: 6.39s
830:	learn: 0.2785204

973:	learn: 0.2423018	total: 36.7s	remaining: 980ms
974:	learn: 0.2419566	total: 36.7s	remaining: 942ms
975:	learn: 0.2416649	total: 36.8s	remaining: 904ms
976:	learn: 0.2415249	total: 36.8s	remaining: 867ms
977:	learn: 0.2413910	total: 36.9s	remaining: 829ms
978:	learn: 0.2413000	total: 36.9s	remaining: 791ms
979:	learn: 0.2411307	total: 36.9s	remaining: 754ms
980:	learn: 0.2408575	total: 37s	remaining: 716ms
981:	learn: 0.2404971	total: 37s	remaining: 678ms
982:	learn: 0.2402055	total: 37s	remaining: 640ms
983:	learn: 0.2400881	total: 37.1s	remaining: 603ms
984:	learn: 0.2399393	total: 37.1s	remaining: 565ms
985:	learn: 0.2396758	total: 37.1s	remaining: 527ms
986:	learn: 0.2391147	total: 37.2s	remaining: 490ms
987:	learn: 0.2388548	total: 37.2s	remaining: 452ms
988:	learn: 0.2384871	total: 37.3s	remaining: 414ms
989:	learn: 0.2382728	total: 37.3s	remaining: 377ms
990:	learn: 0.2379143	total: 37.3s	remaining: 339ms
991:	learn: 0.2376144	total: 37.4s	remaining: 301ms
992:	learn: 0.2371

# Prediction on the selected validation data

In [48]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report



# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_CatBoostClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
#results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})
results_df = pd.DataFrame({'y_val': y_val.values.ravel(), 'y_val_pred': y_val_pred.ravel()})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)


# Assess the performance of the model using classification report
print(classification_report(y_val.values.ravel(), y_val_pred.ravel()))



              precision    recall  f1-score   support

           0       0.17      0.14      0.15         7
           1       0.41      0.47      0.44        15
           2       0.67      0.64      0.65        28

    accuracy                           0.52        50
   macro avg       0.42      0.42      0.42        50
weighted avg       0.52      0.52      0.52        50

    y_val  y_val_pred
0       0           1
1       0           1
2       0           1
3       0           2
4       0           0
5       0           2
6       0           2
7       1           2
8       1           2
9       1           2
10      1           1
11      1           1
12      1           0
13      1           2
14      1           2
15      1           1
16      1           0
17      1           1
18      1           1
19      1           1
20      1           2
21      1           1
22      2           0
23      2           1
24      2           2
25      2           2
26      2           2
27 

<h2 style="color:red;"> Model-13 : LGBMClassifier </h2>

In [50]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_LGBMClassifier = LGBMClassifier(random_state=42)
model_LGBMClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_LGBMClassifier.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))





              precision    recall  f1-score   support

           0       0.39      0.37      0.38        83
           1       0.44      0.41      0.42       143
           2       0.62      0.66      0.64       170

    accuracy                           0.51       396
   macro avg       0.48      0.48      0.48       396
weighted avg       0.51      0.51      0.51       396

Confusion Matrix:
[[ 31  33  19]
 [ 35  59  49]
 [ 14  43 113]]


# Prediction on the selected validation data

In [51]:

# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, you can assess the performance
y_val_true = df2_validation['Target']

# Use the model to predict on the validation set
y_val_pred = model_LGBMClassifier.predict(X_val)


# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val_true.values.ravel(), 'y_val_pred': y_val_pred.ravel()})

# Print out the results dataframe
print(results_df)

# Assess the performance of the model on the validation set using classification report
print(classification_report(y_val_true.values.ravel(), y_val_pred.ravel()))


    y_val  y_val_pred
0       0           1
1       0           1
2       0           1
3       0           2
4       0           0
5       0           2
6       0           2
7       1           2
8       1           2
9       1           2
10      1           1
11      1           1
12      1           0
13      1           2
14      1           2
15      1           2
16      1           0
17      1           1
18      1           1
19      1           1
20      1           1
21      1           1
22      2           0
23      2           1
24      2           2
25      2           1
26      2           1
27      2           2
28      2           2
29      2           1
30      2           1
31      2           2
32      2           2
33      2           0
34      2           0
35      2           2
36      2           1
37      2           2
38      2           2
39      2           2
40      2           1
41      2           1
42      2           2
43      2           2
44      2 

<h2 style="color:red;"> Model-14 : ExtraTreesClassifier </h2>

In [411]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the ExtraTreesClassifier model
model_ExtraTreesClassifier = ExtraTreesClassifier(random_state=42)

# Train the model on the training set
model_ExtraTreesClassifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model_ExtraTreesClassifier.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.41      0.36      0.38        83
           1       0.46      0.45      0.46       143
           2       0.62      0.66      0.64       170

    accuracy                           0.52       396
   macro avg       0.50      0.49      0.49       396
weighted avg       0.52      0.52      0.52       396

              precision    recall  f1-score   support

           0       0.41      0.36      0.38        83
           1       0.46      0.45      0.46       143
           2       0.62      0.66      0.64       170

    accuracy                           0.52       396
   macro avg       0.50      0.49      0.49       396
weighted avg       0.52      0.52      0.52       396

Confusion Matrix:
[[ 30  33  20]
 [ 30  65  48]
 [ 14  44 112]]


In [54]:
# Define X_val
X_val = df2_validation.drop(columns='Target')

# If you have the true target values for your validation set, assign them to y_val
y_val = df2_validation['Target']

# Use the model to predict on validation set
y_val_pred = model_ExtraTreesClassifier.predict(X_val)

# Create a dataframe with 'y_val' and 'y_val_pred' as columns
results_df = pd.DataFrame({'y_val': y_val, 'y_val_pred': y_val_pred})

# Assess the performance if you have the true target values
print(classification_report(y_val, y_val_pred))

# Print out the results dataframe
print(results_df)

              precision    recall  f1-score   support

           0       0.25      0.29      0.27         7
           1       0.33      0.33      0.33        15
           2       0.63      0.61      0.62        28

    accuracy                           0.48        50
   macro avg       0.40      0.41      0.41        50
weighted avg       0.49      0.48      0.48        50

      y_val  y_val_pred
2022      0           2
1678      0           1
1780      0           0
1633      0           2
1961      0           0
1874      0           2
1758      0           1
1204      1           2
1396      1           2
1432      1           2
911       1           1
1253      1           1
1601      1           0
943       1           2
987       1           2
1132      1           2
1365      1           0
1239      1           1
950       1           1
1274      1           0
941       1           2
1505      1           1
331       2           0
247       2           1
789       2        

<h1 style="color:red;"> Automatic ML Model Building Packages </h1>

<h1 style = "color : blue"> LazyPredict

In [226]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier  # Import the required model class

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print model performance
print(models)

# Get the best performing model based on the accuracy metric
best_model_name = models.index[0]

# Instantiate the best model based on the selected name
best_model = RandomForestClassifier()  # Replace with the appropriate model class and its respective parameters

# Train the best model on the entire dataset
best_model.fit(X, y)

# Define X_val
X_val = df2_validation.drop(columns='Target')

# Use the model to predict on the validation set
y_val_pred = best_model.predict(X_val)

# If you have the true target values for your validation set, you can assess the performance
y_val_true = df2_validation['Target']

# Print classification report
print(classification_report(y_val_true, y_val_pred))


100%|██████████| 29/29 [00:20<00:00,  1.44it/s]


                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
ExtraTreesClassifier               0.52               0.50    None      0.52   
RandomForestClassifier             0.51               0.48    None      0.51   
LabelSpreading                     0.49               0.48    None      0.49   
LabelPropagation                   0.49               0.48    None      0.49   
XGBClassifier                      0.51               0.47    None      0.50   
LGBMClassifier                     0.50               0.47    None      0.50   
NuSVC                              0.47               0.46    None      0.47   
BaggingClassifier                  0.48               0.46    None      0.48   
KNeighborsClassifier               0.45               0.43    None      0.46   
AdaBoostClassifier                 0.47               0.43    None      0.46   
RidgeClassifier                    0.47 

In [230]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier  # Import the required model class

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print model performance
print(models)

# Get the best performing model based on the accuracy metric
best_model_name = models.index[0]

# Instantiate the best model based on the selected name
best_model = ExtraTreesClassifier()  # Replace with the appropriate model class and its respective parameters

# Train the best model on the entire dataset
best_model.fit(X, y)

# Define X_val
X_val = df2_validation.drop(columns='Target')

# Use the model to predict on the validation set
y_val_pred = best_model.predict(X_val)

# If you have the true target values for your validation set, you can assess the performance
y_val_true = df2_validation['Target']

# Print classification report
print(classification_report(y_val_true, y_val_pred))


100%|██████████| 29/29 [00:20<00:00,  1.43it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
ExtraTreesClassifier               0.52               0.50    None      0.52   
RandomForestClassifier             0.51               0.48    None      0.51   
LabelSpreading                     0.49               0.48    None      0.49   
LabelPropagation                   0.49               0.48    None      0.49   
XGBClassifier                      0.51               0.47    None      0.50   
LGBMClassifier                     0.50               0.47    None      0.50   
NuSVC                              0.47               0.46    None      0.47   
BaggingClassifier                  0.48               0.46    None      0.48   
KNeighborsClassifier               0.45               0.43    None      0.46   
AdaBoostClassifier                 0.47               0.43    None      0.46   
RidgeClassifier                    0.47 




NameError: name 'ExtraTreesClassifier' is not defined

# bagging, boosting, or stacking

<h1 style="color:blue;"> Model-14 : TPOT for automatic ML Model Building </h1>

In [247]:
#!pip install tpot
#!pip install -U scikit-learn
#!pip install tpot==0.11.7

In [246]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tpot import TPOTClassifier

# # Target and feature selection
# X = df1_model.drop(columns='Target')
# y = df1_model['Target']

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create a TPOTClassifier object
# tpot = TPOTClassifier(generations=10, population_size=50, verbosity=2, random_state=42)  # Adjust the parameters as per your requirement

# # Train the model
# tpot.fit(X_train, y_train)

# Predict on the test set
y_pred = tpot.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))


Optimization Progress:   0%|          | 0/550 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.46717645649482886

Generation 2 - Current best internal CV score: 0.47474943097871664

Generation 3 - Current best internal CV score: 0.47474943097871664

Generation 4 - Current best internal CV score: 0.47474943097871664

Generation 5 - Current best internal CV score: 0.47727907998243024

Generation 6 - Current best internal CV score: 0.47727907998243024

Generation 7 - Current best internal CV score: 0.47727907998243024

Generation 8 - Current best internal CV score: 0.47727907998243024

Generation 9 - Current best internal CV score: 0.47727907998243024

Generation 10 - Current best internal CV score: 0.47727907998243024

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=1.0, min_samples_leaf=3, min_samples_split=7, n_estimators=100)
              precision    recall  f1-score   support

           0       0.40      0.35      0.37        83
           1       0.45      0.42      0.43     

In [None]:

Generation 1 - Current best internal CV score: 0.46717645649482886

Generation 2 - Current best internal CV score: 0.47474943097871664

Generation 3 - Current best internal CV score: 0.47474943097871664

Generation 4 - Current best internal CV score: 0.47474943097871664

Generation 5 - Current best internal CV score: 0.47727907998243024

Generation 6 - Current best internal CV score: 0.47727907998243024

Generation 7 - Current best internal CV score: 0.47727907998243024

Generation 8 - Current best internal CV score: 0.47727907998243024

Generation 9 - Current best internal CV score: 0.47727907998243024

Generation 10 - Current best internal CV score: 0.47727907998243024

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=1.0, min_samples_leaf=3, min_samples_split=7, n_estimators=100)
              precision    recall  f1-score   support

           0       0.40      0.35      0.37        83
           1       0.45      0.42      0.43       143
           2       0.58      0.65      0.61       170

    accuracy                           0.50       396
   macro avg       0.48      0.47      0.47       396
weighted avg       0.49      0.50      0.50       396

<h1 style="color:blue;"> Model-14 : PyCaret </h1>

In [260]:
!pip install pycaret[full]

Defaulting to user installation because normal site-packages is not writeable
Collecting evidently<0.3,>=0.1.45.dev0
  Downloading evidently-0.2.8-py3-none-any.whl (12.1 MB)
     --------------------------------------- 12.1/12.1 MB 18.2 MB/s eta 0:00:00
Collecting tune-sklearn>=0.2.1
  Downloading tune_sklearn-0.4.6-py3-none-any.whl (41 kB)
     ---------------------------------------- 41.9/41.9 kB ? eta 0:00:00
Collecting scikit-learn-intelex>=2023.0.1
  Downloading scikit_learn_intelex-2023.2.0-py39-none-win_amd64.whl (134 kB)
     ---------------------------------------- 134.5/134.5 kB ? eta 0:00:00
Collecting mlflow<2.0.0,>=1.24.0
  Downloading mlflow-1.30.1-py3-none-any.whl (17.0 MB)
     --------------------------------------- 17.0/17.0 MB 21.8 MB/s eta 0:00:00
Collecting explainerdashboard>=0.3.8
  Downloading explainerdashboard-0.4.2.2-py3-none-any.whl (286 kB)
     ------------------------------------- 286.9/286.9 kB 17.3 MB/s eta 0:00:00
Collecting boto3>=1.24.56
  Downloadin

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.2.2 requires pyqt5<5.13, which is not installed.
spyder 5.2.2 requires pyqtwebengine<5.13, which is not installed.
tensorflow-intel 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.21.6 which is incompatible.


In [259]:
import pandas as pd
from pycaret.classification import *

# Target and feature selection
X = df1_model.drop(columns='Target')
y = df1_model['Target']

# Setup PyCaret environment
setup(data=X, target=y)

try:
    # Compare and select the best model
    best_model = compare_models()

    # Train the best model
    trained_model = tune_model(best_model)

    # Validate the model
    predictions = predict_model(trained_model)

    # Evaluate the model
    evaluate_model(trained_model)

    # Save the model
    save_model(trained_model, 'saved_model')

except ValueError as ve:
    print("An error occurred:", ve)
    print("Please check if the dataset has enough samples or features.")


Unnamed: 0,Description,Value
0,Session id,6333
1,Target,Target
2,Target type,Multiclass
3,Original data shape,"(1980, 200)"
4,Transformed data shape,"(1980, 200)"
5,Transformed train set shape,"(1386, 200)"
6,Transformed test set shape,"(594, 200)"
7,Numeric features,199
8,Preprocess,True
9,Imputation type,simple


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

An error occurred: Estimator [] does not have the required fit() method.
Please check if the dataset has enough samples or features.


<h1 style="color:blue;"> .........................APPENDIX............................ </h1>

<h2 style="color:red;"> Combined Model validation </h2>
<ul style="color:blue;"> 
    <li>Prediction on the selected validation data for all Models </LI>
   </ul>

In [None]:
#...................................................................................................
#...................................................................................................

# below mentioned two setp process (creation of the Summary data frame and Grouping for No/Yes Summing ) 
# can be done by this one step code chunk 

unique_models = results_df_val1['Models_name'].unique()
new_df_summary = pd.DataFrame()

for model in unique_models:
    model_data = results_df_val1[results_df_val1['Models_name'] == model]
    model_summary = model_data['Prediction'].value_counts().reset_index()
    model_summary.columns = ['Prediction', 'Count']
    model_summary['Models_name'] = model
    new_df_summary = pd.concat([new_df_summary, model_summary])

new_df_summary = new_df_summary.pivot(index='Models_name', columns='Prediction', values='Count').fillna(0).reset_index()
new_df_summary

#............................................................................................................
#............................................................................................................


#creation of the Summary data frame [Method-1]

# value_counts provides more flexibility and allows additional operations such as renaming columns and resetting the index.
new_df_summary = results_df_val1.groupby(['Models_name', 'y_val'])['Prediction'].value_counts().unstack(fill_value=0)

# Rename the columns
#new_df_summary.rename(columns={'No': 'y_val_pred_NO', 'Yes': 'y_val_pred_Yes'}, inplace=True)

# Reset the index
new_df_summary.reset_index(inplace=True)

# Print the data frame
print(new_df_summary)

# .............................................................................................
# creation of the Summary1 data frame [Method-2]

# # Rename the columns
# #new_df_summary1.rename(columns={'No': 'y_val_pred_NO', 'Yes': 'y_val_pred_Yes'}, inplace=True)

# # Group by Models, y_val, and comment and count the occurrences

# new_df_summary1 = results_df_val1.groupby(['Models_name', 'y_val', 'Prediction']).size().unstack(fill_value=0)

# # Reset the index
# new_df_summary1.reset_index(inplace=True)

# # print
# new_df_summary1 
#..............................................................................................

# creation of the Summary1 data frame [Method-3]
# # alternative code
# new_df_summary = results_df_val1.pivot_table(index=['Models_name', 'y_val'], columns='Prediction', aggfunc='size', fill_value=0).reset_index()
# #new_df_summary

# creation of the Summary1 data frame [Method-4]
# # another alternative 
# new_df_summary = pd.crosstab(index=[results_df_val1['Models_name'], results_df_val1['y_val']], columns=results_df_val1['Prediction']).reset_index()
# #new_df_summary
#.............................................................................................................

## Grouping for No/Yes Summing
# Group by Models_name and sum the values in No and Yes columns
new_df_grouped = new_df_summary.groupby('Models_name')['No', 'Yes'].sum()

# Rename the columns
#new_df_grouped.rename(columns={' y_val_pred_NO': 'Sum_NO', 'y_val_pred_Yes': 'Sum_Yes'}, inplace=True)

# Reset the index to make 'Models_name' a regular column
#new_df_grouped.reset_index(inplace=True)

# Print the new DataFrame
print(new_df_grouped)


# Combined Model validation code...........................................................................
#..........................................................................................................

# # Simplified Format
#...........................................................................................................

# # Define X_val and y_val
# X_val = df2_validation.drop(columns='Target')
# y_val = df2_validation['Target']

# # List of models
# models = [
#     RandomForestClassifier(random_state=42),
#     GradientBoostingClassifier(random_state=42),
#     SVC(random_state=42),
#     KNeighborsClassifier(n_neighbors=5),
#     DecisionTreeClassifier(random_state=42),
#     LogisticRegression(random_state=42),
#     xgb.XGBClassifier(random_state=42),
#     AdaBoostClassifier(random_state=42),
#     GaussianNB(),
#     MLPClassifier(random_state=42),
#     LGBMClassifier(random_state=42),
#     CatBoostClassifier(random_state=42, verbose=False),
#     ExtraTreesClassifier(random_state=42)
# ]

# # Initialize an empty DataFrame for storing the results
# results_dfs = []

# # Iterate over the models
# for model in models:
#     # Train the model and predict on the validation set
#     model.fit(X_train, y_train)
#     y_val_pred = model.predict(X_val)

#     # Create a dataframe with 'y_val', 'y_val_pred', and 'Prediction' as columns
#     model_results = pd.DataFrame({
#         'Models_name': [type(model).__name__] * len(y_val),  # model name
#         'y_val': y_val,  # true labels
#         'Prediction': ['No' if y_true != y_pred else 'Yes' for y_true, y_pred in zip(y_val, y_val_pred)]  # predictions
#     })

#     # Append the results DataFrame to the list
#     results_dfs.append(model_results)
    
# # Concatenate the results DataFrames into a single DataFrame
# results_df_val = pd.concat(results_dfs)

# # Create summary DataFrame
# new_df_summary = results_df_val.groupby(['Models_name', 'y_val'])['Prediction'].value_counts().unstack(fill_value=0).reset_index()

# # Grouping for No/Yes Summing
# new_df_grouped = new_df_summary.groupby('Models_name').sum()

# # Print the data frames
# print(results_df_val)
# print(new_df_summary)
# print(new_df_grouped)