In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import PredefinedSplit
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


## Q1: Tree Classifiers

#### Extracting data into dataframes

In [3]:
noClauses = [300,500,1000,1500,1800]
noSamples = [100,1000,5000]

trainSampleNames = []
trainData = {}

validationSampleNames = []
validationData = {}

testSampleNames = []
testData = {}

for i in range (len(noClauses)):
    for j in range (len(noSamples)):
        tempName = "train_c"+str(noClauses[i])+"_d"+str(noSamples[j])
        trainSampleNames.append(tempName)
        trainData[tempName] = pd.read_csv("hw3_part1_data/all_data/"+tempName+".csv",sep=",",header=None)

        tempName = "test_c"+str(noClauses[i])+"_d"+str(noSamples[j])
        testSampleNames.append(tempName)
        testData[tempName] = pd.read_csv("hw3_part1_data/all_data/"+tempName+".csv",sep=",",header=None)

        tempName = "valid_c"+str(noClauses[i])+"_d"+str(noSamples[j])
        validationSampleNames.append(tempName)
        validationData[tempName] = pd.read_csv("hw3_part1_data/all_data/"+tempName+".csv",sep=",",header=None)




#### Using the validation set to tune the parameters.
#### We will be using the predefined split package to use Training data to train the model and use the validation set to validate and tune the parameters

In [190]:

tree = DecisionTreeClassifier(random_state=0)
grid = [{'criterion': ['gini', 'entropy'],
         'max_depth': [2, 5, 9, None],
         'min_samples_split': [2, 7, 10 , 20, 30, 50],
         'min_samples_leaf': [1, 8, 15],
         'max_features' : [5, 9 , None]
         }]

bestParameters = {}
results = {}

for i in trainSampleNames:
    
    # -1 - train
    # 0 - validate
    temp1 = np.full( (trainData[i]).shape[0] , -1 )
    temp2 = np.full( ((validationData[validationSampleNames[trainSampleNames.index(i)]])).shape[0] , 0)

    split_index =   np.concatenate((temp1, temp2), axis=0)
    # print(split_index)
    pds = PredefinedSplit(test_fold = split_index)

    TrainDataset = (trainData[i]).append((validationData[validationSampleNames[trainSampleNames.index(i)]]) , ignore_index = True)
    # print(TrainDataset)

    print("Training - ", i)

    GS = GridSearchCV(estimator = tree, param_grid = grid[0], scoring = 'accuracy', cv = pds)
    GS.fit( TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1] )
    print("Best Score achieved : {:.4f}".format(GS.best_score_))
    print("Parameters that give the best results : ",(GS.best_params_))
    bestParameters[i] = GS.best_params_
    
    # Relearning using the best parameters found above to find the accuracy and F1 score on test dataset

    testTree = DecisionTreeClassifier(criterion = (bestParameters[i])['criterion'] , max_depth= (bestParameters[i])['max_depth'], max_features= (bestParameters[i])['max_features'], min_samples_leaf= (bestParameters[i])['min_samples_leaf'], min_samples_split= (bestParameters[i])['min_samples_split'])
    testTree.fit(TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1])
    predictions = testTree.predict((testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,:500])
    results[i] = classification_report( (testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,-1] , predictions)

    print("resultsssss \n",results[i])

Training -  train_c300_d100
Best Score achieved : 0.6650
Parameters that give the best results :  {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 20}
resultsssss 
               precision    recall  f1-score   support

           0       0.63      0.57      0.60       100
           1       0.61      0.67      0.64       100

    accuracy                           0.62       200
   macro avg       0.62      0.62      0.62       200
weighted avg       0.62      0.62      0.62       200

Training -  train_c300_d1000
Best Score achieved : 0.6520
Parameters that give the best results :  {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
resultsssss 
               precision    recall  f1-score   support

           0       0.73      0.55      0.62      1000
           1       0.64      0.80      0.71      1000

    accuracy                           0.67      2000
   macro avg   

In [6]:

bagging = BaggingClassifier(DecisionTreeClassifier())
grid = [{'n_estimators': [2, 40, 60],
         'bootstrap': [True, False],
         'bootstrap_features': [True, False],
         'warm_start' : [True, False]
         }]

bestParametersBagging = {}
resultsBagging = {}

for i in trainSampleNames:
    
    # -1 - train
    # 0 - validate
    temp1 = np.full( (trainData[i]).shape[0] , -1 )
    temp2 = np.full( ((validationData[validationSampleNames[trainSampleNames.index(i)]])).shape[0] , 0)

    split_index =   np.concatenate((temp1, temp2), axis=0)
    # print(split_index)
    pds = PredefinedSplit(test_fold = split_index)

    TrainDataset = (trainData[i]).append((validationData[validationSampleNames[trainSampleNames.index(i)]]) , ignore_index = True)
    # print(TrainDataset)

    print("Training - ", i)

    gridSearchCv = GridSearchCV(estimator = BaggingClassifier(), param_grid = grid[0], scoring = 'accuracy', cv = pds)
    # print(gridSearchCv)
    gridSearchCv.fit( TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1] )
    print("Best Score achieved : {:.4f}".format(gridSearchCv.best_score_))
    print("Parameters that give the best results : ",(gridSearchCv.best_params_))
    bestParametersBagging[i] = gridSearchCv.best_params_
    
    # Relearning using the best parameters found above to find the accuracy and F1 score on test dataset

    testBagging = BaggingClassifier(base_estimator = DecisionTreeClassifier(), n_estimators= (bestParametersBagging[i])['n_estimators'], bootstrap= (bestParametersBagging[i])['bootstrap'], bootstrap_features= (bestParametersBagging[i])['bootstrap_features'] , warm_start = (bestParametersBagging[i])['warm_start'] )
    testBagging.fit(TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1])
    predictions = testBagging.predict((testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,:500])
    resultsBagging[i] = classification_report( (testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,-1] , predictions)

    print("resultsssss \n",resultsBagging[i])

Training -  train_c300_d100
Best Score achieved : 0.7250
Parameters that give the best results :  {'bootstrap': False, 'bootstrap_features': True, 'n_estimators': 40, 'warm_start': False}
resultsssss 
               precision    recall  f1-score   support

           0       0.77      0.70      0.73       100
           1       0.72      0.79      0.76       100

    accuracy                           0.74       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.75      0.74      0.74       200

Training -  train_c300_d1000
Best Score achieved : 0.8635
Parameters that give the best results :  {'bootstrap': True, 'bootstrap_features': False, 'n_estimators': 60, 'warm_start': True}
resultsssss 
               precision    recall  f1-score   support

           0       0.89      0.85      0.87      1000
           1       0.86      0.89      0.87      1000

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      20

In [10]:

randomForest = RandomForestClassifier()

grid = [{'n_estimators' :[10, 50, 100],
         'criterion': ['gini', 'entropy'],
         'max_depth': [5, 10,  None],
         'min_samples_split': [2, 10 , 20],
         'bootstrap': [True, False],
         'warm_start' : [True, False]
         }]

bestParametersrandomForest = {}
resultsrandomForest = {}

for i in trainSampleNames:
    
    # -1 - train
    # 0 - validate
    temp1 = np.full( (trainData[i]).shape[0] , -1 )
    temp2 = np.full( ((validationData[validationSampleNames[trainSampleNames.index(i)]])).shape[0] , 0)

    split_index =   np.concatenate((temp1, temp2), axis=0)
    # print(split_index)
    pds = PredefinedSplit(test_fold = split_index)

    TrainDataset = (trainData[i]).append((validationData[validationSampleNames[trainSampleNames.index(i)]]) , ignore_index = True)
    # print(TrainDataset)

    print("Training - ", i)

    gridSearchCv = GridSearchCV(estimator = RandomForestClassifier(), param_grid = grid[0], scoring = 'accuracy', cv = pds)
    # print(gridSearchCv)
    gridSearchCv.fit( TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1] )
    print("Best Score achieved : {:.4f}".format(gridSearchCv.best_score_))
    print("Parameters that give the best results : ",(gridSearchCv.best_params_))
    bestParametersrandomForest[i] = gridSearchCv.best_params_
    
    # Relearning using the best parameters found above to find the accuracy and F1 score on test dataset

    testrandomForest = RandomForestClassifier(n_estimators= (bestParametersrandomForest[i])['n_estimators'], criterion= (bestParametersrandomForest[i])['criterion'], max_depth= (bestParametersrandomForest[i])['max_depth'], min_samples_split= (bestParametersrandomForest[i])['min_samples_split'] , bootstrap = (bestParametersrandomForest[i])['bootstrap'], warm_start = (bestParametersrandomForest[i])['warm_start'] )
    testrandomForest.fit(TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1])
    predictions = testrandomForest.predict((testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,:500])
    resultsrandomForest[i] = classification_report( (testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,-1] , predictions)

    print("resultsssss \n",resultsrandomForest[i])

Training -  train_c300_d100
Best Score achieved : 0.8050
Parameters that give the best results :  {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 20, 'n_estimators': 100, 'warm_start': False}
resultsssss 
               precision    recall  f1-score   support

           0       0.84      0.77      0.80       100
           1       0.79      0.85      0.82       100

    accuracy                           0.81       200
   macro avg       0.81      0.81      0.81       200
weighted avg       0.81      0.81      0.81       200

Training -  train_c300_d1000
Best Score achieved : 0.8775
Parameters that give the best results :  {'bootstrap': True, 'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 100, 'warm_start': True}
resultsssss 
               precision    recall  f1-score   support

           0       0.87      0.86      0.86      1000
           1       0.86      0.87      0.86      1000

    accuracy                       

In [4]:

gradientBoosting = GradientBoostingClassifier()

grid = [{'n_estimators' :[10, 50, 100],
         'criterion': ['friedman_mse','mse'],
         'max_depth': [3, 10,  None],
         'min_samples_split': [2, 10 , 20],
         'min_samples_leaf' : [1,5,0.5],
         'max_features' : ['sqrt','log2'],
         'learning_rate' : [1.0,0.075,0.5]
         }]

bestParametersgradientBoosting = {}
resultsgradientBoosting = {}

for i in trainSampleNames:
    
    # -1 - train
    # 0 - validate
    temp1 = np.full( (trainData[i]).shape[0] , -1 )
    temp2 = np.full( ((validationData[validationSampleNames[trainSampleNames.index(i)]])).shape[0] , 0)

    split_index =   np.concatenate((temp1, temp2), axis=0)
    # print(split_index)
    pds = PredefinedSplit(test_fold = split_index)

    TrainDataset = (trainData[i]).append((validationData[validationSampleNames[trainSampleNames.index(i)]]) , ignore_index = True)
    # print(TrainDataset)

    print("Training - ", i)

    gridSearchCv = GridSearchCV(GradientBoostingClassifier(), param_grid = grid[0], scoring = 'accuracy', cv = pds)
    # print(gridSearchCv)
    gridSearchCv.fit( TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1] )
    print("Best Score achieved : {:.4f}".format(gridSearchCv.best_score_))
    print("Parameters that give the best results : ",(gridSearchCv.best_params_))
    bestParametersgradientBoosting[i] = gridSearchCv.best_params_
    
    # Relearning using the best parameters found above to find the accuracy and F1 score on test dataset

    testgradientBoosting = GradientBoostingClassifier(n_estimators= (bestParametersgradientBoosting[i])['n_estimators'], criterion= (bestParametersgradientBoosting[i])['criterion'], max_depth= (bestParametersgradientBoosting[i])['max_depth'], min_samples_split= (bestParametersgradientBoosting[i])['min_samples_split'] , min_samples_leaf = (bestParametersgradientBoosting[i])['min_samples_leaf'],max_features = (bestParametersgradientBoosting[i])['max_features'] )
    testgradientBoosting.fit(TrainDataset.iloc[:,:500], TrainDataset.iloc[:,-1])
    predictions = testgradientBoosting.predict((testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,:500])
    resultsgradientBoosting[i] = classification_report( (testData[testSampleNames[trainSampleNames.index(i)]]).iloc[:,-1] , predictions)

    print("resultsssss \n",resultsgradientBoosting[i])

Training -  train_c300_d100
Best Score achieved : 0.7900
Parameters that give the best results :  {'criterion': 'mse', 'learning_rate': 0.5, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
resultsssss 
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       100
           1       0.83      0.81      0.82       100

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200

Training -  train_c300_d1000
Best Score achieved : 0.8910
Parameters that give the best results :  {'criterion': 'friedman_mse', 'learning_rate': 0.5, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 20, 'n_estimators': 100}
resultsssss 
               precision    recall  f1-score   support

           0       0.87      0.86      0.86      1000
           1       0.86  