In [3]:
# loading necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import scipy.stats as stat
import pylab
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [1]:
# Load the dataset

In [2]:
# Basic EDA
def EDA(data_frame):
    """Print a basic exploratory summary of ``data_frame`` to stdout.

    The report contains, in order: the first five rows, the shape, the
    ``info()`` listing, ``describe()`` for the default (numeric) columns and
    for object columns, the per-column null counts, the per-column null
    percentage (0-100) and the number of fully duplicated rows.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    separator = '****************************\n'

    # head() returns the first five *rows*, not columns.
    print('The 1st 5 rows are:\n', data_frame.head())
    print(separator)
    print('The shape of the dataset is:\n', data_frame.shape)
    print(separator)
    # info() writes straight to stdout and returns None, so it must be called
    # after the header instead of being passed to print() (which would emit
    # the listing before the header and then a stray "None").
    print('The information of the dataset is:')
    data_frame.info()
    print(separator)
    print('The description of the dataset is:\n', data_frame.describe())
    # describe(include='O') raises ValueError when the frame has no object
    # columns; report that instead of aborting the whole summary.
    try:
        object_summary = data_frame.describe(include='O')
    except ValueError:
        object_summary = 'No object columns to describe.'
    print('The description of the dataset is:\n', object_summary)
    print(separator)
    print('The number of null values is:\n', data_frame.isnull().sum())
    print(separator)
    # isnull().mean() is a 0-1 fraction; scale it so the label is accurate.
    print('The percentage of null values is:\n', data_frame.isnull().mean() * 100)
    print(separator)
    print('The number of duplicated rows in the dataset are:\n', data_frame.duplicated().sum())



In [3]:
# function for removing duplicated rows
def remove_duplicates(data_frame):
    """Return a copy of ``data_frame`` without duplicated rows.

    The first occurrence of each duplicated row is kept and the result gets a
    fresh 0..n-1 index. The input frame is left untouched.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame that may contain fully duplicated rows.

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame with a reset index.
    """
    # reset_index returns a new frame; its result has to be the one returned,
    # otherwise the gaps left by the dropped rows stay in the index.
    return data_frame.drop_duplicates().reset_index(drop=True)

In [4]:
# function for separating numerical and categorical values

def separate_numcat_feature(data_frame):
    """Split the column names of ``data_frame`` into numerical and categorical.

    A column is numerical when pandas reports a numeric dtype for it; every
    other column (object, string, category, datetime, ...) is categorical.
    Column order is preserved in both lists.

    Parameters
    ----------
    data_frame : pandas.DataFrame

    Returns
    -------
    tuple[list, list]
        ``(numerical_features, categorical_features)``.
    """
    numerical_features = []
    categorical_features = []
    # Iterating over dtypes (instead of data_frame[name].dtype) also works
    # when column labels are duplicated, and is_numeric_dtype avoids treating
    # category / string / datetime columns as numbers just because their
    # dtype is not object.
    for feature, dtype in data_frame.dtypes.items():
        if pd.api.types.is_numeric_dtype(dtype):
            numerical_features.append(feature)
        else:
            categorical_features.append(feature)
    return numerical_features, categorical_features

In [5]:
# pipeline for handling null values
# Single-step pipeline: replace NaN entries with the column median.
numeric_processor = Pipeline(
    steps=[
        (
            "imputation_median",
            SimpleImputer(missing_values=np.nan, strategy="median"),
        ),
    ]
)


In [5]:
# function for detecting and capping outliers (IQR method)

def handlingOutliers(new_df, features=None, whisker=1.5):
    """Cap outliers of the selected columns at the IQR fences.

    For every selected column the fences ``Q1 - whisker * IQR`` and
    ``Q3 + whisker * IQR`` are computed and values outside them are replaced
    by the nearest fence.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    features : iterable of column labels, optional
        Columns to cap. When omitted, a notebook-level ``numerical_features``
        variable is used if one exists (the original behaviour); otherwise
        all numeric columns of ``new_df`` are used.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The same ``new_df`` object, with the selected columns capped.
    """
    if features is None:
        try:
            # Legacy fallback: earlier revisions read this notebook global.
            features = numerical_features
        except NameError:
            features = new_df.select_dtypes(include='number').columns

    for feature in features:
        column = new_df[feature]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        # clip() caps both tails at once and upcasts integer columns when a
        # fractional fence has to be stored.
        new_df[feature] = column.clip(lower=q1 - whisker * iqr,
                                      upper=q3 + whisker * iqr)
    return new_df

In [8]:
# function for transformation
def transform(new_df, features=None):
    """Apply a Box-Cox transformation to the selected columns.

    Each selected column is replaced by the output of ``scipy.stats.boxcox``
    with the lambda that function estimates for the column; the lambda itself
    is discarded.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    features : iterable of column labels, optional
        Columns to transform. When omitted, a notebook-level
        ``numerical_features`` variable is used if one exists (the original
        behaviour); otherwise all numeric columns of ``new_df`` are used.

    Returns
    -------
    pandas.DataFrame
        The same ``new_df`` object, with the selected columns transformed.

    Raises
    ------
    ValueError
        If a selected column contains a value that is not strictly positive.
    """
    if features is None:
        try:
            # Legacy fallback: earlier revisions read this notebook global.
            features = numerical_features
        except NameError:
            features = new_df.select_dtypes(include='number').columns

    for feature in features:
        values = new_df[feature]
        # Box-Cox is only defined for strictly positive data; fail with the
        # column name so the offending feature is easy to find.
        if (values <= 0).any():
            raise ValueError(
                f"Box-Cox requires strictly positive data; column {feature!r} has values <= 0"
            )
        new_df[feature], _ = stat.boxcox(values)
    return new_df

In [20]:
#function for categorical encoding
def categoricalEncoding(data_frame,feature,method):
    """Encode one categorical column and return the encoded frame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source frame. It is not modified by either method.
    feature : column label
        Column to encode.
    method : str
        ``'oneHotEncoding'`` replaces the column with integer dummy columns
        via ``pandas.get_dummies``; ``'LabelEncoder'`` replaces the column
        values with the codes produced by ``LabelEncoder``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``feature`` encoded.

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported names.
    """
    if method=='oneHotEncoding':
        return pd.get_dummies(data_frame,columns=[feature],dtype=int)
    if method=='LabelEncoder':
        # Work on a copy so both methods leave the caller's frame untouched.
        encoded=data_frame.copy()
        encoded[feature]=LabelEncoder().fit_transform(encoded[feature])
        return encoded
    # Previously an unknown method fell through to an UnboundLocalError.
    raise ValueError(
        f"Unknown encoding method {method!r}; expected 'oneHotEncoding' or 'LabelEncoder'"
    )








In [4]:
# Min-max scaling followed by multinomial logistic regression.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases -
# confirm against the installed version before upgrading.
pipelineLr = Pipeline(
    steps=[
        ("scaler1", MinMaxScaler()),
        ("Lr_clf", LogisticRegression(multi_class='multinomial')),
    ]
)

In [5]:
# Min-max scaling followed by a decision-tree classifier.
pipeLineDt = Pipeline(
    steps=[
        ("scaler2", MinMaxScaler()),
        ("DT_clf", DecisionTreeClassifier()),
    ]
)

In [6]:
from sklearn import set_config

In [7]:
# Min-max scaling followed by a 100-tree random forest
# (despite the "Rnn" in the name, this is not a recurrent network).
pipelineRnn = Pipeline(
    steps=[
        ("scaler3", MinMaxScaler()),
        ("Rnn_clf", RandomForestClassifier(n_estimators=100)),
    ]
)

In [8]:
# Min-max scaling followed by an RBF-kernel support vector classifier.
# Pipeline steps must be estimator *instances*; passing the MinMaxScaler
# class itself makes the pipeline fail as soon as it is fitted.
pipeLineSvc=Pipeline(steps=[("scaler4",MinMaxScaler()),("svc_clf",SVC(kernel="rbf"))])

In [9]:
# Candidate pipelines, in the order used to index them later on.
pipeline = [
    pipelineLr,
    pipeLineDt,
    pipelineRnn,
    pipeLineSvc,
]

In [None]:
# Fit every candidate pipeline on the training split.
# NOTE(review): X_train / Y_train are not defined in the visible cells -
# presumably produced by a train_test_split cell; confirm it runs before this.
for pipe in pipeline:
    pipe.fit(X_train,Y_train)

In [11]:
# Running "best so far" state: best score, index of the winning pipeline in
# `pipeline`, and the winning pipeline object ("" until one is selected).
best_accuracy, best_classifier, best_pipeline = 0.0, 0, ""

In [None]:
# Report the test accuracy of every fitted pipeline.
# Iterating over enumerate() without unpacking yields (index, pipeline)
# tuples, which have no .score(); iterate over the pipelines directly and
# label each line with the class name of the pipeline's final estimator.
for model in pipeline:
    print(f"test accuracy for {type(model.steps[-1][1]).__name__}: {model.score(X_test,Y_test)}")



In [2]:
# Display names of the candidate classifiers, keyed by the string form of
# their position in the `pipeline` list.
pip_dict = {
    "0": "LogisticRegression",
    "1": "DecisiontreeClassifier",
    "2": "RandomForestClassifier",
    "3": "SupportvectorClassifier",
}

In [None]:
# Pick the pipeline with the highest test accuracy.
# The running state is (re)initialised here so the cell gives the same answer
# no matter how often, or in which order, it is executed.
best_accuracy=0.0
best_classifier=0
best_pipeline=""
for i,model in enumerate(pipeline):
    # Score once per pipeline instead of evaluating it twice.
    accuracy=model.score(X_test,Y_test)
    if accuracy > best_accuracy:
        best_accuracy=accuracy
        best_pipeline=model
        best_classifier=i
# pip_dict is keyed by strings ("0", "1", ...), so the integer index has to be
# converted before the lookup; indexing with the int raised KeyError.
print(f"classifier with best accuracy is {pip_dict[str(best_classifier)]}")