In [2]:
import pandas as pd
import warnings 
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import gc 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
def read_file(path):
    df=pd.read_csv(path)
    return(df)

In [4]:
classifiers = [DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(10),
    GaussianNB(),
    LogisticRegression()]

def features(name,model,X_train):
    if name in ["DecisionTreeClassifier","RandomForestClassifier"]:
            importances = model.feature_importances_
            sorted_feature_importance = sorted(zip(importances, list(X_train)), reverse=True)
            sorted_imp=sorted_feature_importance[:5]
            return sorted_imp
    else:
        return 'NA'
        
def classifier(df):
    
    y_true = df.IsRenewal
    X = df.drop('IsRenewal', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y_true, test_size=0.3, random_state=0)
    
    classifier_results=dict()
    sorted_imp=[]
    scored_df =[]
 
    for clf in classifiers:
        clf = clf.fit(X_train, y_train)
        name = clf.__class__.__name__
        
        sorted_imp = features(name,clf,X_train)
        
        
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        pre = precision_score(y_test,train_predictions, average='weighted')
        recall=recall_score(y_test,train_predictions, average='weighted')
        f1=f1_score(y_test, train_predictions, labels=None, pos_label=1, average='weighted', sample_weight=None)
        
        classifier_results[name]=[sorted_imp,acc*100,pre*100,recall*100,f1*100]
        score_df = pd.DataFrame(data=classifier_results, index=['top_5_features','Accuracy','Precision','Recall','F1 Score'])
        score_df=score_df.T
        score_df.index.name = 'Classifiers'   
        score_df.reset_index(level=0, inplace=True)
        score_df.sort_values('Accuracy',ascending= False,inplace= True)     
    
    return score_df 

        

In [5]:
df= read_file('./modeldata.csv')

In [6]:
df.columns

Index(['Unnamed: 0', 'IsSurplusLines', 'IsAgencyBilled', 'ProducerHoldFlag',
       'BinderExpectedFlag', 'IsRenewal', 'YearTerm', 'AppPolicy',
       'BindRequest', 'PriorClaim',
       ...
       'Westchester Fire Insurance Company',
       'Western Heritage Insurance Company', 'Western Surety Co.',
       'Western Surety Company', 'Western World', 'Western World Insurance Co',
       'Zenith Insurance Company', 'Zurich', 'Zurich Insurance Company',
       'Zurich Small Business'],
      dtype='object', length=351)

In [7]:
x=classifier(df)

In [8]:
x

Unnamed: 0,Classifiers,top_5_features,Accuracy,Precision,Recall,F1 Score
0,DecisionTreeClassifier,"[(0.44818077201498485, Unnamed: 0), (0.2114066...",94.4496,94.4493,94.4496,94.4494
1,RandomForestClassifier,"[(0.45246318900713495, Unnamed: 0), (0.1751316...",93.6126,93.5296,93.6126,93.5658
3,GaussianNB,,83.6182,86.3021,83.6182,76.1704
4,LogisticRegression,,83.6058,69.8993,83.6058,76.1406
2,KNeighborsClassifier,,83.0403,79.2148,83.0403,79.997
