In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from joblib import dump,load
from sklearn import metrics

In [None]:
def read_csv(name,sep=","):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    name : path or file-like object handed to ``pandas.read_csv``.
    sep : field delimiter forwarded to ``pandas.read_csv`` (default ``","``).

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(name, sep=sep)

def read_from_file(typ_,name,sep=None):
    """Load a file according to its declared type.

    Only ``typ_ == "csv"`` is handled (delegated to ``read_csv``); any other
    type yields ``None``.

    Parameters
    ----------
    typ_ : file-type tag.
    name : file location passed through to the reader.
    sep : delimiter passed through to the reader.
    """
    return read_csv(name, sep) if typ_ == "csv" else None
    
def read_from_data(typ_,data):
    """Placeholder for loading from an in-memory object.

    Both arguments are currently ignored and ``None`` is always returned.
    """
    return None

def read_from_file_or_data(typ_,sect=None,name=None,data=None,sep=None):
    """Dispatch a read request to the file reader or the in-memory reader.

    Parameters
    ----------
    typ_ : type tag forwarded to the chosen reader.
    sect : ``"file"`` selects ``read_from_file``; anything else selects
        ``read_from_data``.
    name : file location (used only for ``sect == "file"``).
    data : in-memory object (used only when ``sect`` is not ``"file"``).
    sep : delimiter (used only for ``sect == "file"``).

    Returns
    -------
    Whatever the selected reader returns.
    """
    if sect != "file":
        return read_from_data(typ_, data)
    return read_from_file(typ_, name, sep)

In [None]:
def write_dataframe_csv(dataframe,name):
    """Write ``dataframe`` to ``name`` via its ``to_csv`` method.

    The index is written as well, because ``to_csv`` is called with its
    defaults. Nothing is returned.
    """
    target = name
    dataframe.to_csv(target)
    
def write_data(typ_,data,name):
    """Persist ``data`` under ``name`` in the requested format.

    Only ``typ_ == "csv"`` is supported; every other type is silently ignored.
    Nothing is returned.
    """
    if typ_ != "csv":
        return
    write_dataframe_csv(data, name)

In [None]:
def load_one(data):
    """Single-dataset loader: hand back ``data`` unchanged."""
    dataset = data
    return dataset

def load_two(data1,data2,axis=0,drop_index=True):
    """Concatenate two pandas objects and shuffle the result.

    Parameters
    ----------
    data1, data2 : objects accepted by ``pandas.concat``.
    axis : concatenation axis forwarded to ``pandas.concat``.
    drop_index : forwarded to ``reset_index(drop=...)``; when False the old
        index is kept as a column.

    Returns
    -------
    The concatenated data, fully resampled (``frac=1``) with the fixed seed
    ``random_state=2`` and a fresh index.
    """
    stacked = pd.concat([data1, data2], axis=axis)
    shuffled = stacked.sample(frac=1, random_state=2)
    return shuffled.reset_index(drop=drop_index)

def loader(typ_,data,axis=0,drop_index=True):
    """Load one dataset, or combine two, depending on ``typ_``.

    Parameters
    ----------
    typ_ : ``"one"`` returns ``data`` via ``load_one``; ``"two"`` combines
        ``data[0]`` and ``data[1]`` via ``load_two``; anything else gives None.
    data : a single dataset for ``"one"``, or an indexable pair for ``"two"``.
    axis : concatenation axis forwarded to ``load_two``.
    drop_index : forwarded to ``load_two``; whether the old index is dropped
        after shuffling.

    Returns
    -------
    The loaded / combined data, or ``None`` for an unsupported ``typ_``.
    """
    if typ_ == "one":
        return load_one(data)
    if typ_ == "two":
        # Forward the caller's options; previously axis=0 / drop_index=True
        # were hard-coded here and the parameters were silently ignored.
        return load_two(data[0], data[1], axis=axis, drop_index=drop_index)
    return None



In [None]:
def data_spliter(train,label,random_state=2,test_size=0.3):
    """Split features and labels into train and test partitions.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    train : feature data.
    label : labels aligned with ``train``.
    random_state : seed forwarded to ``train_test_split``.
    test_size : test fraction forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``.
    """
    x_fit, x_eval, y_fit, y_eval = train_test_split(
        train,
        label,
        random_state=random_state,
        test_size=test_size,
    )
    return x_fit, x_eval, y_fit, y_eval

def set_ss(data):
    """Standardise ``data`` with a freshly fitted ``StandardScaler``.

    A new scaler is created and fitted on every call; the fitted scaler itself
    is not returned, only the transformed data.
    """
    return StandardScaler().fit_transform(data)

def scalers(typ_,data):
    """Scale ``data`` with the scaler named by ``typ_``.

    ``"ss"`` selects ``set_ss``; any other value returns ``None``.
    """
    return set_ss(data) if typ_ == "ss" else None


def select_train_columns(data,n):
    """Take the first ``n`` columns of ``data`` as the feature set.

    Returns
    -------
    tuple
        ``(features, feature_columns)`` where ``features`` is
        ``data.iloc[:, :n]`` and ``feature_columns`` is its column index.
    """
    features = data.iloc[:, :n]
    feature_columns = features.columns
    return features, feature_columns
    
def select_label_column(data,column_name):
    """Return the values of ``data[column_name]`` as a NumPy array."""
    column = data[column_name]
    return np.array(column)

In [None]:
def dcl_train(train,label,model_name):
    """Fit a default ``DecisionTreeClassifier`` and persist it.

    Parameters
    ----------
    train : training features.
    label : training labels.
    model_name : destination passed to ``dump`` for the fitted classifier.

    Returns
    -------
    The fitted classifier.
    """
    classifier = DecisionTreeClassifier()
    classifier.fit(train, label)
    # Persist the fitted tree so it can be reloaded by name later.
    dump(classifier, model_name)
    return classifier

def train(typ_,train,label,model_name):
    """Train the model family selected by ``typ_``.

    ``"dcl"`` delegates to ``dcl_train``; any other value returns ``None``
    without training anything.
    """
    if typ_ != "dcl":
        return None
    return dcl_train(train, label, model_name)


In [None]:
def predict(model_name,data):
    """Load the persisted model at ``model_name`` and predict labels for ``data``.

    The model is re-loaded with ``load`` on every call.
    """
    estimator = load(model_name)
    return estimator.predict(data)

In [None]:
def retrain_with_new_data(model_name,train,label):
    """Re-fit the persisted model on new data only and save it back.

    The model stored at ``model_name`` is loaded, ``fit`` is called with
    ``train`` / ``label``, and the result overwrites the same location.

    Returns
    -------
    The re-fitted model.
    """
    estimator = load(model_name)
    estimator.fit(train, label)
    # Overwrite the stored model with the re-fitted one.
    dump(estimator, model_name)
    return estimator
    
    
def retrain_with_new_and_old_data(model_name,train,label):
    """Re-fit the persisted model on two stacked datasets and save it back.

    Parameters
    ----------
    model_name : location of the persisted model (loaded, then overwritten).
    train : indexable pair; ``train[0]`` and ``train[1]`` are stacked row-wise.
    label : indexable pair; ``label[0]`` and ``label[1]`` are concatenated.

    Returns
    -------
    The re-fitted model.
    """
    first_x, second_x = train[0], train[1]
    first_y, second_y = label[0], label[1]
    stacked_x = np.vstack([first_x, second_x])
    stacked_y = np.concatenate([first_y, second_y])
    estimator = load(model_name)
    estimator.fit(stacked_x, stacked_y)
    dump(estimator, model_name)
    return estimator
    
def retrain(typ_,model_name,train,label):
    """Re-fit a persisted model using the strategy named by ``typ_``.

    Parameters
    ----------
    typ_ : ``"only_new"`` re-fits on ``train`` / ``label`` as given;
        ``"new_and_old"`` expects ``train`` and ``label`` to be pairs that are
        stacked before fitting.
    model_name : location of the persisted model.
    train, label : training data in the shape the chosen strategy expects.

    Returns
    -------
    The re-fitted model, or ``None`` when ``typ_`` is not a supported strategy.
    """
    if typ_ == "only_new":
        return retrain_with_new_data(model_name, train, label)
    if typ_ == "new_and_old":
        return retrain_with_new_and_old_data(model_name, train, label)
    # Unknown strategy: return None (like the other dispatchers in this file)
    # rather than hitting an unassigned local.
    return None

In [None]:
def get_metrics(true_labels, predicted_labels):
    """Compute headline classification scores, rounded to four decimals.

    Parameters
    ----------
    true_labels : ground-truth labels.
    predicted_labels : labels predicted by the model.

    Returns
    -------
    dict
        Keys ``'Accuracy:'``, ``'Precision:'``, ``'Recall:'`` and
        ``'F1 Score:'``. Precision, recall and F1 use ``average='weighted'``.
    """
    # The result dict must not be called `metrics`: that name would shadow the
    # imported `sklearn.metrics` module and every scorer lookup below would
    # fail with AttributeError on a dict.
    scores = {}

    scores['Accuracy:'] = np.round(metrics.accuracy_score(true_labels,predicted_labels),4)
    scores['Precision:'] =  np.round(metrics.precision_score(true_labels,predicted_labels,average='weighted'),4)
    scores['Recall:'] = np.round(metrics.recall_score(true_labels,predicted_labels,average='weighted'),4)
    scores['F1 Score:'] = np.round(metrics.f1_score(true_labels,predicted_labels,average='weighted'),4)
    return scores

def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
    """Build the per-class classification report.

    Parameters
    ----------
    true_labels : ground-truth labels.
    predicted_labels : labels predicted by the model.
    classes : label values to include, forwarded as ``labels``.

    Returns
    -------
    The value returned by ``metrics.classification_report``.
    """
    return metrics.classification_report(
        y_true=true_labels,
        y_pred=predicted_labels,
        labels=classes,
    )
    
    
    
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    """Bundle the summary scores and the classification report.

    Returns
    -------
    dict
        ``{"metrics": <result of get_metrics>,
        "report": <result of display_classification_report>}``.
    """
    summary = get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    per_class = display_classification_report(
        true_labels=true_labels,
        predicted_labels=predicted_labels,
        classes=classes,
    )
    return {"metrics": summary, "report": per_class}

def model_analysis(true_labels,predicted_labels,classes,typ_="default"):
    """Evaluate predictions with the analysis named by ``typ_``.

    Parameters
    ----------
    true_labels : ground-truth labels.
    predicted_labels : labels predicted by the model.
    classes : label values to include in the report.
    typ_ : analysis kind; only ``"default"`` is supported.

    Returns
    -------
    The dict produced by ``display_model_performance_metrics`` for
    ``"default"``, or ``None`` for an unsupported ``typ_``.
    """
    if typ_ == "default":
        return display_model_performance_metrics(true_labels, predicted_labels, classes)
    # Unsupported analysis kind: return None instead of reading a local that
    # was never assigned (the previous behaviour was UnboundLocalError).
    return None
    

In [None]:
def label_encoder(data):
    """Encode the values of ``data`` as integer labels.

    A new ``LabelEncoder`` is fitted on every call; the fitted encoder is not
    returned, only the encoded values.
    """
    return LabelEncoder().fit_transform(data)

In [None]:
def covert_objects_to_int(data):
    """Label-encode every object-dtype column of a DataFrame.

    The encoding is applied to a copy, so the frame passed in is left
    untouched. Previously the caller's frame was modified in place, which
    silently overwrote the raw data held under any other name.

    Parameters
    ----------
    data : DataFrame whose ``object`` columns should be encoded.

    Returns
    -------
    A new DataFrame in which each object-dtype column has been replaced by the
    output of ``label_encoder``; other columns are carried over unchanged.
    """
    encoded = data.copy()
    for column in encoded.columns:
        if encoded[column].dtype == object:
            encoded[column] = label_encoder(encoded[column])
    return encoded



In [None]:
#model

def model(loader_type,data,data_type,scaler_type,model_type,model_name,analysis_type,train_column_limit,classes,label,test_ratio=0.3,
          train_data_path="C:\\Users\\DIVINE\\Desktop\\assignment\\data\\train\\training_data",
          test_data_path="C:\\Users\\DIVINE\\Desktop\\assignment\\data\\test\\testing_data"):
    """Run the full pipeline: load, encode, split, save, scale, train, evaluate.

    Parameters
    ----------
    loader_type : forwarded to ``loader`` (``"one"`` or ``"two"``).
    data : dataset, or pair of datasets, forwarded to ``loader``.
    data_type : output format for the saved splits, forwarded to ``write_data``.
    scaler_type : scaler name forwarded to ``scalers``.
    model_type : model family forwarded to ``train``.
    model_name : location where the trained model is persisted and re-loaded.
    analysis_type : analysis kind forwarded to ``model_analysis``.
    train_column_limit : number of leading columns used as features.
    classes : label values to include in the report.
    label : name of the label column in the loaded frame.
    test_ratio : fraction of rows held out for testing.
    train_data_path, test_data_path : where the unscaled train / test feature
        splits are written. The defaults are the previously hard-coded
        locations.

    Returns
    -------
    The value returned by ``model_analysis`` for the held-out split.
    """
    frame = loader(loader_type,data,axis=0,drop_index=True)

    # Encode the frame that was just loaded. The previous code read a local
    # name `rta_` that was never assigned, so the function always failed here.
    frame = covert_objects_to_int(frame)

    # Feature columns and label column both come from the full encoded frame.
    features, _ = select_train_columns(frame,train_column_limit)
    labels = select_label_column(frame,label)

    train_x,test_x,train_y,test_y = data_spliter(features,labels,random_state=2,test_size=test_ratio)

    # Save the (unscaled) feature splits.
    write_data(data_type,train_x,train_data_path)
    write_data(data_type,test_x,test_data_path)

    # NOTE(review): `scalers` fits a fresh scaler on every call, so the test
    # split is standardised with its own statistics rather than the training
    # split's. Reusing the training scaler would need `scalers` to return it.
    scaled_train_x = scalers(scaler_type,train_x)
    scaled_test_x = scalers(scaler_type,test_x)
    # `scalers` returns None for an unsupported type; fall back to raw features.
    if scaled_train_x is None:
        scaled_train_x = train_x
    if scaled_test_x is None:
        scaled_test_x = test_x

    # Train on the same (scaled) representation that is used for prediction;
    # previously the model was fitted on unscaled features but scored on
    # scaled ones.
    train(model_type,scaled_train_x,train_y,model_name)

    preds = predict(model_name,scaled_test_x)

    return model_analysis(test_y,preds,classes,typ_=analysis_type)


In [None]:
#prediction
def prediction(data,model_name,scaler_type):
    """Predict labels for raw input using a persisted model.

    Parameters
    ----------
    data : anything accepted by the ``pandas.DataFrame`` constructor.
    model_name : location of the persisted model.
    scaler_type : scaler name forwarded to ``scalers``.

    Returns
    -------
    The labels returned by ``predict``.
    """
    # `pd.DataFrame` (capital F); the previous `pd.Dataframe` raised
    # AttributeError on every call.
    frame = pd.DataFrame(data)
    frame = covert_objects_to_int(frame)
    # NOTE(review): `scalers` fits a new scaler on this input alone, so the
    # scaling statistics differ from those seen in training — confirm intended.
    scaled = scalers(scaler_type,frame)
    return predict(model_name,scaled)

In [None]:
#retraining
def retrained(train,label,test,test_label,classes,typ_,model_name,scaler_type,analysis_type="default"):
    """Re-fit a persisted model and evaluate it on a test set.

    Parameters
    ----------
    train, label : training data forwarded to ``retrain``.
    test : test features passed to the re-fitted model's ``predict`` as given.
    test_label : ground-truth labels for ``test``.
    classes : label values to include in the report.
    typ_ : retraining strategy forwarded to ``retrain``.
    model_name : location of the persisted model.
    scaler_type : accepted for signature compatibility; not used here.
    analysis_type : analysis kind forwarded to ``model_analysis``. This was
        previously read as an undefined name (NameError); it is now an
        explicit parameter with the only supported value as its default.

    Returns
    -------
    The value returned by ``model_analysis``.
    """
    refit_model = retrain(typ_=typ_,model_name=model_name,train=train,label=label)
    preds = refit_model.predict(test)
    return model_analysis(true_labels=test_label,predicted_labels=preds,classes=classes,typ_=analysis_type)

def process(loader_type,train,label,train_column_limit,data_type,test_ratio=0.3,scaler_type="ss",
            train_data_path="C:\\Users\\DIVINE\\Desktop\\assignment\\data\\train\\training_data",
            test_data_path="C:\\Users\\DIVINE\\Desktop\\assignment\\data\\test\\testing_data"):
    """Prepare scaled train / test splits from raw data.

    Parameters
    ----------
    loader_type : forwarded to ``loader`` (``"one"`` or ``"two"``).
    train : dataset, or pair of datasets, forwarded to ``loader``.
    label : name of the label column in the loaded frame.
    train_column_limit : number of leading columns used as features.
    data_type : output format for the saved splits, forwarded to ``write_data``.
    test_ratio : fraction of rows held out for testing.
    scaler_type : scaler name forwarded to ``scalers``. This was previously
        read as an undefined name (NameError); it is now a parameter whose
        default is the only scaler ``scalers`` supports.
    train_data_path, test_data_path : where the unscaled feature splits are
        written. The defaults are the previously hard-coded locations.

    Returns
    -------
    tuple
        ``(scaled_train_x, scaled_test_x, train_labels, test_labels)``.
    """
    frame = loader(loader_type,train,axis=0,drop_index=True)

    frame = covert_objects_to_int(frame)

    # Read the labels from the full frame *before* narrowing to the feature
    # columns; previously the label was looked up in the already-narrowed
    # frame, which fails whenever the label is not among the first
    # `train_column_limit` columns.
    labels = select_label_column(frame,label)
    features, _ = select_train_columns(frame,train_column_limit)

    train_x,test_x,label_x,test_y = data_spliter(features,labels,random_state=2,test_size=test_ratio)

    # Save the (unscaled) feature splits.
    write_data(data_type,train_x,train_data_path)
    write_data(data_type,test_x,test_data_path)

    # NOTE(review): each call to `scalers` fits its own scaler, so the two
    # splits are standardised independently.
    scaled_train_x = scalers(scaler_type,train_x)
    scaled_test_x = scalers(scaler_type,test_x)
    return scaled_train_x,scaled_test_x,label_x,test_y

In [None]:
# Location of the cleaned training CSV (absolute path on the author's machine).
data = "C:\\Users\\DIVINE\\Desktop\\assignment\\data\\train\\cleaned.csv"

# Read the CSV through the generic reader.
rta = read_from_file_or_data(typ_="csv", sect="file", name=data, sep=",")

# Label-encode the object columns so every column is numeric for the plots below.
rta_ = covert_objects_to_int(rta)
rta_.head()

In [None]:
# Univariate analysis: one histogram per column of the encoded frame.
rta_.hist(
    bins=15,
    edgecolor="blue",
    color="red",
    linewidth=1.0,
    xlabelsize=8,
    ylabelsize=8,
    grid=False,
)
# Enlarge the layout rectangle, then place the figure title above it.
plt.tight_layout(rect=(0.0, 0.0, 1.2, 1.2))
plt.suptitle("Univariate", x=0.6, y=1.3, fontsize=14)
plt.show()

In [None]:
# Multivariate analysis: annotated heat map of the pairwise column correlations.
fig, ax = plt.subplots(figsize=(20, 10))
rta_corr = rta_.corr()
heatmap = sns.heatmap(
    rta_corr.round(2),
    ax=ax,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=1.5,
)
fig.subplots_adjust(top=0.95)
# Assigning the title to a name keeps its repr out of the cell output.
t = fig.suptitle("Heat Map Of The Correlation", y=1, fontsize=30)