In [5]:
import pandas as pd
import numpy as np

def macro_f1(y_pre,y_test):
    """Return the macro-averaged F1 score of predictions against true labels.

    One F1 score is computed per distinct label found in ``y_test``
    (that label versus everything else) and the unweighted mean of those
    per-class scores is returned.

    Parameters
    ----------
    y_pre : array-like
        Predicted labels, positionally aligned with ``y_test``.
    y_test : array-like
        Ground-truth labels.

    Returns
    -------
    numpy.float64
        Mean of the per-class F1 scores. A class that is never predicted
        correctly contributes 0 instead of NaN. The result is NaN when
        ``y_test`` is empty (mean of no scores).
    """
    # Work on plain arrays so comparisons are element-wise and positional
    # regardless of whether lists, arrays or pandas Series were passed in.
    predicted = np.asarray(y_pre)
    actual = np.asarray(y_test)

    f1_scores = []
    for value in np.unique(actual):
        is_predicted = predicted == value
        is_actual = actual == value
        true_pos = np.sum(is_predicted & is_actual)
        false_pos = np.sum(is_predicted & ~is_actual)
        false_neg = np.sum(~is_predicted & is_actual)

        # Guard both denominators: a class that is never predicted has
        # true_pos + false_pos == 0, which would otherwise yield 0/0 = NaN
        # and poison the final mean.
        predicted_count = true_pos + false_pos
        actual_count = true_pos + false_neg
        precision = true_pos/predicted_count if predicted_count else 0.0
        recall = true_pos/actual_count if actual_count else 0.0

        if recall + precision == 0:
            f1_scores.append(0)
            continue
        f1_scores.append(2*(recall*precision)/(recall+precision))
    return np.mean(f1_scores)
        
  
def f1_score(y_pre,y_test):
    """Return the binary F1 score, treating label 1 as the positive class.

    Parameters
    ----------
    y_pre : array-like
        Predicted labels (0 or 1).
    y_test : array-like
        Ground-truth labels (0 or 1).

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or 0.0 when there
        are no true positives (precision and/or recall would be undefined).
    """
    data = pd.DataFrame({
        'predicted':y_pre,
        'actual':y_test,
    })
    predicted = data['predicted']
    actual = data['actual']

    true_pos = int(((predicted == 1) & (actual == 1)).sum())
    false_pos = int(((predicted == 1) & (actual == 0)).sum())
    false_neg = int(((predicted == 0) & (actual == 1)).sum())

    # With zero true positives at least one of precision/recall is 0 or 0/0;
    # report 0.0 rather than raising ZeroDivisionError.
    if true_pos == 0:
        return 0.0

    precision = true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos+false_neg)
    return 2*(precision*recall)/(precision+recall)


def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise.

    The input is first limited to the range [-500, 500] before it is
    passed to ``np.exp``.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1/denominator

class Logistic_Regression:
    """Logistic regression trained with full-batch gradient steps.

    When the training labels contain more than two distinct values the model
    uses a softmax output over one-hot encoded targets. Otherwise it uses the
    sigmoid and feeds the labels into the update as they are, so binary
    labels are expected to already be 0/1.

    Attributes set by ``fit``:
        output      number of distinct training labels
        activation  ``self.softmax`` or the module-level ``sigmoid``
        coffei      weights; row 0 is the bias term
        class_index label -> one-hot column (multi-class only)
    """

    def __init__(self,epoch=1000,rate=0.01,):
        """Store the number of gradient steps (``epoch``) and the step size (``rate``)."""
        self.rate = rate
        self.numb = epoch
        self.class_index={}

    @staticmethod
    def _with_bias(x):
        """Return ``x`` as a 2-D float array with a leading column of ones."""
        features = np.asarray(x, dtype=float)
        return np.insert(features,0,1,axis=1)

    def fit(self,x_train,y_train):
        """Learn the weights from ``x_train`` (n_samples x n_features) and ``y_train``.

        Accepts DataFrames, arrays or nested lists. Weights start at one and
        are updated ``epoch`` times using the mean gradient over all samples.
        """
        # Convert first so that plain lists work too (they have no .shape).
        x_train = self._with_bias(x_train)
        y_train = np.asarray(y_train).ravel()
        n_samples, n_columns = x_train.shape

        self.output = len(np.unique(y_train))
        # Forget labels from any previous fit; stale entries would otherwise
        # keep indices that no longer match the new one-hot columns.
        self.class_index = {}
        if self.output>2:
            self.activation=self.softmax
            y_train = self.one_hot_coding(y_train)
            self.coffei = np.ones((n_columns, self.output))
        else:
            self.activation=sigmoid
            self.coffei = np.ones(n_columns)

        for _ in range(self.numb):
            y_hat = self.activation(np.dot(x_train,self.coffei))
            gradient = np.dot(x_train.T,(y_train-y_hat))
            self.coffei = self.coffei + gradient*self.rate/n_samples

    def predict(self,x_test,value=0.5):
        """Return a list with one predicted label per row of ``x_test``.

        Multi-class models return the original label with the highest
        softmax score; ``value`` is ignored. Binary models return 1 where
        the sigmoid output is at least ``value`` and 0 elsewhere.
        """
        scores = self.activation(np.dot(self._with_bias(x_test),self.coffei))

        # Bound methods are re-created on every attribute access, so they
        # must be compared with == rather than `is`.
        if self.activation==self.softmax:
            index_to_class = {column: label for label, column in self.class_index.items()}
            return [index_to_class[column] for column in np.argmax(scores,axis=1)]

        return (scores>=value).astype(int).tolist()

    def softmax(self,z):
        """Row-wise softmax of a 2-D score matrix."""
        # Subtracting the row maximum leaves the result unchanged and keeps
        # the exponentials from overflowing.
        shifted = z-np.max(z,axis=1,keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z/(np.sum(exp_z,axis=1,keepdims=True))

    def one_hot_coding(self,y_test):
        """One-hot encode labels into an (n_samples, self.output) integer array.

        Labels not yet in ``self.class_index`` are assigned the next free
        column in order of first appearance.
        """
        y_test = np.asarray(y_test)

        for label in y_test:
            if label not in self.class_index:
                self.class_index[label] = len(self.class_index)

        columns = np.array([self.class_index[label] for label in y_test])
        return (columns[:,None]==np.arange(self.output)).astype(int)
        
        

class Bag_logistic:
    """Ensemble of ``Logistic_Regression`` models combined by majority vote.

    Each model is trained on its own random subsample of the training frame
    (drawn with ``DataFrame.sample``, seeded by the model's position so the
    ensemble is reproducible).
    """

    def __init__(self,percent,lg_count,target='target',epoch=1000,rate=0.01):
        """
        percent  : fraction of the training rows given to each model
        lg_count : number of models in the ensemble
        target   : name of the label column in the training frame
        epoch    : gradient steps for every model
        rate     : learning rate for every model
        """
        self.lg_count = lg_count
        self.percent = percent
        self.target = target
        self.epoch = epoch
        self.rate = rate

    def bag(self,dataset):
        """Train ``lg_count`` models on subsamples of ``dataset``.

        ``dataset`` must be a DataFrame holding the features plus the label
        column named by ``target``. The fitted models are stored in
        ``self.lg_object`` and the labels of each subsample in
        ``self.sample_datasets_target``.
        """
        self.dataset = dataset
        sample_size = int(dataset.shape[0]*self.percent)

        self.sample_datasets_target = []
        self.lg_object = []
        for seed in range(self.lg_count):
            sample = dataset.sample(sample_size,random_state=seed)
            labels = sample[self.target]
            features = sample.drop(columns=[self.target])

            model = Logistic_Regression(self.epoch,self.rate)
            model.fit(features,labels)

            self.sample_datasets_target.append(labels)
            self.lg_object.append(model)
        print("Trained Successfully")

    def bag_predict(self,test_dataset,value=0.5):
        """Return the majority-vote prediction for every row of ``test_dataset``.

        ``value`` is forwarded to each model's ``predict`` as the decision
        threshold. Votes are counted by summing the models' outputs, so the
        models must emit 0/1 labels. A row is labelled 1 only when strictly
        more than half of the models vote 1 (ties go to 0).
        """
        per_model = [
            self.lg_object[i].predict(test_dataset,value)
            for i in range(self.lg_count)
        ]
        votes = np.sum(np.array(per_model),axis=0)
        needed = self.lg_count // 2 + 1
        return (votes >= needed).astype(int)

        
        



In [6]:
class Train_test_split:
    """Randomly split a DataFrame into train/test features and labels."""

    def __init__(self,dataset,test_size=0.2,random=1,target='target'):
        """
        dataset   : DataFrame containing the features and the label column
        test_size : fraction of rows placed in the test split
        random    : seed passed to ``np.random.seed`` before sampling
        target    : name of the label column
        """
        self.target = target
        self.random_state=random
        self.test_size=test_size
        self.dataset = dataset

        self.row = self.dataset.shape[0]

    def split(self):
        """Return ``x_train, x_test, y_train, y_test``.

        ``int(rows * test_size)`` row positions are drawn without
        replacement for the test split (kept in drawn order); all other rows
        form the training split in their original order. Note that this
        seeds NumPy's global random generator.
        """
        np.random.seed(self.random_state)
        row_count = int(self.row*self.test_size)
        row_no = np.random.choice(np.arange(0,self.row),replace=False,size=row_count)

        # Mark the drawn positions once instead of scanning row_no for every
        # row (which made the split quadratic in the number of rows).
        is_test = np.zeros(self.row,dtype=bool)
        is_test[row_no] = True
        remaining_row = np.flatnonzero(~is_test)

        test_dataset = self.dataset.iloc[row_no]
        x_test = test_dataset.drop(columns=[self.target])
        y_test = test_dataset[self.target]

        train_dataset = self.dataset.iloc[remaining_row]
        x_train = train_dataset.drop(columns=[self.target])
        y_train = train_dataset[self.target]

        return x_train,x_test,y_train,y_test
      

In [7]:

class Scaler:
    """Standardise columns to zero mean and unit standard deviation.

    Call ``scale`` once on the training data to learn the statistics, then
    ``transform`` on any data that should be scaled with them.
    """

    def __init__(self):
        # Both are filled in by scale(); None means "not fitted yet".
        self.mean = None
        self.std = None

    def scale(self,dataset):
        """Learn the per-column mean and standard deviation of ``dataset``."""
        self.mean = dataset.mean()
        self.std = dataset.std()

    def transform(self,dataset):
        """Return ``(dataset - mean) / std`` using the learned statistics.

        Columns whose learned standard deviation is 0 (constant columns) are
        divided by 1 instead, so they come out as 0 rather than NaN/inf.
        """
        std = self.std
        if hasattr(std,'mask'):
            # Per-column statistics (pandas Series).
            divisor = std.mask(std == 0, 1.0)
        else:
            # Single scalar statistic (e.g. fitted on one Series).
            divisor = 1.0 if std == 0 else std
        new_dataset = (dataset-self.mean)/divisor
        return new_dataset
    

In [8]:
# Binary task: load the training data and hold out a test split.
data = pd.read_csv('train_binary.csv')
# Labels are in the 'label' column; test fraction and seed use the
# Train_test_split defaults.
t = Train_test_split(data,target='label')
x_train,x_test,y_train,y_test = t.split()


# Learn the scaling statistics from the training features only, then apply
# them to both splits.
s = Scaler()
s.scale(x_train)
x_train = s.transform(x_train)
x_test = s.transform(x_test)
# Re-attach the labels under the name 'target', the column that
# Bag_logistic.bag reads the labels from.
x_train['target']=y_train

In [9]:
# Train an ensemble of 50 logistic models, each on a random 70% sample of
# the scaled training frame (features plus the 'target' column).
b = Bag_logistic(0.7,50)
b.bag(x_train)

Trained Successfully


In [10]:
# Majority-vote predictions on the held-out split; every model uses 0.85 as
# its decision threshold.
out = b.bag_predict(x_test,.85)

In [11]:
# F1 (positive class = 1) of the ensemble on the held-out split.
score=f1_score(out,y_test)

In [12]:
# Display the hold-out F1 of the binary ensemble.
score

0.949025974025974

In [20]:
# Predict the binary competition test set and write the submission file.
test_data = pd.read_csv('test_binary.csv')
# Scale the freshly loaded test set with the statistics learned on the
# training features. (This previously scaled `data`, the training frame,
# which discarded the test set that had just been read.)
test_data=s.transform(test_data)
pre_value = b.bag_predict(test_data,.85)
# Row position in the test file serves as the submission id.
id_ = np.arange(test_data.shape[0])
submit = pd.DataFrame({
    'id':id_,
    'predicted_value' :pre_value
})
submit.to_csv('Binary_Submission.csv',index=False)

In [41]:
# Multi-class task: load the training data and hold out a test split.
# Note that `data`, `t`, `s` and the split variables are rebound here and no
# longer refer to the binary task.
data = pd.read_csv('train_multi_class_new.csv')
t = Train_test_split(data,target='target')
x_train,x_test,y_train,y_test = t.split()


# Learn the scaling statistics from the training features only, then apply
# them to both splits.
s = Scaler()
s.scale(x_train)
x_train = s.transform(x_train)
x_test = s.transform(x_test)

In [36]:
# Single logistic-regression model: 13000 gradient steps at learning rate 0.5.
lg = Logistic_Regression(epoch=13000,rate=0.5)
lg.fit(x_train,y_train)

In [37]:
# Predict the held-out split. The 0.5 threshold only matters for a binary
# model; a softmax model returns the highest-scoring class.
y_pre=lg.predict(x_test,0.5)

In [38]:
# Macro-averaged F1 over the classes present in the held-out labels.
score = macro_f1(y_pre,y_test)

In [39]:
# Display the hold-out macro F1 of the multi-class model.
score

np.float64(0.5867394968830728)

In [45]:
# Predict the multi-class competition test set and write the submission file.
test_data = pd.read_csv('test_multi_class.csv')
# Scale with the statistics learned on the multi-class training features.
test_data=s.transform(test_data)
pre_value = lg.predict(test_data,.5)
# Row position in the test file serves as the submission id.
id_ = np.arange(test_data.shape[0])
submit = pd.DataFrame({
    'id':id_,
    'predicted_value' :pre_value
})
submit.to_csv('multi-class_Submission.csv',index=False)