In [979]:
import pandas as pd
import numpy as np
import random

class ProcessingData:
    @staticmethod
    def shuffle(data: pd.DataFrame) -> pd.DataFrame:
        for i in range(len(data)):
            j = random.randint(0,len(data)-1)
            data.iloc[i],data.iloc[j] = data.iloc[j],data.iloc[i]
        return data
    @staticmethod
    def normalize(data: pd.DataFrame,columns: list) -> pd.DataFrame:
        for column in columns:
            x = data[column].max()
            y = data[column].min()
            data[column] = (data[column]-y)/(x-y)
        return data      
    @staticmethod
    def split(data: pd.DataFrame) -> pd.DataFrame:
        train_set = pd.DataFrame(columns=data.columns)
        test_set = pd.DataFrame(columns=data.columns)
        for i in range(len(data)):
            if random.random() < 0.7:
                train_set = train_set.append(data.iloc[i])
            else:
                test_set = test_set.append(data.iloc[i])
        return train_set,test_set


In [980]:
irisdf = pd.read_csv('iris.csv')
irisdf

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [981]:
irisdf = ProcessingData.shuffle(irisdf)
irisdf = ProcessingData.normalize(irisdf,['sepal.length','sepal.width','petal.length','petal.width'])
train_set,test_set = ProcessingData.split(irisdf)
print(train_set)

     sepal.length  sepal.width  petal.length  petal.width     variety
0        0.083333     0.500000      0.067797     0.041667      Setosa
2        0.305556     0.791667      0.050847     0.125000      Setosa
3        0.361111     0.333333      0.661017     0.791667   Virginica
5        0.305556     0.791667      0.118644     0.125000      Setosa
8        0.583333     0.500000      0.593220     0.583333  Versicolor
..            ...          ...           ...          ...         ...
140      0.333333     0.916667      0.067797     0.041667      Setosa
142      0.416667     0.333333      0.694915     0.958333   Virginica
143      0.388889     0.750000      0.118644     0.083333      Setosa
147      0.361111     0.416667      0.525424     0.500000  Versicolor
148      0.583333     0.291667      0.728814     0.750000   Virginica

[104 rows x 5 columns]


In [982]:
class softClassificator:
    @staticmethod
    def fit(df: pd.DataFrame):
        means = {}
        mins = {}
        maxes = {}
        for variety in df['variety'].unique():
            means[variety] = {}
            mins[variety] = {}
            maxes[variety] = {}
            for atributte in df.columns:
                if atributte != "variety":
                    means[variety][atributte] = df[df['variety'] == variety][atributte].mean()
                    mins[variety][atributte] = df[df['variety'] == variety][atributte].min()
                    maxes[variety][atributte] = df[df['variety'] == variety][atributte].max()
                    for row in df.index:
                        if df.loc[row,atributte] < means[variety][atributte]:
                            df.loc[row,atributte] = 0
                        else:
                            df.loc[row,atributte] = 1
        return df,means,mins,maxes

    @staticmethod
    def predict_sample(sample:pd.Series, means:dict, mins:dict, maxes:dict) -> str:
        for variety in means.keys():
            for atributte in means[variety].keys():
                if sample[atributte] < means[variety][atributte]:
                    sample[atributte] = 0
                else:
                    sample[atributte] = 1
        max_probability = 0
        max_variety = ""
        for variety in means.keys():
            probability = 1
            for atributte in means[variety].keys():
                if sample[atributte] == 0:
                    probability *= (1-means[variety][atributte])
                else:
                    probability *= means[variety][atributte]
            if probability > max_probability:
                max_probability = probability
                max_variety = variety
        return max_variety

    @staticmethod
    def accuracy(df: pd.DataFrame, means:dict, mins:dict, maxes:dict) -> float:
        correct = 0
        for row in df.index:
            if df.loc[row,'variety'] == softClassificator.predict_sample(df.loc[row],means,mins,maxes):
                correct += 1
        return f"{round(correct/len(df) * 100,2)}%"


In [983]:
train_set,means,mins,maxes = softClassificator.fit(train_set)

In [984]:
print(softClassificator.accuracy(test_set,means,mins,maxes))

69.57%


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()
