In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, tree, svm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, metrics


In [2]:
# Подготовка данных 

path = 'data/DataSet_w_NA.xlsx'
df = pd.read_excel(path, sheet_name='Испорченные факты').dropna()


df = pd.pivot_table(df, values=['Продажи, руб', 'Продажи, шт','Повторение заказа', 'Маржинальная прибыль','Повторение товара'],
                    index=["Факты.Товар ID"], aggfunc={'Продажи, шт': [np.median, np.sum], 'Продажи, руб': np.sum, 
                                                       'Повторение заказа': np.sum, 'Маржинальная прибыль': np.sum})
df.columns = df.columns.map('_'.join)
df = df.reset_index()

total_sale = df['Продажи, руб_sum'].sum()
df['Доля'] = df['Продажи, руб_sum'] / total_sale * 100
df = df.assign(sum_d=df['Доля'].cumsum())

df.loc[(df['sum_d'] <= 80), 'ABC'] = 'A'
df.loc[(df['sum_d'] > 80) & (df['sum_d'] <= 95), 'ABC'] = 'B'
df.loc[(df['sum_d'] > 95), 'ABC'] = 'C'

In [3]:
def model_job(model, df_in, df_out, name_of_mo):
    
    x_train, x_test, y_train, y_test = model_selection.train_test_split(df_in, df_out, test_size=0.3, stratify=df_out)
    
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)
    
    model.fit(x_train, y_train)
    y_test_predicted = model.predict(x_test)
    
    df_res = pd.DataFrame({'y_test': y_test,
                          'y_test_predicted': y_test_predicted})
    
    df_res.loc[(df_res['y_test'] != df_res['y_test_predicted']), 'Изменение класса'] = 1
    df_res.loc[(df_res['y_test'] == df_res['y_test_predicted']), 'Изменение класса'] = 0
    
    print(name_of_mo, 'не правильно предсказано: ',  df_res['Изменение класса'].sum(), '/', len(df_res))
    print("Accuracy:", metrics.accuracy_score(y_test, y_test_predicted))
#     print(df_res.to_string())
    
    

In [4]:

df_in = df.drop(['Факты.Товар ID','ABC'], axis=1)
df_out = df.ABC


In [9]:
model_job(RandomForestClassifier(), df_in, df_out, 'Random Forest Classifier')

Random Forest Classifier не правильно предсказано:  1.0 / 559
Accuracy: 0.998211091234347


In [13]:
model_job(tree.DecisionTreeClassifier(), df_in, df_out, 'Decision Tree Classifier')

Decision Tree Classifier не правильно предсказано:  0.0 / 559
Accuracy: 1.0


In [14]:
model_job(svm.SVC(), df_in, df_out, 'Support Vector Machine')

Support Vector Machine не правильно предсказано:  28.0 / 559
Accuracy: 0.9499105545617174
