In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.datasets import load_linnerud
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from physlearn import Regressor
from sklearn.svm import LinearSVC
import os
import cv2
import re

In [34]:
# Load the raw measurements from the spreadsheet. Each row holds the oil
# parameters before treatment, the chemistry class and dosage percent that
# were applied, and the same parameters after treatment ("(химия)" columns).
df = pd.read_excel("data_new.xlsx")

# Names of the post-treatment columns (the regression targets).
# NOTE(review): " содержание серы(химия)" starts with a space — presumably it
# mirrors the spreadsheet header exactly; confirm before normalising it.
after_chem_columns = ["Ароматичность(химия)","Алифатичность(химия)","Разветвленность(химия)","Окисленность(химия)"," содержание серы(химия)",
              "плотность(химия)","вязкость(химия)","процент неразделяемой эмульсии(химия)"]

# Per the original author's note, the sheet uses two empty rows as a delimiter
# between classes; dropna removes them (and any other row with a missing value).
df = df.dropna(axis='rows')
df["класс химии"] = df["класс химии"].astype(int)
df["процент химии"] = df["процент химии"].astype(int)

# Positional hold-out: the first TRAIN_ROWS rows train the model, the rest
# are kept for the treatment search further down.
# NOTE(review): if the sheet is ordered by chemistry class, df_valid only
# contains the last class(es) — confirm this is intended.
TRAIN_ROWS = 6000
SHUFFLE_SEED = 42
df_train = df.iloc[:TRAIN_ROWS, :]
df_valid = df.iloc[TRAIN_ROWS:, :]

# Shuffle the training rows with a fixed seed. Without it every run feeds a
# differently ordered frame into train_test_split, so the downstream split
# and the reported score cannot be reproduced.
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED)
df_train.head()

Unnamed: 0,Ароматичность,Алифатичность,Разветвленность,Окисленность,содержание серы,плотность,вязкость,процент неразделяемой эмульсии,класс химии,процент химии,Ароматичность(химия),Алифатичность(химия),Разветвленность(химия),Окисленность(химия),содержание серы(химия),плотность(химия),вязкость(химия),процент неразделяемой эмульсии(химия)
3099,0.54,7.6,0.36,2.6,2.7,742.0,296.0,47.0,3,15,15.34,15.4,12.16,13.4,7.5,745.8,301.8,51.8
2985,0.28,9.9,0.44,1.8,3.2,1018.0,107.0,45.0,3,10,11.08,16.1,10.24,10.6,8.0,1021.0,111.6,49.8
1817,0.27,7.3,0.42,2.4,2.2,743.0,269.0,34.0,2,15,15.17,15.0,12.32,13.3,7.1,746.9,274.9,38.9
1285,1.0,8.7,0.47,2.7,2.6,936.0,153.0,14.0,2,5,5.9,12.9,7.37,8.6,7.5,937.9,155.9,18.9
290,0.73,4.2,0.5,1.4,5.0,704.0,131.0,46.0,1,5,5.73,8.5,7.5,7.4,10.0,706.0,134.0,51.0


In [100]:
# Mean post-treatment oil parameters for every combination of chemistry class
# (1..6) and dosage percent (5, 10, 15, 20).
# Layout: average_values_petrol[chem_class][percent] -> Series indexed by the
# post-treatment column names.
CHEM_CLASSES = range(1, 7)
PERCENT_RATES = [5, 10, 15, 20]

average_values_petrol = {}
for chem_class in CHEM_CLASSES:
    class_mask = df["класс химии"] == chem_class
    average_values_petrol[chem_class] = {
        percent: df.loc[class_mask & (df["процент химии"] == percent),
                        after_chem_columns].mean(axis=0)
        for percent in PERCENT_RATES
    }

# One Excel sheet per chemistry class: columns are the dosage percents, rows
# are the post-treatment parameters. The row labels are written out as well;
# dropping the index would leave eight unlabeled rows per sheet.
with pd.ExcelWriter("output.xlsx") as writer:
    for chem_class, means_by_percent in average_values_petrol.items():
        class_table = pd.DataFrame(means_by_percent)
        class_table.to_excel(writer, sheet_name=str(chem_class), index_label="parameter")

# Target ("perfect") oil parameters after chemistry:
# 7.8043, 11.099, 9.2492, 12.641, 8.2895, 835.22, 234.49, 29.635

Предсказываем параметры нефти после обработки химией — 8 целевых столбцов

In [37]:
# This snippet is adapted from the scikit-physlearn quick start:
# https://scikit-physlearn.readthedocs.io/en/latest/quick_start.html

# NOTE(review): `bunch` is not used anywhere in this cell — it looks like a
# leftover from the quick-start example; confirm before removing it.
bunch = load_linnerud(as_frame=True)  # returns a Bunch instance

# The first 10 columns are the model inputs, all remaining columns the targets.
X = df_train.iloc[:, :10]
y = df_train.iloc[:, 10:]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Feature step: one PCA component and two truncated-SVD components,
# concatenated by FeatureUnion and computed in parallel.
transformer_list = [
    ('pca', PCA(n_components=1)),
    ('svd', TruncatedSVD(n_components=2)),
]
union = FeatureUnion(transformer_list=transformer_list, n_jobs=-1)

# The regressor is chosen by a case-insensitive name (LGBMRegressor from
# LightGBM here); the union above is plugged in as the pipeline transform.
reg = Regressor(
    regressor_choice='lgbmregressor',
    pipeline_transform=('tr', union),
    scoring='neg_mean_absolute_error',
)

# Per the quick start, the pipeline is assembled automatically with a
# MultiOutputRegressor as the final estimator; the grid below is then
# searched exhaustively.
search_params = {
    'reg__boosting_type': ['gbdt', 'goss'],
    'reg__n_estimators': [6, 8, 10, 20],
}
reg.search(
    X_train,
    y_train,
    search_params=search_params,
    search_method='gridsearchcv',
)

# Refit on the training split, predict the held-out split and report the
# mean absolute error averaged over the target columns.
fitted_reg = reg.fit(X_train, y_train)
y_pred = fitted_reg.predict(X_test)
score = reg.score(y_test, y_pred)
mean_mae = score['mae'].mean()
print(round(mean_mae, 2))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Pipeline] ................ (step 1 of 2) Processing tr, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing reg, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing tr, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing reg, total=   0.0s
6.29


Подбираем химию (класс и процент) для нефти по сгенерированным предсказаниям параметров нефти после обработки

In [None]:
# Inverse search: for a sample of hold-out oils, predict the post-treatment
# parameters for every (chemistry class, dosage percent) candidate and report
# the candidate whose prediction lies closest to the "perfect" oil.
from sklearn.neighbors import KNeighborsClassifier

# Candidate treatments: every class and dosage percent present in the data.
chemisrty_classes = df["класс химии"].unique()
percent_rate = df["процент химии"].unique()
sorted_classes = np.sort(chemisrty_classes)
sorted_rates = np.sort(percent_rate)

# Model inputs of the hold-out rows (the same leading 10 columns used for training).
X_valid = df_valid.iloc[:, :10]

# Target ("perfect") post-treatment oil; the values are assumed to follow the
# order of the 8 predicted columns — TODO confirm.
# 7.8043, 11.099, 9.2492, 12.641, 8.2895, 835.22, 234.49, 29.635
perfect_petrol = [7.8043, 11.099, 9.2492, 12.641, 8.2895, 835.22, 234.49, 29.635]

# Draw the evaluated rows once, without replacement and with a fixed seed.
# Reshuffling the whole frame on every iteration could pick the same row
# several times, was not reproducible, and produced an empty slice (and a
# failing fit) whenever the hold-out set had fewer rows than iterations.
N_VALID_SAMPLES = 100
VALID_SAMPLE_SEED = 42
valid_samples = X_valid.sample(n=min(N_VALID_SAMPLES, len(X_valid)),
                               random_state=VALID_SAMPLE_SEED)

for sample_pos in range(len(valid_samples)):
    oil_row = valid_samples.iloc[[sample_pos]]  # one-row DataFrame

    # Predicted post-treatment parameters keyed by (class, rate).
    prediction_dict = {}
    for chm_class in sorted_classes:
        for rate in sorted_rates:
            # Work on a copy so the sampled frame itself is never modified.
            data = oil_row.copy()
            data["класс химии"] = chm_class
            data["процент химии"] = rate
            predictions = reg.predict(data).values
            prediction_dict[(int(chm_class), int(rate))] = predictions[0]

    df_predictions = pd.DataFrame(prediction_dict.values())
    df_predictions["class"] = [key[0] for key in prediction_dict.keys()]
    df_predictions["%rate"] = [key[1] for key in prediction_dict.keys()]

    # Feed in the perfect oil and pull out its nearest candidate, i.e. the
    # class and percent to apply to get as close as possible to the ideal
    # parameter ratio. The targets are the (class, percent) pair.
    X_data = df_predictions.drop(columns=["class", "%rate"])
    y_data = df_predictions[["class", "%rate"]]

    # Every candidate carries a unique (class, rate) label, so only a single
    # neighbour gives "the nearest candidate". With several neighbours each
    # output is voted on separately and ties are broken by label value, which
    # can return a pair that is not the closest prediction.
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(X_data, y_data)
    print(neigh.predict([perfect_petrol]))