In [61]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.datasets import load_linnerud
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from physlearn import Regressor
from sklearn.svm import LinearSVC
import os
import cv2
import re

In [62]:
# Load the raw measurements from the Excel workbook.
df = pd.read_excel("data_new.xlsx")

# Names of the post-treatment ("(химия)") property columns.
after_chem_columns = ["Ароматичность(химия)","Алифатичность(химия)","Разветвленность(химия)","Окисленность(химия)"," содержание серы(химия)",
              "плотность(химия)","вязкость(химия)","процент неразделяемой эмульсии(химия)"]

# The dataset contains 2 empty rows used as delimiters for classification;
# drop every row with missing values so the integer casts below cannot fail.
df = df.dropna(axis='rows')
df["класс химии"] = df["класс химии"].astype(int)
df["процент химии"] = df["процент химии"].astype(int)

# Positional split: the first 6000 remaining rows are the training pool,
# everything after that is held back as the validation set.
df_train = df.iloc[:6000, :]
df_valid = df.iloc[6000:, :]

# Shuffle the training pool with a fixed seed. Without it the row order changes
# on every run, so the downstream train_test_split(random_state=42) would still
# select different rows each time and the reported score would not be reproducible.
df_train = df_train.sample(frac=1, random_state=42)
df_train.head()

Unnamed: 0,Ароматичность,Алифатичность,Разветвленность,Окисленность,содержание серы,плотность,вязкость,процент неразделяемой эмульсии,класс химии,процент химии,Ароматичность(химия),Алифатичность(химия),Разветвленность(химия),Окисленность(химия),содержание серы(химия),плотность(химия),вязкость(химия),процент неразделяемой эмульсии(химия)
4100,0.45,8.0,0.45,1.9,2.6,560.0,135.0,11.0,4,10,11.55,14.5,10.55,11.0,7.7,563.3,139.9,16.1
2824,0.12,4.5,0.36,1.9,5.1,791.0,169.0,2.0,3,10,10.92,10.7,10.16,10.7,9.9,794.0,173.6,6.8
574,0.18,6.4,0.35,2.9,5.0,981.0,123.0,3.0,1,10,11.18,12.8,10.35,11.9,10.0,984.2,127.8,8.0
2564,0.06,2.1,0.31,1.7,4.1,626.0,150.0,39.0,3,5,4.86,6.2,7.11,7.5,8.9,627.8,152.9,43.8
1042,0.45,7.9,0.32,2.8,2.0,770.0,193.0,12.0,1,20,17.45,16.4,13.32,14.8,7.0,774.4,199.6,17.0


In [3]:
# Column-wise means of the post-treatment properties for each of the 6
# chemistry classes and each dosage (5, 10, 15, 20 percent).
# Layout: {chem_class: {percent: Series of means over after_chem_columns}}.
average_values_petrol = {}
for chem_class in range(1, 7):
    class_rows = df["класс химии"] == chem_class
    average_values_petrol[chem_class] = {
        percent: df.loc[class_rows & (df["процент химии"] == percent),
                        after_chem_columns].mean(axis=0)
        for percent in [5, 10, 15, 20]
    }

# Export the means to Excel: one sheet per chemistry class, one column per
# dosage percent. index=False means the property names are not written.
with pd.ExcelWriter("output.xlsx") as writer:
    for chem_class, means_by_percent in average_values_petrol.items():
        class_sheet = pd.DataFrame(means_by_percent)
        class_sheet.to_excel(writer, sheet_name=str(chem_class), index=False)

# Perfect petrol match after chemistry (values written with decimal commas):
# 7,8043, 11,099, 9,2492, 12,641, 8,2895, 835,22, 234,49, 29,635

Предсказываем нефть после обработки - 8 столбцов значений

In [63]:
# Snippet adapted from: https://scikit-physlearn.readthedocs.io/en/latest/quick_start.html

# Positional feature/target split: the first 10 columns of the shuffled
# training frame are the inputs, all remaining columns are the targets.
X, y = df_train.iloc[:, :10], df_train.iloc[:, 10:]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

# Feature transformation: concatenate 1 PCA component with 2 truncated-SVD
# components, so the regressor sees 3 derived features.
transformer_list = [('pca', PCA(n_components=1)),
                    ('svd', TruncatedSVD(n_components=2))]
union = FeatureUnion(transformer_list=transformer_list, n_jobs=-1)

# Select a regressor, e.g., LGBMRegressor from LightGBM,
# with a case-insensitive string.
reg = Regressor(regressor_choice='lgbmregressor',
                pipeline_transform=('tr', union),
                scoring='neg_mean_absolute_error')

# Automatically build the pipeline with final estimator MultiOutputRegressor
# from Sklearn, then exhaustively search over the (hyper)parameters.
search_params = dict(reg__boosting_type=['gbdt', 'goss'],
                     reg__n_estimators=[6, 8, 10, 20])
reg.search(X_train, y_train, search_params=search_params,
           search_method='gridsearchcv')

# Generate predictions with the refit regressor, then
# compute the average mean absolute error.
y_pred = reg.fit(X_train, y_train).predict(X_test)
score = reg.score(y_test, y_pred)
print(round(score['mae'].mean(), 2))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Pipeline] ................ (step 1 of 2) Processing tr, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing reg, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing tr, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing reg, total=   0.0s
6.11


In [7]:
# Serialize the `reg` regressor object to disk with pickle.
import pickle

file_model = "pickle_regression_model.pkl"
with open(file_model, mode='wb') as model_file:
    pickle.dump(reg, model_file)

In [64]:
# Distinct chemistry classes and dosage percents present in the data,
# in order of first appearance.
chemisrty_classes = pd.unique(df["класс химии"])
percent_rate = pd.unique(df["процент химии"])
print(chemisrty_classes, percent_rate)

[1 2 3 4 5 6] [ 5 10 15 20]


In [68]:
# Display the held-out feature rows produced by train_test_split.
X_test

Unnamed: 0,Ароматичность,Алифатичность,Разветвленность,Окисленность,содержание серы,плотность,вязкость,процент неразделяемой эмульсии,класс химии,процент химии
5761,0.30,4.1,0.36,2.0,4.3,841.0,312.0,2.0,5,20
2695,0.82,9.3,0.41,1.7,5.0,912.0,224.0,19.0,3,5
3616,0.35,4.2,0.40,1.6,3.6,812.0,274.0,36.0,3,20
1764,0.54,6.3,0.47,1.6,6.3,864.0,308.0,6.0,2,10
2639,0.30,7.0,0.47,2.0,2.2,943.0,135.0,14.0,3,5
...,...,...,...,...,...,...,...,...,...,...
3542,0.98,9.7,0.48,2.2,3.6,986.0,319.0,47.0,3,20
1642,0.66,9.5,0.50,1.6,0.1,1060.0,142.0,14.0,2,10
321,0.17,7.0,0.38,1.4,6.5,617.0,200.0,7.0,1,10
2896,0.21,7.4,0.48,1.5,4.6,649.0,216.0,46.0,3,10


Предсказываем химию полученной нефти на сгенерированных предсказаниях нефти после химии

In [109]:
# Candidate treatment settings to try for every input sample.
chemisrty_classes = [1, 2, 3, 4, 5, 6]
percent_rate = [5, 10, 15, 20]

# NOTE: this rebinds `df` from the training data to the inference input.
input_excel = "input_data.xlsx"
df = pd.read_excel(input_excel)

# Pick 5 random input rows; the chosen chemistry class and percent rate are
# written back into this frame by the search loop.
# sample(n=5) already draws rows at random without replacement, so a prior
# full shuffle (sample(frac=1)) is redundant.
df_output = df.sample(n=5)

In [106]:
from sklearn.neighbors import KNeighborsClassifier

# Inverse approach: instead of predicting the petrol for a given treatment, we
# predict the post-treatment petrol for every (class, percent) candidate and
# pick the candidate whose prediction is closest to the "ideal" parameter
# ratio of the petrol. The targets are the (chemistry class, percent) pairs.
#
# Ideal post-treatment petrol profile (loop-invariant, so defined once).
perfect_petrol = [7.8043, 11.099, 9.2492, 12.641, 8.2895, 835.22, 234.49, 29.635]

for index in range(len(df_output)):
    # Single-row frame for the current sample; the class / percent columns are
    # overwritten below for each candidate treatment.
    data = pd.DataFrame(df_output.iloc[index, :]).transpose()

    candidate_labels = []       # (class, percent) tried, in prediction order
    candidate_predictions = []  # predicted post-treatment properties per candidate
    for chm_class in np.sort(chemisrty_classes):
        data["класс химии"] = chm_class
        for rate in np.sort(percent_rate):
            data["процент химии"] = rate
            predictions = reg.predict(data).values
            candidate_labels.append((int(chm_class), int(rate)))
            candidate_predictions.append(predictions[0])

    df_predictions = pd.DataFrame(candidate_predictions)
    df_predictions["class"] = [label[0] for label in candidate_labels]
    df_predictions["%rate"] = [label[1] for label in candidate_labels]

    X_data = df_predictions.drop(columns=["class", "%rate"])
    y_data = df_predictions[["class", "%rate"]]

    # Every candidate has a unique (class, percent) label, so only the single
    # nearest neighbour is meaningful. With n_neighbors > 1 the classifier
    # votes on "class" and "%rate" independently and can return a pair that
    # does not match any of the neighbours.
    # NOTE(review): distances are computed on unscaled properties, so the
    # large-magnitude columns dominate; consider scaling.
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(X_data, y_data)
    prediction_class_percent = neigh.predict([perfect_petrol])

    # Positions 8 and 9 are presumably the "класс химии" / "процент химии"
    # columns of the input sheet; verify against input_data.xlsx.
    df_output.iloc[index, 8] = prediction_class_percent[0][0]
    df_output.iloc[index, 9] = prediction_class_percent[0][1]


In [8]:
# Serialize the `neigh` KNN classifier object to disk with pickle.
# `neigh` is rebound on each pass of the search loop, so what gets stored is
# the classifier from the most recent fit.
import pickle

file_model = "pickle_KNN_model.pkl"
with open(file_model, mode='wb') as model_file:
    pickle.dump(neigh, model_file)