In [2]:
from google.colab import files
uploaded = files.upload()

Saving winequality-white.csv to winequality-white.csv


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve

In [162]:
df = pd.read_csv('winequality-white.csv', sep=';')
df.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6


In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [164]:
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

Попробуем обучить модель определяющию высококачественное вино. Вино с качеством 9, 8, 7 будет 1, остальное 0.

In [165]:
df_ch = df.copy()

In [166]:
df_ch.loc[df_ch['quality'] < 7, 'quality'] = 0
df_ch.loc[df_ch['quality'] >= 7, 'quality'] = 1
df_ch.rename(columns={'quality': 'target'}, inplace=True)
df_ch['target'].value_counts()

0    3838
1    1060
Name: target, dtype: int64

In [167]:
X_train, X_test, y_train, y_test = train_test_split(df_ch.drop(columns='target'), df_ch['target'], test_size=0.2, stratify=df_ch['target'], random_state=17)

In [168]:
model_rf = RandomForestClassifier(n_estimators=455, max_depth=30, min_samples_leaf=1, random_state=42)

model_rf.fit(X_train, y_train)
y_pred = model_rf.predict_proba(X_test)[:, 1]


In [169]:
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
fscore_cl = fscore[ix]
precision_cl = precision[ix]
recall_cl = recall[ix]
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.490110, F-Score=0.749, Precision=0.841, Recall=0.675


Пусть у нас известно только о вине качества 8 уровня. 

In [170]:
df_rns = df.copy()

In [171]:
def pu_change(df, i):
  pos_sample = df[df['quality'] >= 7].sample(i, random_state=27)
  pos_sample['target'] = 1
  unb_sample = df.drop(pos_sample.index)
  unb_sample ['target'] = -1
  df = pd.concat([unb_sample , pos_sample]).sample(frac=1, random_state=31)
  return df

In [172]:
df_rns = pu_change(df_rns, 200)
df_rns.loc[df_rns['target'] == 1, 'quality'].value_counts()

7    168
8     31
9      1
Name: quality, dtype: int64

In [173]:
def to_negative(df):
  neg_sample  = df[df['target'] == -1].sample(len(df[df['target'] == 1]), random_state=27)
  neg_sample['target'] = 0
  df = df.drop(neg_sample.index)
  df = pd.concat([df , neg_sample]).sample(frac=1, random_state=31)
  return df

In [174]:
df_rns = to_negative(df_rns)
df_rns.loc[df_rns['target'] == 0, 'quality'].value_counts()

6    107
5     47
7     29
8      9
4      6
3      2
Name: quality, dtype: int64

In [175]:
def choose_sample(df):

  pos_sample = df[df['target'] == 1]
  neg_sample = df[df['target'] == 0]
  return pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=31)

In [176]:
def create_test_sample(df):
  df.loc[((df['quality'] < 7) & (df['target'] == -1)), 'target'] = 0
  df.loc[((df['quality'] >= 7) & (df['target'] == -1)), 'target'] = 1
  return df

In [177]:
def use_model(df):
  train_sample = choose_sample(df)

  model_rf = RandomForestClassifier(n_estimators=455, max_depth=30, min_samples_leaf=1, random_state=42)
  model_rf.fit(train_sample.drop(columns=['target', 'quality']), train_sample['target'])
  y_pred = model_rf.predict_proba(create_test_sample(df).drop(columns=['target', 'quality']))[:, 1]

  return y_pred 


In [178]:
y_pred_rns = use_model(df_rns)

In [179]:
  precision, recall, thresholds = precision_recall_curve(create_test_sample(df_rns)['target'], y_pred_rns)
  fscore = (2 * precision * recall) / (precision + recall)
  # locate the index of the largest f score
  ix = np.argmax(fscore)
  fscore_rns = fscore[ix]
  precision_rns = precision[ix]
  recall_rns = recall[ix]
  print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                          fscore[ix],
                                                                          precision[ix],
                                                                          recall[ix]))

Best Threshold=0.636264, F-Score=0.601, Precision=0.599, Recall=0.602


In [180]:
pd.DataFrame([['Classification', precision_cl , recall_cl, fscore_cl],
              ['Random negative sampling', precision_rns, recall_rns, fscore_rns]],
              columns=['Method', 'Precision', 'Recall', 'F_score'])

Unnamed: 0,Method,Precision,Recall,F_score
0,Classification,0.841176,0.674528,0.748691
1,Random negative sampling,0.599415,0.601761,0.600586


In [182]:
df_rns = df.copy()
df_rns = pu_change(df_rns, 400)
df_rns = to_negative(df_rns)
y_pred_rns = use_model(df_rns)

precision, recall, thresholds = precision_recall_curve(create_test_sample(df_rns)['target'], y_pred_rns)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
fscore_rns = fscore[ix]
precision_rns = precision[ix]
recall_rns = recall[ix]
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                          recall[ix]))

Best Threshold=0.683516, F-Score=0.717, Precision=0.720, Recall=0.715


In [183]:
df_rns = df.copy()
df_rns = pu_change(df_rns, 100)
df_rns = to_negative(df_rns)
y_pred_rns = use_model(df_rns)

precision, recall, thresholds = precision_recall_curve(create_test_sample(df_rns)['target'], y_pred_rns)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
fscore_rns = fscore[ix]
precision_rns = precision[ix]
recall_rns = recall[ix]
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                          recall[ix]))

Best Threshold=0.568681, F-Score=0.540, Precision=0.480, Recall=0.618
