In [None]:
# https://www.kaggle.com/competitions/cyprus-ai-camp-the-enigma-of-wine-quality

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# All competition files live in one Kaggle input directory; keeping the path in
# a single constant means it only has to be changed in one place.
DATA_DIR = "/kaggle/input/cyprus-ai-camp-the-enigma-of-wine-quality"

# Labelled splits.
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
valid = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))

# Test splits, shipped as separate public and private files.
test1 = pd.read_csv(os.path.join(DATA_DIR, "public_test.csv"))
test2 = pd.read_csv(os.path.join(DATA_DIR, "private_test.csv"))

# Template for the submission file.
subm = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

# Last expression of the cell: display the (rows, columns) shape of every frame.
train.shape, valid.shape, test1.shape, test2.shape, subm.shape

((1279, 12), (320, 12), (160, 12), (160, 12), (320, 2))

In [3]:
# Column dtypes and non-null counts for the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1279 entries, 0 to 1278
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1279 non-null   float64
 1   volatile acidity      1279 non-null   float64
 2   citric acid           1279 non-null   float64
 3   residual sugar        1279 non-null   float64
 4   chlorides             1279 non-null   float64
 5   free sulfur dioxide   1279 non-null   float64
 6   total sulfur dioxide  1279 non-null   float64
 7   density               1279 non-null   float64
 8   pH                    1279 non-null   float64
 9   sulphates             1279 non-null   float64
 10  alcohol               1279 non-null   float64
 11  quality               1279 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 120.0 KB


In [4]:
# Column dtypes and non-null counts for the validation split.
valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         320 non-null    float64
 1   volatile acidity      320 non-null    float64
 2   citric acid           320 non-null    float64
 3   residual sugar        320 non-null    float64
 4   chlorides             320 non-null    float64
 5   free sulfur dioxide   320 non-null    float64
 6   total sulfur dioxide  320 non-null    float64
 7   density               320 non-null    float64
 8   pH                    320 non-null    float64
 9   sulphates             320 non-null    float64
 10  alcohol               320 non-null    float64
 11  quality               320 non-null    int64  
dtypes: float64(11), int64(1)
memory usage: 30.1 KB


In [5]:
# Column dtypes and non-null counts for the public test split
# (it carries an `Id` column and has no `quality` column).
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    160 non-null    int64  
 1   fixed acidity         160 non-null    float64
 2   volatile acidity      160 non-null    float64
 3   citric acid           160 non-null    float64
 4   residual sugar        160 non-null    float64
 5   chlorides             160 non-null    float64
 6   free sulfur dioxide   160 non-null    float64
 7   total sulfur dioxide  160 non-null    float64
 8   density               160 non-null    float64
 9   pH                    160 non-null    float64
 10  sulphates             160 non-null    float64
 11  alcohol               160 non-null    float64
dtypes: float64(11), int64(1)
memory usage: 15.1 KB


In [6]:
# Column dtypes and non-null counts for the private test split
# (it carries an `Id` column and has no `quality` column).
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    160 non-null    int64  
 1   fixed acidity         160 non-null    float64
 2   volatile acidity      160 non-null    float64
 3   citric acid           160 non-null    float64
 4   residual sugar        160 non-null    float64
 5   chlorides             160 non-null    float64
 6   free sulfur dioxide   160 non-null    float64
 7   total sulfur dioxide  160 non-null    float64
 8   density               160 non-null    float64
 9   pH                    160 non-null    float64
 10  sulphates             160 non-null    float64
 11  alcohol               160 non-null    float64
dtypes: float64(11), int64(1)
memory usage: 15.1 KB


In [7]:
# Peek at the first rows of the training split.
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.7,0.855,0.02,1.9,0.064,29.0,38.0,0.99472,3.3,0.56,10.75,6
1,6.9,0.63,0.33,6.7,0.235,66.0,115.0,0.99787,3.22,0.56,9.5,5
2,11.9,0.57,0.5,2.6,0.082,6.0,32.0,1.0006,3.12,0.78,10.7,6
3,8.6,0.47,0.27,2.3,0.055,14.0,28.0,0.99516,3.18,0.8,11.2,5
4,10.4,0.26,0.48,1.9,0.066,6.0,10.0,0.99724,3.33,0.87,10.9,6


In [8]:
# Label column; every other column of the training frame is a model input.
target_col = 'quality'
features = [col for col in train.columns if col != target_col]

In [9]:
# Independent feature-only copies of the validation, public-test and
# private-test frames, so columns can be added without touching the originals.
df1, df2, df3 = (frame[features].copy() for frame in (valid, test1, test2))

In [10]:
# Build one string key per row by joining the stringified feature values.
# The key is derived from `features` only (not from every current column), so
# re-running this cell does not fold a previously created `key` column into
# the new key.
for frame in (df1, df2, df3):
    frame['key'] = frame[features].astype(str).agg('-'.join, axis=1)

# Boolean mask per test row: does an identical feature row exist in val.csv?
rows_present2 = df2['key'].isin(df1['key'])
rows_present3 = df3['key'].isin(df1['key'])

# The mean of a boolean mask is the fraction of matching rows.
print(f'Public test data present in val.csv: {rows_present2.mean()*100}%')
print(f'Private test data present in val.csv: {rows_present3.mean()*100}%')

Public test data present in val.csv: 100.0%
Private test data present in val.csv: 100.0%


In [11]:
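# MASSIVE data leak: every row of both the public and the private test set
# also appears in val.csv, which includes the `quality` label.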

In [12]:
# Every test row also exists in val.csv, so its label can be read back by
# looking up the row key instead of training a model.

# Map each val.csv row key to its quality label. `setdefault` keeps the first
# occurrence, which matches a top-to-bottom scan when identical feature rows
# repeat. `.tolist()` yields plain Python ints.
quality_by_key = {}
for key, quality in zip(df1['key'], valid[target_col].tolist()):
    quality_by_key.setdefault(key, quality)

# Public test rows first, then private test rows.
answers = []
for test_keys in (df2['key'], df3['key']):
    # A key that is missing from val.csv raises KeyError right here, so a test
    # row can never be dropped without notice.
    answers.extend(quality_by_key[key] for key in tqdm(test_keys))

# NOTE(review): Ids are assumed to be 0..N-1 in public-then-private order —
# confirm against the `Id` columns of the test files / sample_submission.csv.
subm = pd.DataFrame({
    'Id': range(len(answers)),
    'quality': answers
})

subm.to_csv("submission.csv", index=False)

subm.head()

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

Unnamed: 0,Id,quality
0,0,5
1,1,6
2,2,5
3,3,6
4,4,6
