# Import everything, train/test split data

In [45]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
wine = pd.read_csv("reduced_data_df.csv")

In [3]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
949,8.9,0.12,0.45,1.8,0.075,10.0,21.0,0.99552,3.41,0.76,11.9,1
591,6.6,0.39,0.49,1.7,0.07,23.0,149.0,0.9922,3.12,0.5,11.5,1
1512,6.4,0.79,0.04,2.2,0.061,11.0,17.0,0.99588,3.53,0.65,10.4,1
1535,7.0,0.55,0.13,2.2,0.075,15.0,35.0,0.9959,3.36,0.59,9.7,1
1485,7.0,0.655,0.16,2.1,0.074,8.0,25.0,0.99606,3.37,0.55,9.7,0


In [14]:
X = wine.iloc[:, 0:-1]
y = wine.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Set up

In [92]:
wine_scaler = StandardScaler()
wine_pca = PCA()
wine_xgb = xgb.XGBClassifier()
wine_RFC = RandomForestClassifier(n_estimators=100)
lr_wine = LogisticRegression(solver='liblinear')

## Pipe 1: XGBoost

In [93]:
pipe = Pipeline(steps=[('scaler', wine_scaler),
#                        ('pca', wine_pca), 
                       ('xgb', wine_xgb)])
pipe_params = {
#     'pca__n_components':[7, 8, 9, 10],
    'xgb__max_depth':[2, 3, 4],
    'xgb__n_estimators':[25, 50]
}

gs = GridSearchCV(pipe, pipe_params, cv=3)


In [94]:
gs.fit(X_train, y_train)
gs.score(X_test, y_test)

0.75

## Pipe 2: Random Forest

In [95]:
pipe2 = Pipeline(steps=[('scaler', wine_scaler), 
                        ('rfc', wine_RFC)])

pipe2_params = {
    'rfc__max_depth':[2, 3, 4, 5],
    'rfc__criterion':['gini', 'entropy'],
    'rfc__n_estimators':[100]
}

gs2 = GridSearchCV(pipe2, pipe2_params, cv=5)

In [96]:
gs2.fit(X_train, y_train);
gs3.score(X_test, y_test)

0.69

## Pipe 3: Logistic Regression

In [97]:
pipe3 = Pipeline(steps=[('scaler', wine_scaler), ('lr', lr_wine)])

pipe3_params = {
    'lr__penalty':['l1', 'l2'],
    'lr__C':[.01, .1, 1, 10, 100]
}

gs3 = GridSearchCV(pipe3, pipe3_params, cv=3)
gs3.fit(X_train, y_train);

In [98]:
gs3.score(X_test, y_test)

0.69

# Predicting Unseen Data

In [99]:
holdout_df = pd.read_csv('holdout_df.csv')

In [100]:
holdout_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1363,8.0,0.83,0.27,2.0,0.08,11.0,63.0,0.99652,3.29,0.48,9.8
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
739,9.0,0.69,0.0,2.4,0.088,19.0,38.0,0.999,3.35,0.6,9.3
1366,7.3,0.74,0.08,1.7,0.094,10.0,45.0,0.99576,3.24,0.5,9.8
901,7.4,0.635,0.1,2.4,0.08,16.0,33.0,0.99736,3.58,0.69,10.8


In [102]:
gs.fit(X, y)
gs.predict(holdout_df)

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0])

AttributeError: 'XGBClassifier' object has no attribute 'transform'