In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import matplotlib as plt


# Load the white and red wine-quality data sets (semicolon-delimited CSV files).
df_white = pd.read_csv('winequality-white.csv', delimiter=';')
df_red = pd.read_csv('winequality-red.csv', delimiter=';')

# Label each wine by colour (1 = red, 0 = white); this is the target the
# classifier learns. Then normalise the column names to snake_case.
df_red['type'] = 1
df_red.columns = [col.lower().replace(' ', '_') for col in df_red.columns]

df_white['type'] = 0
df_white.columns = [col.lower().replace(' ', '_') for col in df_white.columns]

# Combine both data sets into a single frame, red rows first.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original index labels.
combined_df = pd.concat([df_red, df_white])

# Feature matrix (three physico-chemical measurements) and binary target.
X = combined_df[['alcohol', 'residual_sugar', 'volatile_acidity']].values
y = combined_df['type'].values

# Train/test split with a fixed seed so that re-running the notebook
# reproduces the same split (and therefore the same scores).
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = tts(X, y, random_state=RANDOM_SEED)

# Fit a plain logistic-regression classifier on the training portion.
logistic_model = LogisticRegression(solver='liblinear', max_iter=100)
logistic_model.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
# Predict the wine type for every row of the held-out test set.
predict = logistic_model.predict(X_test)

# Sanity check first: predictions and true labels must have matching
# lengths/shapes before they can be compared element by element.
print(len(y_test))
print(len(predict))
print(predict.shape)
print(y_test.shape)

# F1 score (harmonic mean of precision and recall) for the positive
# class, i.e. type == 1 (red wine).
print(f1_score(y_test, predict))



1625
1625
(1625,)
(1625,)
0.750741839763


An F1 score of 0.75 (computed for the positive class, red wine) is not great, but for such a simple model with only three features it is far better than a random guess.


In [19]:
# Classify three hand-made samples with the fitted model. Each row lists
# alcohol, residual sugar and volatile acidity, matching the column order
# of the training features.
sample_wines = np.array([
    [11.2, 2, .10],
    [11., 1.5, .9],
    [12., 6., 1],
])
logistic_model.predict(sample_wines)

array([0, 1, 1])

This means the first 'sample' is predicted to be a white wine (class 0), while both the second and third are predicted to be red (class 1).

In [20]:
# Logistic regression is probabilistic, so beyond the hard labels we can
# ask how confident the model is in each of the three predictions.
sample_wines = np.array([
    [11.2, 2, .10],
    [11., 1.5, .9],
    [12., 6., 1],
])
logistic_model.predict_proba(sample_wines)

array([[ 0.97123125,  0.02876875],
       [ 0.00373852,  0.99626148],
       [ 0.01004334,  0.98995666]])

Each row above holds the two class probabilities for one sample: the first value is the probability of class 0 (white) and the second is the probability of class 1 (red). The two values in a row sum to 1.
