In [1]:
import numpy as np
import pandas as pd


In [2]:
dataset_url = 'winequality-red.csv'
data = pd.read_csv(dataset_url)

In [3]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


data.isnull().sum()

In [4]:
data.shape

(1599, 12)

In [5]:
# plt.hist(data['quality'])
# plt.show()

In [6]:
data.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1594,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
1595,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
1598,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


In [7]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [9]:
x = data.iloc[:,[1,2,4,9,10]].values
y = data.iloc[:,11].values

In [10]:
x

array([[ 0.7  ,  0.   ,  0.076,  0.56 ,  9.4  ],
       [ 0.88 ,  0.   ,  0.098,  0.68 ,  9.8  ],
       [ 0.76 ,  0.04 ,  0.092,  0.65 ,  9.8  ],
       ...,
       [ 0.51 ,  0.13 ,  0.076,  0.75 , 11.   ],
       [ 0.645,  0.12 ,  0.075,  0.71 , 10.2  ],
       [ 0.31 ,  0.47 ,  0.067,  0.66 , 11.   ]])

In [11]:
x.shape


(1599, 5)

In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.25,random_state = 0)

In [13]:
x_train.shape,x_test.shape

((1199, 5), (400, 5))

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [16]:
def find_best_model(X,Y):
    models = {
        
        
        'decison_tree_classifier':{
            'model': DecisionTreeClassifier(splitter='best'),
            'parameters' :{
                'criterion' :['gini','entropy'],
                'max_depth' :[5,10]
            }
        },
        
        'random_forest': {
            'model': RandomForestClassifier(criterion= 'gini'),
            
            'parameters' :{
                'n_estimators': [1,5,10,15,20,30,40,50,60,70,80,90,100]
            }
        },
        
        'svc' : {
            'model' : SVC(gamma= 'auto'),
            
            'parameters': {
                'kernel' : ['rbf','linear'],
                'C': [1,10,20]
            }
        }
    }
    
    scores = []
    cv_shuffle = ShuffleSplit(n_splits=5,test_size= 0.2,random_state=0)
    
    for model_names,model_params in models.items():
        gc = GridSearchCV(model_params['model'],model_params['parameters'],cv = cv_shuffle,return_train_score= False)
        gc.fit(X,Y)
        scores.append({
            'model': model_names,
            'parameters' : gc.best_params_,
            'score' : gc.best_score_
        })
        
    return pd.DataFrame(scores, columns=['model','best_parameters','score'])

find_best_model(x_train, y_train)

Unnamed: 0,model,best_parameters,score
0,decison_tree_classifier,,0.5725
1,random_forest,,0.663333
2,svc,,0.5775


In [18]:
from sklearn.calibration import CalibratedClassifierCV
svm= SVC()
model = CalibratedClassifierCV(svm).fit(x_train, y_train)
# evaluate the test set
accuracy = model.score(x_test, y_test)
# log accuracy which is a single numerical value
print ("Accuracy is {}".format(accuracy))

Accuracy is 0.6075


In [19]:
lrmodel = LogisticRegression(random_state=0, solver='lbfgs').fit(x_train, y_train)
# evaluate the test set
lraccuracy = lrmodel.score(x_test, y_test)
# log accuracy which is a single numerical value
print ("Accuracy is {}".format(lraccuracy))

Accuracy is 0.6175


In [20]:
dtmodel =DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
# evaluate the test set
dtaccuracy = dtmodel.score(x_test, y_test)
# log accuracy which is a single numerical value
print ("Accuracy is {}".format(dtaccuracy))

Accuracy is 0.5975


In [22]:
rfmodel = RandomForestClassifier(n_estimators= 60).fit(x_train,y_train)
rfaccuracy = rfmodel.score(x_test,y_test)
print(rfaccuracy)

0.675


In [24]:
rfmodel.fit(x_train,y_train)

RandomForestClassifier(n_estimators=60)

In [25]:
y_pred = rfmodel.predict(x_test)

In [26]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)

In [27]:
cm

array([[  0,   0,   2,   0,   0,   0],
       [  0,   0,   9,   5,   0,   0],
       [  0,   1, 128,  33,   7,   0],
       [  0,   1,  35, 116,  18,   0],
       [  0,   0,   2,  14,  24,   0],
       [  0,   0,   0,   1,   4,   0]])

In [28]:
sum = 0
for i in range(5):
    for j in range(6):
        if(i == j):
            sum += cm[i][j]
            
sum / np.sum(cm)

0.67

In [30]:
import pickle
pickle.dump(rfmodel,open("Wine_quality_model.sav",'wb'))

In [31]:
file = open('model.pkl', 'wb')

# dump information to that file
pickle.dump(rfmodel, file)