In [6]:
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
%matplotlib inline

In [7]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')
# Dropping the null values
data.dropna(inplace=True,axis=1)
data.head()

Unnamed: 0,title,rating,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,30 days of groceries,advance prep required,alabama,alaska,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Trial 1 Modelling SVR with all the features but without null values

In [3]:
# Modelling SVR
svr = SVR()
X = data.drop(['rating', 'title'], 1)
Y = data.rating
svr.fit(X,Y)
svr.score(X, Y)

0.038565706512988962

Removing the null values but keeping the nutritional information features helped a little bit in the accuracy of the model when compared to the model in the unit lesson which was will null values. But still the model is very poor.

## Trial 2 - Adding a feature for the outcome variable which summarises the rating

In [8]:
# Adding a new feature
# Based on a decision that rating lower than 2.5 considered bad and greater considered good
data['rating_good']=np.where(data['rating'] > 2.5, 1, 0)

# Modelling SVR
svr = SVR()
X = data.drop(['rating', 'title'], 1)
Y = data.rating
svr.fit(X,Y)
svr.score(X, Y)

0.78487307641425386

Adding a single feature changed the model accuracy from 0.03 to 0.78 accuracy.

## Trial 3 - Reducing the number of features by PCA

In [None]:
Y = data.rating
for n in range (10, 160, 50):
    pca = PCA(n_components=n)
    X = data.drop(['rating', 'title'], 1)
    pca.fit(X)
    X=pca.transform(X)
    svr = SVR()
    svr.fit(X,Y)
    score=cross_val_score(svr, X, Y, cv=5)
    print ('The no. of PCA components %d and accuracy is  %0.2f (+/- %0.2f)' % (n, score.mean(), score.std() * 2) )

The no. of PCA components 10 and accuracy is  0.42 (+/- 0.01)
The no. of PCA components 60 and accuracy is  0.78 (+/- 0.01)
The no. of PCA components 110 and accuracy is  0.78 (+/- 0.01)


The PCA could help to quite an extent in improving the model from 43% accuracy to 78%. Basically it could equal the accuracy of modelling of 676 features with as little as 60 features i.e reduction or cutdown by 100 times in the number of features.

## Trial 3  - Changing the model from Regression to Classifier

We can change the SV Regression to classifier since the rating is a classification problem.

We can change the outcome variable, rating into a binary classifier rather than multiple classifier to make the model simple and accurate.

In [None]:
X = data.drop(['rating', 'rating_good', 'title'], 1)
Y = data.rating_good
svc = SVC()
svc.fit(X,Y)
score=cross_val_score(svc, X, Y, cv=5)
print ('accuracy is  %0.2f (+/- %0.2f)' % (score.mean(), score.std() * 2) )

accuracy is  0.87 (+/- 0.00)


Converting the model from Regression to Classifier improved model from 78% to 87%.

## Trial 4 - Feature selection brings down to 30 most valuable features

Using Feature selection methods to bring down the features to the 30 most value features

In [9]:
# Using selectKBest feature selection method to select important features
X = data.drop(['rating', 'rating_good', 'title'], 1)
Y = data.rating_good

# feature extraction
test = SelectKBest(score_func=chi2, k=30)
fit = test.fit(X, Y)
# summarize scores and select the best 30 columns or features
np.set_printoptions(precision=3)
dic=dict()
for i, name in enumerate(X.columns):
    dic[name]=fit.scores_[i]
cols_selected=sorted(dic, key=dic.__getitem__, reverse=True)
print ('The most important columns are', cols_selected[0:29])

X=data[cols_selected[0:29]]
svc = SVC()
svc.fit(X,Y)
score=cross_val_score(svc, X, Y, cv=5)
print ('\nAccuracy is  %0.2f (+/- %0.2f)' % (score.mean(), score.std() * 2))

The most important columns are ['drink', 'alcoholic', 'house & garden', 'gin', 'cocktail', 'spirit', 'cocktail party', 'bitters', 'harpercollins', 'liqueur', 'sauté', 'non-alcoholic', 'bon appétit', 'rum', 'condiment', 'bake', 'fall', 'brandy', 'créme de cacao', 'weelicious', 'fortified wine', 'roast', 'peanut free', 'quick & easy', 'soy free', 'pernod', 'winter', 'fruit juice', 'chartreuse']

Accuracy is  0.88 (+/- 0.00)


By bringing the features down to 30 from 676 features the accuracy of the model improved from 78% when it was 676 features to 88% with 30 features......and the also the run time and memory burden has reduced quite a bit.

In [10]:
X = data.drop(['rating', 'rating_good', 'title'], 1)
Y = data.rating_good

## Trial 5 -  Since PCA and SV Classifier separately increased accuracy.....tried them in combination

In [None]:
Y = data.rating_good
for n in range (1, 50, 10):
    pca = PCA(n_components=n)
    X = data.drop(['rating', 'rating_good', 'title'], 1)
    pca.fit(X)
    X=pca.transform(X)
    svc = SVC()
    svc.fit(X,Y)
    score=cross_val_score(svc, X, Y, cv=5)
    print ('The no. of PCA components %d and accuracy is  %0.2f (+/- %0.2f)' % (n, score.mean(), score.std() * 2) )

The no. of PCA components 1 and accuracy is  0.87 (+/- 0.00)
The no. of PCA components 11 and accuracy is  0.87 (+/- 0.00)
The no. of PCA components 21 and accuracy is  0.87 (+/- 0.00)
The no. of PCA components 31 and accuracy is  0.87 (+/- 0.00)
The no. of PCA components 41 and accuracy is  0.87 (+/- 0.00)


With as less as PCA components of even one component an accuracy of 87% could be achieved i.e reduction of features from 676 to one componnent with higher accuracy.

## Trial 5 -  Since feature selection and PCA both had a positive effect, combined the PCA on selected features by method.

In [12]:
# Using selectKBest feature selection method to select important features

print ('The most important columns are', cols_selected[0:29])

for n in range (1, 20, 5):
    pca = PCA(n_components=n)
    X=data[cols_selected[0:29]]
    pca.fit(X)
    X=pca.transform(X)
    svc = SVC()
    svc.fit(X,Y)
    score=cross_val_score(svc, X, Y, cv=5)
    print ('The no. of PCA components %d and accuracy is  %0.2f (+/- %0.2f)' % (n, score.mean(), score.std() * 2) )

The most important columns are ['drink', 'alcoholic', 'house & garden', 'gin', 'cocktail', 'spirit', 'cocktail party', 'bitters', 'harpercollins', 'liqueur', 'sauté', 'non-alcoholic', 'bon appétit', 'rum', 'condiment', 'bake', 'fall', 'brandy', 'créme de cacao', 'weelicious', 'fortified wine', 'roast', 'peanut free', 'quick & easy', 'soy free', 'pernod', 'winter', 'fruit juice', 'chartreuse']
The no. of PCA components 1 and accuracy is  0.87 (+/- 0.00)
The no. of PCA components 6 and accuracy is  0.87 (+/- 0.00)
The no. of PCA components 11 and accuracy is  0.88 (+/- 0.00)
The no. of PCA components 16 and accuracy is  0.88 (+/- 0.00)


The accuracy could not be improved beyond 87% by SVC.