In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline


In [2]:
df=pd.read_json('../input/train.json')
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
cuisine        39774 non-null object
id             39774 non-null int64
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [4]:
import pandas_profiling
pandas_profiling.ProfileReport(df)

0,1
Number of variables,3
Number of observations,39774
Total Missing (%),0.0%
Total size in memory,932.3 KiB
Average record size in memory,24.0 B

0,1
Numeric,1
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,1

0,1
Distinct count,20
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
italian,7838
mexican,6438
southern_us,4320
Other values (17),21178

Value,Count,Frequency (%),Unnamed: 3
italian,7838,19.7%,
mexican,6438,16.2%,
southern_us,4320,10.9%,
indian,3003,7.6%,
chinese,2673,6.7%,
french,2646,6.7%,
cajun_creole,1546,3.9%,
thai,1539,3.9%,
japanese,1423,3.6%,
greek,1175,3.0%,

0,1
Distinct count,39774
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,24850
Minimum,0
Maximum,49717
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,2466.7
Q1,12398.0
Median,24887.0
Q3,37328.0
95-th percentile,47177.0
Maximum,49717.0
Range,49717.0
Interquartile range,24930.0

0,1
Standard deviation,14360
Coef of variation,0.57788
Kurtosis,-1.2047
Mean,24850
MAD,12446
Skewness,-0.0031285
Sum,988365483
Variance,206210000
Memory size,310.8 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
11663,1,0.0%,
44447,1,0.0%,
42398,1,0.0%,
48541,1,0.0%,
46492,1,0.0%,
36251,1,0.0%,
34202,1,0.0%,
38296,1,0.0%,
9614,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
49712,1,0.0%,
49713,1,0.0%,
49714,1,0.0%,
49716,1,0.0%,
49717,1,0.0%,

Unsupported value

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
f, ax = plt.subplots(figsize=(5,6))
sns.countplot(y = 'cuisine', 
                   data = df,
                  order = df.cuisine.value_counts(ascending=False).index)

<matplotlib.axes._subplots.AxesSubplot at 0x7fa436888198>

# Analysis Of Ingrediants

In [6]:
ingredients_individual = Counter([ingredient for ingredient_list in df.ingredients for ingredient in ingredient_list])
ingredients_individual = pd.DataFrame.from_dict(ingredients_individual,orient='index').reset_index()


ingredients_individual = ingredients_individual.rename(columns={'index':'Ingredient', 0:'Count'})

#Most common ingredients
sns.barplot(x = 'Count', 
            y = 'Ingredient',
            data = ingredients_individual.sort_values('Count', ascending=False).head(20))

<matplotlib.axes._subplots.AxesSubplot at 0x7fa436888198>

In [7]:
df.ingredients

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
5        [plain flour, sugar, butter, eggs, fresh ginge...
6        [olive oil, salt, medium shrimp, pepper, garli...
7        [sugar, pistachio nuts, white almond bark, flo...
8        [olive oil, purple onion, fresh pineapple, por...
9        [chopped tomatoes, fresh basil, garlic, extra-...
10       [pimentos, sweet pepper, dried oregano, olive ...
11       [low sodium soy sauce, fresh ginger, dry musta...
12       [Italian parsley leaves, walnuts, hot red pepp...
13       [ground cinnamon, fresh cilantro, chili powder...
14       [fresh parmesan cheese, butter, all-purpose fl...
15       [tumeric, vegetable stock, tomatoes, garam mas...
16       [greek yogurt, lemon curd, confectioners sugar.

In [8]:
label = df.cuisine

features = df.drop(['cuisine'], axis = 1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.20, random_state =0) 

In [10]:
train_ingredients_text = X_train.ingredients.apply(lambda s: ' '.join(w.lower() for w in s)).str.replace('[^\w\s]','')
test_ingredients_text = X_test.ingredients.apply(lambda s: ' '.join(w.lower() for w in s)).str.replace('[^\w\s]','')  

# Term Frequency, Inverse Document Frequency

In [11]:
tfidf = TfidfVectorizer(
    min_df = 3,
    max_df = 0.95,
    stop_words = 'english'
)

tfidf.fit(train_ingredients_text)
text = tfidf.transform(train_ingredients_text)
text

<31819x2070 sparse matrix of type '<class 'numpy.float64'>'
	with 594711 stored elements in Compressed Sparse Row format>

In [12]:
traintext = tfidf.transform(test_ingredients_text)

# Random Forest Classifier (Ensemble Learning)

In [13]:
clf = RandomForestClassifier(n_estimators=100, max_depth=16,random_state=0)
clf.fit(text, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Random Forest Train Data Accuracy

In [14]:
y_pred= clf.predict(traintext)

In [15]:
accuracy_score(y_test,y_pred)*100 

60.84223758642363

In [16]:
y_pred=clf.predict(text)
accuracy_score(y_train,y_pred)*100 

66.22458279644238

In [17]:
clf.score(text,y_train)

0.6622458279644238

### Random Forest Test Data Accuracy

In [18]:
clf.score(traintext,y_test)

0.6084223758642363

# Decision Tree Classifier

In [19]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,max_depth=16, min_samples_leaf=5) 
clf_gini.fit(text, y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=16,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

### Decision Tree Training Data Score

In [20]:
clf_gini.score(text,y_train)

0.5851220968603664

### Decision Tree Test Data Score

In [21]:
clf_gini.score(traintext,y_test)

0.5399120050282841

## Cross Validation for Decision Tree

In [22]:
crossvalidation = df.ingredients.apply(lambda s: ' '.join(w.lower() for w in s)).str.replace('[^\w\s]','')
text1 = tfidf.transform(crossvalidation)
scores = cross_val_score(clf_gini, text1, label, cv=5)
scores

array([0.53202713, 0.5458658 , 0.54034691, 0.54357942, 0.5448597 ])

## Cross Validation for Random Forest

In [23]:
scores = cross_val_score(clf, text1, label, cv=5)
scores

array([0.59721176, 0.60982659, 0.60419809, 0.61048925, 0.60802819])

# For Testing on the test data provided by kaggle

In [24]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
# parameters = {'C': np.arange(1, 100, 5)}
model = LinearSVC()
# model = LogisticRegression(multi_class='multinomial')
# model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
# model = SVC()

model = OneVsRestClassifier(model)
# model = BaggingRegressor(model, n_estimators=100)
# model = GridSearchCV(model, parameters, n_jobs=-1, verbose=2, cv=3)

print(cross_val_score(model, text, y_train, cv=3)) 



[0.78194497 0.78108796 0.77839623]


In [25]:
model.fit(text, y_train)
model.score(traintext, y_test)

0.7927089880578253

In [26]:
df1=pd.read_json("../input/test.json")
df1.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [27]:
predicting = df1.ingredients.apply(lambda s: ' '.join(w.lower() for w in s)).str.replace('[^\w\s]','')
textpre = tfidf.transform(predicting)
predicted= model.predict(textpre)

In [28]:
print(predicted)

['british' 'southern_us' 'italian' ... 'italian' 'southern_us' 'mexican']


In [29]:
sub=pd.read_csv("../input/sample_submission.csv")
sub.head()
del sub['cuisine']
sub.head()

Unnamed: 0,id
0,35203
1,17600
2,35200
3,17602
4,17605


In [30]:
sub['cuisine']=predicted
sub.head()

Unnamed: 0,id,cuisine
0,35203,british
1,17600,southern_us
2,35200,italian
3,17602,cajun_creole
4,17605,italian


In [31]:
sub.to_csv("Submission.csv",index=False)
