<a href="https://colab.research.google.com/github/a-git-b/PredictFoodRatingsUsingML/blob/master/21f1003220_notebook_t12024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'recipe-for-rating-predict-food-ratings-using-ml:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F67079%2F7452256%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240705%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240705T185241Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D85ed46996eff2af5a91780758f014ec81c2c882d4092df44f5efbc1b07425b0b0dd56b3bd53d8df1e44d7d90104b6fad2cdffb2fa708f3ac7fe6b5596a3610cbde13b89e467fcece710aa3ef2a496660b15845a3c77d13c3dea9896e72e4ce5e17317c2441473c2093d7a89b9b39017fa570a2889237474cb7476c81f6ad053ce965524a580eb291857b45679013d68d04fb068ec6ac832fada3a3af032607fefdc19593633dfbe087b799b72a12aba20704e443b3f2cf7279a521f0377093d3c3699c21029d764166aca2ae91da9b415cd1e52e99148dce29b46a908f81c886bf95b0840195a069c89ab1c77460eb639fce5248d4960db4f1f1472612d954d3'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the Kaggle directories as ../input and ../working.
# A link that already exists (e.g. from an earlier run of this cell) is left untouched.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
  try:
    os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
  except FileExistsError:
    pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    if not data_source_mapping:
        # An empty mapping (or a stray comma) describes nothing to download.
        continue
    # Split on the first ':' only so a URL that still contains a literal ':' stays intact.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent; the bar then stays empty
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            # Read fixed-size chunks until the response is exhausted (empty bytes).
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by name
            # and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the opened archive; binding it to `tarfile`
                # would replace the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is caught before OSError (its base class) to give the more specific hint.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading recipe-for-rating-predict-food-ratings-using-ml, 2426532 bytes compressed
Downloaded and uncompressed: recipe-for-rating-predict-food-ratings-using-ml
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/sample.csv
/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv
/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.svm import SVC
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE

## Loading Data

In [3]:
# training data
train = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')

In [None]:
# test data
test = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv')

In [None]:
train.head()

In [None]:
train.shape

## Handling Missing Values

In [None]:
train.isnull().sum()

In [None]:
train = train.dropna()

In [None]:
train.isnull().sum()

In [None]:
train.shape

## Exploring Data

In [None]:
train['RecipeNumber'].value_counts()

In [None]:
train['RecipeCode'].value_counts()

In [None]:
train['Rating'].value_counts()

In [None]:
# Class distribution of the target. The frame is passed by keyword because
# seaborn releases before 0.12 do not accept `data` as a positional argument.
sns.countplot(data=train, x='Rating')

In [None]:
train[train['Rating']==0]

In [None]:
sel_features = ['UserReputation','ReplyCount','ThumbsUpCount','ThumbsDownCount','BestScore']

In [None]:
train.shape

In [None]:
train[sel_features+['Rating']].corr()

In [None]:
sns.heatmap(train[sel_features+['Rating']].corr())

In [None]:
test.head()

In [None]:
test.shape

In [None]:
test.isnull().sum()

In [None]:
test_sel = test[sel_features]

In [None]:
test_sel.head()

In [None]:
Y = train['Rating']

In [None]:
Y.shape

In [None]:
X = train[sel_features+['Recipe_Review']]

In [None]:
X.shape

In [None]:
X.head()

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:
X_train.shape

In [None]:
y_train.shape

## Text Preprocessing

In [None]:
def preprocess_text_data(values):
    """Normalise raw review texts before TF-IDF vectorisation.

    Every value is converted with ``str`` (so a missing value such as NaN
    becomes the text ``"nan"`` instead of raising), lower-cased, has common
    English contractions expanded, and is finally reduced to the letters a-z
    separated by single spaces.

    Parameters
    ----------
    values : iterable
        Review texts; any object accepted by ``str``.

    Returns
    -------
    list of str
        One cleaned string per input value, in input order.
    """
    # Contractions are expanded while the apostrophe is still present; stripping
    # apostrophes first would make these patterns unmatchable. The irregular
    # forms come before the generic "n't" rule so "can't" does not become "ca not".
    contractions = (
        ("can't", 'can not'), ("won't", 'will not'), ("n't", ' not'),
        ("'re", ' are'), ("'ve", ' have'), ("'m", ' am'), ("'s", ' is'),
        # Spellings typed without an apostrophe.
        ('doesnt', 'does not'), ('wasnt', 'was not'), ('didnt', 'did not'),
    )
    preprocessed_text = []
    for sen in values:
        sen = str(sen).lower()
        for pattern, expansion in contractions:
            sen = sen.replace(pattern, expansion)
        # Literal two-character "\r" / "\n" sequences are treated as line breaks.
        sen = re.sub(r'\\[rn]', ' ', sen)
        # Drop remaining apostrophes so a word like "o'clock" stays a single token.
        sen = sen.replace("'", '')
        # Any other run of non-letters (digits, punctuation, real line breaks, ...)
        # becomes one space, so neighbouring words are never glued together.
        sen = re.sub(r'[^a-z]+', ' ', sen)
        preprocessed_text.append(sen.strip())
    return preprocessed_text

In [None]:
# Build a new frame rather than assigning into the slice returned by
# train_test_split, which can trigger pandas' chained-assignment warning.
X_train = X_train.assign(Recipe_Review=preprocess_text_data(X_train['Recipe_Review']))

In [None]:
tfidf = TfidfVectorizer(stop_words="english",min_df=70)

rev_train = tfidf.fit_transform(X_train['Recipe_Review'])

In [None]:
tfidf.vocabulary_.keys()


In [None]:
rev_train.shape

In [None]:
# The vectorizer was fitted on cleaned training reviews, so the validation
# reviews must go through the same cleaning before being transformed.
rev_test = tfidf.transform(preprocess_text_data(X_test['Recipe_Review']))

In [None]:
rev_test.shape

In [None]:
# Clean the submission reviews exactly like the training reviews; the cleaning
# step also converts every value with str(), so missing reviews become text.
rev_predict = tfidf.transform(preprocess_text_data(test['Recipe_Review']))

In [None]:
rev_predict.shape

## Data Scaling

In [None]:
mms = MinMaxScaler()

In [None]:
X_train_normalized = pd.DataFrame(mms.fit_transform(X_train[sel_features]))

In [None]:
X_train_normalized.describe()

In [None]:
X_test_normalized = pd.DataFrame(mms.transform(X_test[sel_features]))

In [None]:
X_predict_normalized = pd.DataFrame(mms.transform(test[sel_features]))

In [None]:
X_rec_train = np.hstack((rev_train.toarray(),X_train_normalized))

In [None]:
X_rec_test = np.hstack((rev_test.toarray(),X_test_normalized))

In [None]:
X_rec_predict = np.hstack((rev_predict.toarray(),X_predict_normalized))

In [None]:
X_rec_train.shape

In [None]:
X_rec_test.shape

## Applying SMOTE

In [None]:
np.unique(y_train,return_counts=True)

In [None]:
sm = SMOTE(random_state=42)

In [None]:
X_train_smote, y_train_smote = sm.fit_resample(X_rec_train,y_train)

In [None]:
np.unique(y_train_smote,return_counts=True)

In [None]:
LG = LogisticRegression()

In [None]:
LG.fit(X_train_smote,y_train_smote)

In [None]:
## predicting actual training data
y_predict = LG.predict(X_train_smote)
accuracy_score(y_train_smote,y_predict)

In [None]:
# predicting test data
y_predict = LG.predict(X_rec_test)
accuracy_score(y_test,y_predict)

In [None]:
## Showing underfitting after applying smote

## Without SMOTE

In [None]:
LG1 = LogisticRegression()

In [None]:
LG1.fit(X_rec_train,y_train)

In [None]:
## predicting actual training data
y_predict = LG1.predict(X_rec_train)
accuracy_score(y_train,y_predict)

In [None]:
## predicting the held-out test split (X_rec_test / y_test), not the training data
y_predict = LG1.predict(X_rec_test)
accuracy_score(y_test,y_predict)

## Hyperparameter Tuning

In [None]:
LG2 = LogisticRegression(max_iter=10000,class_weight="balanced")

In [None]:
cross_score_LG2 = cross_val_score(LG2,X_rec_train,y_train,scoring='accuracy',cv=5)

In [None]:
cross_score_LG2.mean()

In [None]:
LG3 = LogisticRegression(solver='liblinear',max_iter=10000,class_weight="balanced")

In [None]:
cross_score_LG3 = cross_val_score(LG3,X_rec_train,y_train,scoring='accuracy',cv=5)

In [None]:
cross_score_LG3.mean()

In [None]:
LG4 = LogisticRegression(solver='liblinear',max_iter=10000)

In [None]:
cross_score_LG4 = cross_val_score(LG4,X_rec_train,y_train,scoring='accuracy',cv=5)

In [None]:
cross_score_LG4.mean()

In [None]:
## LG4 is better

## Training Other Models


In [None]:
# Fixed seed so the fitted tree (and its scores) are the same on every run.
dtc = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
dtc.fit(X_rec_train,y_train)

In [None]:
y_predict_train = dtc.predict(X_rec_train)
accuracy_score(y_train,y_predict_train)

In [None]:
y_predict = dtc.predict(X_rec_test)
accuracy_score(y_test,y_predict)

In [None]:
# Fixed seed: bootstrap sampling and feature selection are random, so without
# it the accuracy changes on every run.
rfc = RandomForestClassifier(class_weight="balanced", random_state=42)

In [None]:
rfc.fit(X_rec_train,y_train)

In [None]:
y_predict_train = rfc.predict(X_rec_train)
accuracy_score(y_train,y_predict_train)

In [None]:
y_predict = rfc.predict(X_rec_test)
accuracy_score(y_test,y_predict)

In [None]:
# Fixed seed: SGD shuffles the training data, so results vary between runs without it.
sgd = SGDClassifier(loss='hinge', random_state=42)

In [None]:
sgd.fit(X_rec_train,y_train)

In [None]:
y_predict_train = sgd.predict(X_rec_train)
accuracy_score(y_train,y_predict_train)

In [None]:
y_predict = sgd.predict(X_rec_test)
accuracy_score(y_test,y_predict)

In [None]:
svc = SVC(gamma="auto")

In [None]:
svc.fit(X_rec_train,y_train)

In [None]:
y_predict_train = svc.predict(X_rec_train)
accuracy_score(y_train,y_predict_train)

In [None]:
y_predict_test = svc.predict(X_rec_test)
accuracy_score(y_test,y_predict_test)

In [None]:
xgbc = xgb.XGBClassifier()

In [None]:
xgbc.fit(X_rec_train,y_train)

In [None]:
y_predict_train = xgbc.predict(X_rec_train)
accuracy_score(y_train,y_predict_train)

In [None]:
y_predict_test = xgbc.predict(X_rec_test)
accuracy_score(y_test,y_predict_test)

In [None]:
y_prediction = xgbc.predict(X_rec_predict)

In [None]:
y_prediction.shape

In [None]:
# Assemble the submission file: IDs run from 1 to N in prediction order.
row_ids = list(range(1, len(y_prediction) + 1))
submission = pd.DataFrame({'ID': row_ids, 'Rating': y_prediction})
submission.to_csv('submission.csv', index=False)