In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Read the training and test CSV files from the /content directory into
# two separate DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

In [None]:
# Summarise the raw training frame: column names, dtypes and non-null counts.
train_data.info()

In [None]:
# (rows, columns) of the raw training frame.
train_data.shape

In [None]:
# Preview the first five rows of the raw training data.
train_data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Remove, in a single call, the training columns that are not used as model
# features. The frame is modified in place, so this cell can only be run once
# per load of the data (a second run raises KeyError on the missing columns).
train_data.drop(
    columns=[
        'RecipeNumber',
        'CommentID',
        'UserReputation',
        'ReplyCount',
        'ThumbsUpCount',
        'ThumbsDownCount',
        'BestScore',
        'CreationTimestamp',
        'UserName',
    ],
    inplace=True,
)

In [None]:
# train_data.drop('RecipeName',axis=1,inplace=True)

In [None]:
# Re-check column names, dtypes and non-null counts of the training frame
# at this point in the notebook.
train_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe()

In [None]:
# Preview the first five rows of the training frame in its current state.
train_data.head()

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Discard, in place, every training row that has a missing value in any of
# the columns listed here.
required_cols = ['ID','RecipeCode','RecipeName','UserID','Recipe_Review', 'Rating']
train_data.dropna(subset=required_cols, inplace=True)

In [None]:
# Missing-value count per training column; useful as a check that rows with
# missing values have been removed.
train_data.isnull().sum()

In [None]:
# Number of training rows per distinct Rating value (the class distribution
# of the column later used as the prediction target).
train_data['Rating'].value_counts()

In [None]:
# NOTE: this reads the same file that was already loaded into `test_data` in
# the first data-loading cell; `test_data` is rebound to a fresh copy from disk.
test_data = pd.read_csv('/content/test.csv')

In [None]:
# Column names, dtypes and non-null counts of the raw test frame.
test_data.info()

In [None]:
# Remove from the test frame, in a single call, the same set of columns that
# is dropped from the training frame. The frame is modified in place, so this
# cell can only be run once per load of the data.
test_data.drop(
    columns=[
        'RecipeNumber',
        'CommentID',
        'UserName',
        'UserReputation',
        'CreationTimestamp',
        'ReplyCount',
        'ThumbsUpCount',
        'ThumbsDownCount',
        'BestScore',
    ],
    inplace=True,
)

In [None]:
# test_data.drop('RecipeName',axis=1,inplace=True)

In [None]:
# Re-check column names, dtypes and non-null counts of the test frame at this
# point in the notebook.
test_data.info()

In [None]:
# Number of missing values in each test column.
# NOTE(review): no visible cell drops or imputes missing test values - confirm
# the counts shown here are all zero before the test frame is transformed.
test_data.isnull().sum()

In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Names of the object-dtype columns of the training frame.
object_columns_frame = train_data.select_dtypes(include=['object'])
cat_cols = object_columns_frame.columns
print(cat_cols)

In [None]:
# Names of the int64/float64 columns of the training frame. Other numeric
# dtypes (e.g. int32) are not matched by this selection.
numeric_dtypes = ['int64', 'float64']
num_cols = train_data.select_dtypes(include=numeric_dtypes).columns
print(num_cols)

In [None]:
# Draw one 20-bin histogram per numeric column, each on its own 8x6 figure.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(train_data[col], bins=20, color='skyblue', edgecolor='black')
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

In [None]:
# Draw one box plot per numeric column (filled box, mean marker shown),
# each on its own 8x6 figure.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.boxplot(train_data[col], patch_artist=True, showmeans=True)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Value')
    ax.grid(True)
    plt.show()

In [None]:
# Grid of 100-bin histograms, one panel per numeric column of the training
# frame, drawn on a single 14x8 figure.
train_data.hist(bins = 100, edgecolor = 'lime', figsize=(14,8))

In [None]:
# Prediction target: the Rating column of the training frame.
y = train_data['Rating']

In [None]:
# Remove the Rating column from the training frame in place, leaving only
# the feature columns. Running this cell a second time raises KeyError.
train_data.drop('Rating', axis=1, inplace=True)

In [None]:
# Remove 'Rating' from the list of numeric column names so that only feature
# columns remain in it. Running this cell a second time raises KeyError,
# because the label is no longer present in the index.
num_cols = num_cols.drop('Rating')

In [None]:
# Feature preparation.
#   * columns named in `num_cols` are forwarded unchanged ('passthrough');
#   * columns named in `cat_cols` are one-hot encoded. With
#     handle_unknown='ignore', a category that was not seen while fitting is
#     encoded as all zeros instead of raising an error.
onehot_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
categorical_transformer = Pipeline(steps=[onehot_step])

column_steps = [
    ('num', 'passthrough', num_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# The column transformer is wrapped as the only step of a Pipeline.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Learn the encoding from the training frame and apply it in the same call.
train_data_processed = pipeline.fit_transform(train_data)

In [None]:
# Apply the already-fitted preprocessing pipeline to the test frame
# (transform only - nothing is re-fitted on the test data).
test_data_processed = pipeline.transform(test_data)

In [None]:
# Print the correlation (Series.corr, Pearson by default) between each of the
# two listed columns and the Rating target.
numeric_checks = [
    ('ID', "Correlation between ID and Rating:"),
    ('RecipeCode', "Correlation between RecipeCode and Rating:"),
]
for column_name, label in numeric_checks:
    correlation = train_data[column_name].corr(y)
    print(label, correlation)

# The remaining columns are left disabled below - presumably because they
# hold strings, for which this correlation is not defined (verify).
# correlation = train_data['RecipeName'].corr(y)
# print("Correlation between RecipeName and Rating:", correlation)
# correlation = train_data['UserID'].corr(y)
# print("Correlation between UserID and Rating:", correlation)
# correlation = train_data['Recipe_Review'].corr(y)
# print("Correlation between Recipe_Review and Rating:", correlation)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Feature matrix used for modelling: the preprocessed training data.
X = train_data_processed

In [None]:
# Hold out 20% of the rows as a validation set. The fixed random_state makes
# the split identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random-forest classifier. The seed is fixed (same value as the train/validation
# split) so that repeated runs of the grid search build the same forests and
# report the same scores.
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 5-fold cross-validation, run in parallel on all
# available cores. No `scoring` is given, so the estimator's own score
# (accuracy for a classifier) is used.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

In [None]:
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

# Evaluate the tuned model on the held-out validation split.
# best_estimator_ is the best-scoring configuration, refit by GridSearchCV
# on the full data passed to fit().
best_rf_classifier = grid_search.best_estimator_

# Predict ratings on the validation set
y_pred_val = best_rf_classifier.predict(X_val)

In [None]:
# Fraction of validation rows whose predicted rating equals the true rating.
accuracy = accuracy_score(y_val, y_pred_val)
print("Accuracy on validation set:", accuracy)

In [None]:
# Print the textual representation of the selected random-forest estimator.
print(best_rf_classifier)

In [None]:
# Predict test-set ratings with the tuned random forest.
y_pred_test_rf = best_rf_classifier.predict(test_data_processed)

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression on variance-scaled features.
# with_mean=False disables centering (presumably because the one-hot encoded
# feature matrix is sparse - verify).
# The model pipeline is bound to `lr_pipeline` rather than `pipeline`, so the
# fitted preprocessing pipeline of that name defined earlier in the notebook
# is not overwritten and the preprocessing cells can still be re-run.
lr_pipeline = make_pipeline(StandardScaler(with_mean=False), LogisticRegression(max_iter=1000))

# Define the parameter grid
param_grid = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger)
}

# Define the number of folds for cross-validation
num_folds = [5, 10, 15]

# The model kept for test-set prediction is the one with the highest
# validation accuracy over all fold settings, not simply the one produced by
# the last loop iteration. Ties keep the earlier (fewer-folds) model.
best_logistic_regression = None
best_lr_accuracy = float('-inf')

for folds in num_folds:
    print(f"\nNumber of Folds: {folds}\n")

    # Perform cross-validation with hyperparameter tuning
    grid_search = GridSearchCV(
        lr_pipeline,
        param_grid,
        cv=KFold(n_splits=folds, shuffle=True, random_state=42),
        verbose=2,
        scoring='accuracy',
        n_jobs=-1,  # parallel fits, as in the other grid searches of this notebook
    )

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Best configuration for this fold setting, refit on all of X_train
    candidate_lr = grid_search.best_estimator_

    # Predict ratings on the validation set
    y_pred_val = candidate_lr.predict(X_val)

    # Print the best hyperparameters
    print("Best hyperparameters:", grid_search.best_params_)

    # Print the accuracy score
    accuracy = accuracy_score(y_val, y_pred_val)
    print("Accuracy:", accuracy)

    # Remember the strongest model seen so far
    if accuracy > best_lr_accuracy:
        best_lr_accuracy = accuracy
        best_logistic_regression = candidate_lr

print("\nBest validation accuracy across fold settings:", best_lr_accuracy)

In [None]:
# Predict test-set ratings with the logistic-regression model bound to
# `best_logistic_regression` by the tuning cell.
y_pred_test_lr = best_logistic_regression.predict(test_data_processed)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Define the KNN classifier
knn_classifier = KNeighborsClassifier()

# Define the parameter grid
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
}

# Different values of cross-validation
cv_values = [5, 10, 15]

# The model kept for test-set prediction is the one with the highest
# validation accuracy over all fold settings, not simply the one produced by
# the last loop iteration. Ties keep the earlier (fewer-folds) model.
best_knn_model = None
best_knn_accuracy = float('-inf')

for cv in cv_values:
    print(f"\nNumber of Folds: {cv}\n")

    # Perform grid search with cross-validation
    grid_search_knn = GridSearchCV(knn_classifier, param_grid_knn, cv=cv, scoring='accuracy', verbose=2, n_jobs=-1)

    # Fit the grid search to the data
    grid_search_knn.fit(X_train, y_train)

    # Best configuration for this fold setting, refit on all of X_train
    candidate_knn = grid_search_knn.best_estimator_

    # Print the best hyperparameters
    print("Best hyperparameters for KNN:", grid_search_knn.best_params_)

    # Predict ratings on the validation set
    y_pred_val_knn = candidate_knn.predict(X_val)

    # Calculate accuracy
    accuracy_knn = accuracy_score(y_val, y_pred_val_knn)
    print("Accuracy for KNN:", accuracy_knn)

    # Remember the strongest model seen so far
    if accuracy_knn > best_knn_accuracy:
        best_knn_accuracy = accuracy_knn
        best_knn_model = candidate_knn

print("\nBest validation accuracy for KNN across fold settings:", best_knn_accuracy)

In [None]:
# Predict test-set ratings with the KNN model bound to `best_knn_model` by
# the tuning cell.
y_pred_test_knn = best_knn_model.predict(test_data_processed)

In [None]:
from sklearn.svm import SVC

# Define the SVM classifier
svm_classifier = SVC()

# Define the parameter grid
param_grid_svm = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto'],  # Kernel coefficient
}

# Different values of cross-validation
cv_values = [5, 10, 15]

# The model kept for test-set prediction is the one with the highest
# validation accuracy over all fold settings, not simply the one produced by
# the last loop iteration. Ties keep the earlier (fewer-folds) model.
best_svm_model = None
best_svm_accuracy = float('-inf')

for cv in cv_values:
    print(f"\nNumber of Folds: {cv}\n")

    # Perform grid search with cross-validation
    grid_search_svm = GridSearchCV(svm_classifier, param_grid_svm, cv=cv, scoring='accuracy', verbose=2, n_jobs=-1)

    # Fit the grid search to the data
    grid_search_svm.fit(X_train, y_train)

    # Best configuration for this fold setting, refit on all of X_train
    candidate_svm = grid_search_svm.best_estimator_

    # Print the best hyperparameters
    print("Best hyperparameters for SVM:", grid_search_svm.best_params_)

    # Predict ratings on the validation set
    y_pred_val_svm = candidate_svm.predict(X_val)

    # Calculate accuracy
    accuracy_svm = accuracy_score(y_val, y_pred_val_svm)
    print("Accuracy for SVM:", accuracy_svm)

    # Remember the strongest model seen so far
    if accuracy_svm > best_svm_accuracy:
        best_svm_accuracy = accuracy_svm
        best_svm_model = candidate_svm

print("\nBest validation accuracy for SVM across fold settings:", best_svm_accuracy)

In [None]:
# Predict test-set ratings with the SVM model bound to `best_svm_model` by
# the tuning cell.
y_pred_test_svm = best_svm_model.predict(test_data_processed)

In [None]:
# Cross-validation fold counts iterated over by the grid-search loops in the
# cells that follow (each loop runs one full grid search per fold count).
num_folds = [4, 8, 12, 15]

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Define the Decision Tree classifier. The seed is fixed (same value as the
# train/validation split) so repeated runs give the same trees and scores.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define the parameter grid
param_grid_dt = {
    'max_depth': [None, 5, 10, 15, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the number of folds for cross-validation
num_folds = [4, 8, 12, 15]

# The model kept for test-set prediction is the one with the highest
# validation accuracy over all fold settings, not simply the one produced by
# the last loop iteration. Ties keep the earlier (fewer-folds) model.
best_dt_classifier = None
best_dt_accuracy = float('-inf')

for folds in num_folds:
    print(f"\nNumber of Folds: {folds}\n")

    # Perform cross-validation with hyperparameter tuning
    # (n_jobs=-1: parallel fits, as in the other grid searches of this notebook)
    grid_search_dt = GridSearchCV(dt_classifier, param_grid_dt, cv=folds, verbose=2, scoring='accuracy', n_jobs=-1)

    # Fit the model
    grid_search_dt.fit(X_train, y_train)

    # Best configuration for this fold setting, refit on all of X_train
    candidate_dt = grid_search_dt.best_estimator_

    # Predict ratings on the validation set
    y_pred_val_dt = candidate_dt.predict(X_val)

    # Print the best hyperparameters
    print("Best hyperparameters:", grid_search_dt.best_params_)

    # Print the accuracy score
    accuracy_dt = accuracy_score(y_val, y_pred_val_dt)
    print("Accuracy:", accuracy_dt)

    # Remember the strongest model seen so far
    if accuracy_dt > best_dt_accuracy:
        best_dt_accuracy = accuracy_dt
        best_dt_classifier = candidate_dt

print("\nBest validation accuracy across fold settings:", best_dt_accuracy)

In [None]:
# Predict test-set ratings with the decision tree bound to
# `best_dt_classifier` by the tuning cell.
y_pred_test_dt = best_dt_classifier.predict(test_data_processed)

In [None]:
from sklearn.ensemble import BaggingClassifier

# Define the base classifier for Bagging
base_classifier = DecisionTreeClassifier()

# Define the Bagging classifier.
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional parameter works on both old and
# new versions. The seed is fixed so repeated runs give the same ensemble.
bagging_classifier = BaggingClassifier(base_classifier, random_state=42)

# Define the parameter grid
param_grid_bagging = {
    'n_estimators': [10, 20, 30],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0]
}

# The model kept for test-set prediction is the one with the highest
# validation accuracy over all fold settings, not simply the one produced by
# the last loop iteration. Ties keep the earlier (fewer-folds) model.
best_bagging_classifier = None
best_bagging_accuracy = float('-inf')

# `num_folds` is the list of fold counts defined in an earlier cell.
for folds in num_folds:
    print(f"\nNumber of Folds: {folds}\n")

    # Perform cross-validation with hyperparameter tuning
    # (n_jobs=-1: parallel fits, as in the other grid searches of this notebook)
    grid_search_bagging = GridSearchCV(bagging_classifier, param_grid_bagging, cv=folds, verbose=2, scoring='accuracy', n_jobs=-1)

    # Fit the model
    grid_search_bagging.fit(X_train, y_train)

    # Best configuration for this fold setting, refit on all of X_train
    candidate_bagging = grid_search_bagging.best_estimator_

    # Predict ratings on the validation set
    y_pred_val_bagging = candidate_bagging.predict(X_val)

    # Print the best hyperparameters
    print("Best hyperparameters:", grid_search_bagging.best_params_)

    # Print the accuracy score
    accuracy_bagging = accuracy_score(y_val, y_pred_val_bagging)
    print("Accuracy:", accuracy_bagging)

    # Remember the strongest model seen so far
    if accuracy_bagging > best_bagging_accuracy:
        best_bagging_accuracy = accuracy_bagging
        best_bagging_classifier = candidate_bagging

print("\nBest validation accuracy across fold settings:", best_bagging_accuracy)

In [None]:
# Predict test-set ratings with the bagging ensemble bound to
# `best_bagging_classifier` by the tuning cell.
y_pred_test_bagging = best_bagging_classifier.predict(test_data_processed)

In [None]:
from sklearn.neural_network import MLPClassifier

# Define the MLP classifier. The seed is fixed (same value as the
# train/validation split) so weight initialisation, and therefore the
# reported scores, are the same on every run.
mlp_classifier = MLPClassifier(random_state=42)

# Define the parameter grid: 4 x 3 x 2 x 3 x 2 = 144 candidate settings.
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

# The model kept for test-set prediction is the one with the highest
# validation accuracy over all fold settings, not simply the one produced by
# the last loop iteration. Ties keep the earlier (fewer-folds) model.
best_mlp_classifier = None
best_mlp_accuracy = float('-inf')

# `num_folds` is the list of fold counts defined in an earlier cell.
for folds in num_folds:
    print(f"\nNumber of Folds: {folds}\n")

    # Perform cross-validation with hyperparameter tuning.
    # n_jobs=-1 runs the fits in parallel, as in the other grid searches of
    # this notebook; with 144 candidates per fold this search is expensive.
    grid_search_mlp = GridSearchCV(mlp_classifier, param_grid_mlp, cv=folds, verbose=2, scoring='accuracy', n_jobs=-1)

    # Fit the model
    grid_search_mlp.fit(X_train, y_train)

    # Best configuration for this fold setting, refit on all of X_train
    candidate_mlp = grid_search_mlp.best_estimator_

    # Predict ratings on the validation set
    y_pred_val_mlp = candidate_mlp.predict(X_val)

    # Print the best hyperparameters
    print("Best hyperparameters:", grid_search_mlp.best_params_)

    # Print the accuracy score
    accuracy_mlp = accuracy_score(y_val, y_pred_val_mlp)
    print("Accuracy:", accuracy_mlp)

    # Remember the strongest model seen so far
    if accuracy_mlp > best_mlp_accuracy:
        best_mlp_accuracy = accuracy_mlp
        best_mlp_classifier = candidate_mlp

print("\nBest validation accuracy across fold settings:", best_mlp_accuracy)

In [None]:
# Predict test-set ratings with the MLP bound to `best_mlp_classifier` by
# the tuning cell.
y_pred_test_mlp = best_mlp_classifier.predict(test_data_processed)

In [None]:
# Models and their accuracy scores
# RandomForestClassifier Model - 0.77298
# LogisticRegression Model - 0.76748
# KNN Model - 0.76088
# SVM Model - 0.76638
# DecisionTree Model - 0.76440
# RandomForestClassifier Model (without RecipeName column) - 0.76880

In [None]:
# Assemble the submission table from the random-forest test predictions.
# NOTE(review): IDs are generated as 1..n in prediction order rather than
# taken from the test file - confirm this matches the expected format.
row_ids = list(range(1, len(y_pred_test_rf) + 1))
submission = pd.DataFrame({"ID": row_ids, "Rating": y_pred_test_rf})

# Save the submission file
submission.to_csv('submission.csv', index=False)