In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [6]:
# Ensure that every NLTK resource this notebook relies on is downloaded.
# 'stopwords' backs the stop-word list loaded below; 'wordnet' is the corpus
# WordNetLemmatizer reads from, and some NLTK releases additionally require
# 'omw-1.4' next to it. Without the WordNet data, lemmatization raises a
# LookupError on a machine where the corpus was never installed.
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Load stopwords from NLTK
stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91970\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Lemmatizer function
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Args:
        text: String to process; it is split with ``str.split()``, so any
            run of whitespace acts as a single separator.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [4]:
# Define the file paths: all three CSV files live in the same download folder
_competition_dir = r'C:\Users\91970\Downloads\final-competition-fall2023 (1)'
training_path = _competition_dir + '\\' + 'foods_training.csv'
testing_path = _competition_dir + '\\' + 'foods_testing.csv'
sample_submission_path = _competition_dir + '\\' + 'sample_submission.csv'


In [7]:
# Read the training and testing CSV files into DataFrames.
try:
    # Load the training data; nrows caps the read at the first 10,000 rows
    df = pd.read_csv(training_path, sep=',', encoding='ISO-8859-1', quotechar='"', nrows=10000)
    print("Training data loaded successfully.")

    # Load the testing data (all rows)
    testing_data = pd.read_csv(testing_path, sep=',', encoding='ISO-8859-1', quotechar='"')
    print("Testing data loaded successfully.")
except Exception as e:
    print("An error occurred:", e)
    # Re-raise instead of swallowing: the following cells need both frames,
    # so continuing after a failed load would only surface later as a
    # NameError (or silently reuse stale frames left in the kernel).
    raise


Training data loaded successfully.
Testing data loaded successfully.


In [9]:
# Keep just the columns used downstream; .copy() gives each frame its own data
# so later column assignments do not act on a view of the loaded frame.
training_columns = ['helpfulness', 'summary', 'text', 'score_level']
testing_columns = ['ID', 'helpfulness', 'summary', 'text']

data = df.loc[:, training_columns].copy()
testing_data = testing_data.loc[:, testing_columns].copy()
print("Data copied for preprocessing.")

Data copied for preprocessing.


In [11]:
# Preprocess and combine 'summary' and 'text' columns
def _lemmatize_and_combine(frame):
    """Lemmatize ``summary`` and ``text`` in place and add ``full_text``.

    Missing values are replaced by empty strings before lemmatizing.
    ``full_text`` is the lemmatized summary, a single space, then the
    lemmatized text. The frame is modified in place; nothing is returned.
    """
    for column in ('summary', 'text'):
        frame[column] = frame[column].fillna('').apply(lemmatize_text)
    frame['full_text'] = frame['summary'] + ' ' + frame['text']


# Training frame first, then the testing frame, with identical steps
_lemmatize_and_combine(data)
_lemmatize_and_combine(testing_data)
print("Preprocessing complete.")


Preprocessing complete.


In [12]:
# TF-IDF feature extractor; the NLTK English stop-word list is passed in
# explicitly as the vectorizer's stop_words setting.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
)

In [14]:
# Two-stage pipeline: TF-IDF features feed a random forest classifier.
# random_state is fixed so repeated fits build the same forest.
pipeline_steps = [
    ('tfidf', tfidf_vectorizer),
    ('clf', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)
print(pipeline)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('clf', RandomForestClassifier(random_state=42))])


In [16]:
# Hyper-parameter grid for the search. The 'clf__' prefix routes each entry
# to the pipeline step named 'clf'; 2 * 3 * 2 * 2 = 24 combinations in total.
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_leaf=[1, 2],
    clf__min_samples_split=[2, 5],
)
print(param_grid)

{'clf__n_estimators': [100, 200], 'clf__max_depth': [None, 10, 20], 'clf__min_samples_leaf': [1, 2], 'clf__min_samples_split': [2, 5]}


In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation.
# n_jobs=-1 runs the fits in parallel on all available cores; verbose=2
# makes the search log its progress.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X=data['full_text'], y=data['score_level'])
print("Grid search complete.")

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:
# Keep the winning pipeline from the search and report its settings
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


In [None]:
# Evaluate on training set.
# These are in-sample metrics: the predictions are made for the same rows
# that were passed to the grid search fit.
train_labels = data['score_level']
y_pred = best_model.predict(data['full_text'])

train_accuracy = accuracy_score(train_labels, y_pred)
print("Training Accuracy: {:.2f}".format(train_accuracy))
print(classification_report(train_labels, y_pred))

In [None]:
 # Prepare testing data and predictions
# Predict the testing rows and write one (ID, predicted_score_level) row per
# testing record. The original cell ended with an `except` clause that had no
# matching `try`, which is a SyntaxError; the `try` is restored here.
try:
    y_test_pred = best_model.predict(testing_data['full_text'])
    predictions_df = pd.DataFrame({'ID': testing_data['ID'], 'predicted_score_level': y_test_pred})
    # NOTE(review): this writes to sample_submission_path, replacing whatever
    # file is already there - confirm that overwriting it is intended.
    predictions_df.to_csv(sample_submission_path, index=False)
    print("Predictions saved to file.")
except Exception as e:
    print("An error occurred:", e)
    # Re-raise so a failed prediction/export is not mistaken for success
    raise
 