In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/to-read-or-not-to-read/sample_submission.csv
/kaggle/input/to-read-or-not-to-read/train.csv
/kaggle/input/to-read-or-not-to-read/test.csv


In [2]:
# Load the competition training set into a DataFrame. Later cells read its
# 'review_text', 'user_id', 'rating', 'n_votes' and 'n_comments' columns.
data = pd.read_csv('/kaggle/input/to-read-or-not-to-read/train.csv') 

In [3]:
#Importing required library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re


In [4]:
# Fetch the NLTK resources used below: the stopword lists (read via
# stopwords.words) and the 'punkt' tokenizer models (for word_tokenize).
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of 'punkt'
# for word_tokenize — confirm against the installed NLTK version.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# English stopwords, held in a set so the per-token membership test in
# clean_text is a fast lookup.
stop_words = set(stopwords.words('english'))
# Porter stemmer instance; clean_text calls ps.stem on every token it keeps.
ps = PorterStemmer()

In [6]:
#Removing URL, HTML Tags, Non-Alphabetic & Stopwords.
def clean_text(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    The text is stripped of URLs and HTML tags, every non-letter character is
    replaced by a space, and the remainder is tokenised; English stopwords are
    dropped and each remaining token is stemmed.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example None, or the float
        NaN that stands in for a missing review) is treated as an empty review.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces, or '' when
        there is nothing left to tokenise.
    """
    # re.sub raises TypeError on non-strings, so a single missing review would
    # otherwise abort the whole .apply() call.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^A-Za-z]', ' ', text)  # Remove non-alphabetic characters
    # Only whitespace left: the result is '' either way, so skip tokenisation.
    if not text.strip():
        return ''
    # Remove stopwords (case-insensitively) and stem what remains.
    return ' '.join(
        ps.stem(word)
        for word in word_tokenize(text)
        if word.lower() not in stop_words
    )

In [None]:
# Run clean_text over every value of 'review_text' and store the result in a
# new 'cleaned_review' column on the training frame.
data['cleaned_review'] = data['review_text'].apply(clean_text)

In [None]:
# Preview the first rows, now including the 'cleaned_review' column.
data.head()

In [None]:
# Keep only the user id, the cleaned text, the target and the two count columns.
kept_columns = ['user_id', 'cleaned_review', 'rating', 'n_votes', 'n_comments']
data = data[kept_columns]


In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
feature_columns = ['user_id', 'cleaned_review', 'n_votes', 'n_comments']
train_data, test_data, train_labels, test_labels = train_test_split(
    data[feature_columns],
    data['rating'],
    test_size=0.1,
    random_state=20,
)

In [None]:
# Column names handed to the ColumnTransformer, one per transformer.
text_features, user_id_feature = 'cleaned_review', 'user_id'
votes_feature, comments_feature = 'n_votes', 'n_comments'

In [None]:
 #ColumnTransformer is a powerful tool for applying different transformations to different columns in a dataset
#-----------------------------------------------------------------------------------------------------------------------------------

#TfidfVectorizer. This is a text vectorizer that converts a collection of raw documents to a matrix of TF-IDF features
#------------------------------------------------------------------------------------------------------------------------------------

# The most popular approach is using the Term Frequency-Inverse Document Frequency (TF-IDF) technique.

# Term Frequency (TF) = (Number of times term t appears in a document)/(Number of terms in the document)

# Inverse Document Frequency (IDF) = log(N/n), where N is the number of documents and n is the number of documents
# in which term t appears. The IDF of a rare word is high, whereas the IDF of a frequent word is likely to be low,
# which has the effect of highlighting words that are distinctive.

# We calculate TF-IDF value of a term as = TF * IDF
#------------------------------------------------------------------------------------------------------------------------------------------------------------

# One-hot encoding is a process used to convert categorical data, represented as integer labels, 
# into a binary matrix where each category is represented by a binary column.

In [None]:
# Build the per-column preprocessing: TF-IDF for the review text, one-hot for
# the user id, and the two count columns passed through unchanged.
from sklearn.preprocessing import OneHotEncoder

# Unigrams and bigrams of the cleaned review, capped at 25,000 terms.
review_tfidf = TfidfVectorizer(max_features=25000, ngram_range=(1, 2))
# handle_unknown='ignore' so user ids not seen during fit do not raise at predict time.
user_onehot = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    # The text column is given as a bare name (not a list) for the vectorizer.
    ('text', review_tfidf, text_features),
    ('user_id', user_onehot, [user_id_feature]),
    ('votes', 'passthrough', [votes_feature]),
    ('comments', 'passthrough', [comments_feature]),
])


In [None]:
# a pipeline is a way to streamline a lot of routine processes, which can be particularly useful for machine learning workflows. 
# A pipeline bundles together a sequence of data processing steps and a model into a single object. 
# This ensures that the entire workflow, including data preprocessing and model training, can be treated as a single unit.

# Chain the column preprocessing and the classifier so that fit/predict run
# both steps as a single estimator.
# multi_class='multinomial' is intentionally not passed: scikit-learn deprecated
# that argument in 1.5, and with the default solver a target with three or more
# classes is already fitted as a multinomial model.
# NOTE(review): this assumes 'rating' has more than two classes — confirm.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # High max_iter gives the solver room to converge; seed fixed for reproducibility.
    ('classifier', LogisticRegression(max_iter=7000, random_state=42)),
])

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(train_data, train_labels)

In [None]:
# Predict ratings for the held-out split (10% of the rows: the split uses test_size=0.1).
predictions = pipeline.predict(test_data)

In [None]:
# Score the held-out predictions: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
classification_rep = classification_report(y_true=test_labels, y_pred=predictions)

In [None]:
# Print the accuracy (two decimals) followed by the per-class report, one item per line.
print(
    f'Accuracy: {accuracy:.2f}',
    'Classification Report:',
    classification_rep,
    sep='\n',
)

In [None]:
# Load the competition test set, reading only the columns the model consumes.
# ('review_text' was previously listed twice in usecols; each column is named once.)
given_test_data = pd.read_csv(
    '/kaggle/input/to-read-or-not-to-read/test.csv',
    usecols=['user_id', 'review_text', 'n_votes', 'n_comments'],
)

In [None]:
# Apply the same clean_text preprocessing used on the training reviews and
# store the result in a new 'cleaned_review' column.
given_test_data['cleaned_review'] = given_test_data['review_text'].apply(clean_text)

In [None]:
# Column names for the competition test frame (same columns as the training features).
test_text_features, test_user_id_feature = 'cleaned_review', 'user_id'
test_votes_feature, test_comments_feature = 'n_votes', 'n_comments'

In [None]:
# Display the test frame to inspect the raw and cleaned review text together.
given_test_data

In [None]:
# Keep only the columns the pipeline was trained on; this drops the raw 'review_text'.
given_test_data = given_test_data[['user_id','cleaned_review','n_votes','n_comments']]

In [None]:
# Predict a rating for every row of the competition test set with the fitted pipeline.
test_predictions=pipeline.predict(given_test_data)

In [None]:
# Load the sample submission to reuse its layout for the final file.
submission = pd.read_csv('/kaggle/input/to-read-or-not-to-read/sample_submission.csv')

In [None]:
# Overwrite the 'rating' column with the model's predictions.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv — confirm before submitting.
submission['rating']=test_predictions

In [None]:
# Write the submission to the working directory; index=False keeps the
# DataFrame index out of the file.
submission.to_csv('submission.csv',index=False)