In [27]:
import pandas as pd

# Load the dataset. With on_bad_lines='warn', rows whose field count does not
# match the header are skipped and reported instead of aborting the load.
file_path = '/content/amazon.csv'
data = pd.read_csv(file_path, on_bad_lines='warn')

# Display the first few rows and the structure of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print(data.head())
data.info()

# Remove rows with any null values
data = data.dropna()

# Alternatively, to remove columns with any null values, uncomment the following line:
# data = data.dropna(axis=1)

# Check the modified dataset
print(data.head())
data.info()


   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%  

Skipping line 341: expected 16 fields, saw 23
Skipping line 651: expected 16 fields, saw 31
Skipping line 933: expected 16 fields, saw 25
Skipping line 1266: expected 16 fields, saw 28
Skipping line 1576: expected 16 fields, saw 31



In [28]:
import pandas as pd

def preprocess_text(text):
    """
    Normalize a single text entry.

    String input is returned lowercased. Any non-string input (for example
    None or a float NaN) is mapped to an empty string.

    This is a placeholder: replace the lowercasing with the real
    preprocessing steps when they are defined.
    """
    # Guard on the type so that missing values never reach .lower()
    return text.lower() if isinstance(text, str) else ""

# Load the dataset. With on_bad_lines='warn', rows whose field count does not
# match the header are skipped and reported instead of aborting the load.
file_path = '/content/amazon.csv'
data = pd.read_csv(file_path, on_bad_lines='warn')

# Handling null values in 'review_content' before processing
# Option 1: Remove rows where 'review_content' is null
data = data.dropna(subset=['review_content'])

# Option 2: Fill nulls with a placeholder text such as "no review"
#data['review_content'] = data['review_content'].fillna('no review')

# Data preprocessing: store the normalized review text in a new column so the
# original 'review_content' stays available unchanged.
data['processed_review'] = data['review_content'].apply(preprocess_text)

# Display the first few rows and the structure of the dataset to verify changes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print(data.head())
data.info()


   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%  

Skipping line 341: expected 16 fields, saw 23
Skipping line 651: expected 16 fields, saw 31
Skipping line 933: expected 16 fields, saw 25
Skipping line 1266: expected 16 fields, saw 28
Skipping line 1576: expected 16 fields, saw 31



In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary capped at 1000 terms on the preprocessed reviews,
# then densify the resulting sparse matrix into a NumPy array.
tfidf = TfidfVectorizer(max_features=1000)
review_tfidf = tfidf.fit_transform(data['processed_review'])
features = review_tfidf.toarray()


In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit an ordinary least-squares regressor on the training split and score it
# on the held-out split with mean squared error.
# NOTE(review): X_train, y_train, X_test and y_test are not defined in any of
# the cells shown here; confirm where the split is created so this cell runs
# on a fresh kernel.
model = LinearRegression().fit(X_train, y_train)

predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error:", mse)



Mean Squared Error: 0.23241438292119138


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity # Import cosine_similarity

# Fit a TF-IDF vocabulary capped at 1000 terms on the preprocessed reviews,
# then densify the resulting sparse matrix into a NumPy array.
# NOTE(review): this repeats the fit performed in the earlier TF-IDF cell;
# consider reusing that cell's `tfidf` and `features` instead of re-fitting.
tfidf = TfidfVectorizer(max_features=1000)
review_tfidf = tfidf.fit_transform(data['processed_review'])
features = review_tfidf.toarray()

def find_similar_reviews(query, top_n=5):
    """
    Return the reviews whose TF-IDF vectors are most similar to a query.

    The query is normalized with ``preprocess_text``, transformed with the
    already-fitted module-level ``tfidf`` vectorizer, and compared against
    every row of ``features`` using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query to match against the reviews.
    top_n : int, default 5
        Maximum number of reviews to return.

    Returns
    -------
    pandas.DataFrame
        The ``product_name``, ``review_content`` and ``rating`` columns of
        the best-matching rows of ``data``, most similar first.
    """
    # Imported locally because numpy is not imported in the cells shown, so a
    # bare `np` reference raises NameError on a fresh kernel.
    import numpy as np

    query_vec = tfidf.transform([preprocess_text(query)])
    # Result has a single row (one query) with one similarity per review
    sims = cosine_similarity(query_vec, features)
    # argsort is ascending, so reverse it to rank the highest similarity first
    indices = np.argsort(sims[0])[::-1][:top_n]
    # `features` was built row-for-row from `data`, so the positions returned
    # by argsort are valid positional (iloc) indices into `data`.
    return data.iloc[indices][['product_name', 'review_content', 'rating']]

# Example usage: look up the reviews closest to a free-text question and
# print the matching product names, review texts and ratings.
query = "what are the products that have highest ratings?"
similar_reviews = find_similar_reviews(query=query)
print(similar_reviews)

                                           product_name  \
1965  Ikea 903.391.72 Polypropylene Plastic Solid Be...   
1123  Ikea 903.391.72 Polypropylene Plastic Solid Be...   
164   Belkin USB C to USB-C Fast Charging Type C Cab...   
223   Portronics Konnect L 60W PD Type C to Type C M...   
49    Amazonbasics Micro Usb Fast Charging Cable For...   

                                         review_content rating  
1965  It is Okay so far.,This product was amazing an...    4.6  
1123  It is Okay so far.,This product was amazing an...    4.6  
164   The cable is good slightly thicker than the Ap...    4.5  
223   Good,Using it for Samsung Z Fold 4 with an Amb...    4.1  
49    ABOUT  AMAZONBASICS:xxxxxxxxxxxxxxxxxxxxxxxxxx...    4.2  
