In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [21]:
# Load the review dataset from a CSV file located in the current working directory.
df = pd.read_csv('sdataset.csv')


In [22]:
# Show the loaded frame as the cell output.
# NOTE(review): the 'Unnamed: 0.1' and 'Unnamed: 0' columns in the output look like
# row indexes written out by earlier to_csv calls — confirm and drop them if unused.
df

Unnamed: 0.1,Unnamed: 0,category,rating,label,text_
0,0,Home_and_Kitchen_5,5,NOT SPAM,love well made sturdi comfort i love veri pretti
1,1,Home_and_Kitchen_5,5,NOT SPAM,love great upgrad origin i 've mine coupl year
2,2,Home_and_Kitchen_5,5,NOT SPAM,thi pillow save back i love look feel pillow
3,3,Home_and_Kitchen_5,1,NOT SPAM,miss inform use great product price i
4,4,Home_and_Kitchen_5,5,NOT SPAM,veri nice set good qualiti we set two month
...,...,...,...,...,...
40427,40427,Clothing_Shoes_and_Jewelry_5,4,SPAM,i read review say bra ran small i order two ba...
40428,40428,Clothing_Shoes_and_Jewelry_5,5,NOT SPAM,i n't sure exactli would it littl larg small s...
40429,40429,Clothing_Shoes_and_Jewelry_5,2,SPAM,you wear hood wear hood wear jacket without ho...
40430,40430,Clothing_Shoes_and_Jewelry_5,1,NOT SPAM,i like noth dress the reason i gave star i ord...


In [23]:
# Discard every row that is missing the review text or its label (a row is
# removed if either of the two columns is null); all other columns are ignored.
df = df.dropna(axis='index', how='any', subset=['text_', 'label'])

In [24]:
# Features are the review text; the target is the label column.
X, y = df['text_'], df['label']

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [26]:
# TF-IDF features limited to 5000 terms, with English stop words removed.
# NOTE(review): the text in `text_` appears to be stemmed already (e.g. 'thi',
# 'veri'), so the built-in stop-word list may not match the stemmed forms — confirm.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
# Fit on the training text only, then reuse the fitted vectorizer for the
# test text so both matrices share the same columns.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [27]:
# Split into 80% train / 20% test with a fixed seed.
# NOTE(review): this repeats the split already performed in an earlier cell with
# identical arguments; with the same random_state it should reproduce the same
# partition, so this cell looks redundant — confirm and remove one of the two.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# Create a TF-IDF vectorizer limited to 5000 terms with English stop words removed.
# NOTE(review): an earlier cell already builds and fits a vectorizer with the same
# settings; rebinding the name here replaces that fitted instance with an unfitted
# one, which is then fitted again in the next cell — looks redundant, confirm.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

In [29]:
# Fit the vectorizer on the training text and produce the training feature matrix.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [30]:
# Transform (not fit) the test text so it uses the vectorizer fitted on the
# training text and the test matrix has the same columns.
X_test_tfidf = vectorizer.transform(X_test)


In [31]:
# Random forest with 100 trees; random_state is fixed so repeated fits on the
# same data are reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [32]:
# Train the forest on the TF-IDF training matrix and the matching labels.
rf_model.fit(X_train_tfidf, y_train)

RandomForestClassifier(random_state=42)

In [33]:
# Predict a label for every row of the held-out test matrix.
y_pred = rf_model.predict(X_test_tfidf)

In [34]:
# Score the held-out predictions: overall accuracy plus a per-class
# precision / recall / F1 text report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)


In [35]:
# Print the overall accuracy, then the per-class report.
print(f"Accuracy: {accuracy}")
# The report is passed as a second argument, so print() inserts its default
# single-space separator between the header's newline and the report text.
print("Classification Report:\n", report)

Accuracy: 0.8350438976134537
Classification Report:
               precision    recall  f1-score   support

    NOT SPAM       0.81      0.87      0.84      4018
        SPAM       0.86      0.80      0.83      4069

    accuracy                           0.84      8087
   macro avg       0.84      0.84      0.83      8087
weighted avg       0.84      0.84      0.83      8087



In [36]:
# Spot-check the trained model on two sample reviews (these appear to be in the
# same stemmed form as the `text_` column, e.g. 'coupl', 'easi').
new_data = [
    "order place special photo wall we 've coupl week",
    "easi review the kid dd love thank",
]
# Reuse the already-fitted vectorizer (transform only) so the columns match training.
new_data_tfidf = vectorizer.transform(new_data)
new_predictions = rf_model.predict(new_data_tfidf)
# Emit one three-line record per review: text, predicted label, separator.
for review, prediction in zip(new_data, new_predictions):
    print(
        f"Review: {review}",
        f"Predicted Label: {prediction}",
        "------------------------",
        sep="\n",
    )

Review: order place special photo wall we 've coupl week
Predicted Label: NOT SPAM
------------------------
Review: easi review the kid dd love thank
Predicted Label: SPAM
------------------------


In [37]:
# Three more reviews to classify, given as a plain list of strings.
# NOTE(review): the first and last strings are raw English, whereas the training
# text in `text_` appears to be stemmed (e.g. 'sturdi', 'veri'); words that were
# not preprocessed the same way may not match the fitted vocabulary — confirm.
new_data = [
    "This is a very good.",
    "babi like thermal insul help keep food warmer the thermal insul help keep.",
    "Great experience!",
]

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit).
new_data_tfidf = vectorizer.transform(new_data)

# Ask the trained forest for one label per review.
new_predictions = rf_model.predict(new_data_tfidf)

# Emit one three-line record per review: text, predicted label, separator.
for review, prediction in zip(new_data, new_predictions):
    print(
        f"Review: {review}",
        f"Predicted Label: {prediction}",
        "------------------------",
        sep="\n",
    )


Review: This is a very good.
Predicted Label: SPAM
------------------------
Review: babi like thermal insul help keep food warmer the thermal insul help keep.
Predicted Label: NOT SPAM
------------------------
Review: Great experience!
Predicted Label: SPAM
------------------------


In [50]:
# Short free-form phrases used to probe the classifier.
# NOTE(review): these are raw English strings, whereas the training text in `text_`
# appears to be stemmed (e.g. 'sturdi', 'veri'); words that were not preprocessed
# the same way may not match the fitted vocabulary — confirm before trusting the labels.
new_data = [
    "product is very bad|",
    "very costly",
    "love product",
    "bad product",
    "worst experience",
    "amazing product",
    "very useful love it",
    "did'nt like product",
    "not working",
]
# Reuse the already-fitted vectorizer (transform only) so the columns match training.
new_data_tfidf = vectorizer.transform(new_data)
new_predictions = rf_model.predict(new_data_tfidf)
# Emit one three-line record per review: text, predicted label, separator.
for review, prediction in zip(new_data, new_predictions):
    print(
        f"Review: {review}",
        f"Predicted Label: {prediction}",
        "------------------------",
        sep="\n",
    )

Review: product is very bad|
Predicted Label: SPAM
------------------------
Review: very costly
Predicted Label: SPAM
------------------------
Review: love product
Predicted Label: NOT SPAM
------------------------
Review: bad product
Predicted Label: SPAM
------------------------
Review: worst experience
Predicted Label: SPAM
------------------------
Review: amazing product
Predicted Label: NOT SPAM
------------------------
Review: very useful love it
Predicted Label: NOT SPAM
------------------------
Review: did'nt like product
Predicted Label: SPAM
------------------------
Review: not working
Predicted Label: SPAM
------------------------
