In [4]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
import sklearn.feature_extraction.text

import re

In [5]:
# Goal: train a classifier on previously labelled reviews and use it to predict whether a review is positive or negative.

In [6]:
# Read the labelled reviews: a tab-separated text file with no header row,
# so the columns come back with integer labels.
data = pd.read_csv(
    'amazon_cells_labelled.txt',
    delimiter='\t',
    header=None,
)
data.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [7]:
# Give the two integer-labelled columns descriptive names.
data = data.set_axis(['Review', 'Sentiment'], axis=1)
data.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [8]:
# Show each column's dtype and non-null count to confirm the file loaded as expected.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     1000 non-null   object
 1   Sentiment  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [9]:
# Summary statistics for the numeric columns (only 'Sentiment' here); a mean of
# 0.5 on a 0/1 label indicates the two classes are evenly represented.
data.describe()

Unnamed: 0,Sentiment
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [10]:
# Count missing values per column before any text cleaning is applied.
print(data.isna().sum())

# Clean the 'Review' column
def clean_text(text):
    """Normalize a single review before it is vectorized.

    Steps, in order: delete punctuation, delete digits, lowercase, then
    collapse every run of whitespace to one space and trim both ends.
    Whitespace is normalized here so that every caller gets the same result
    without having to repeat those steps.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        that pandas produces for a missing field) is treated as empty text.

    Returns
    -------
    str
        The cleaned review, or ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = text.lower()                  # Convert to lowercase
    # Deleting punctuation and digits can leave doubled or dangling spaces
    # ("for 10 minutes" -> "for  minutes"), so whitespace is normalized last.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every review, then trim the ends and squeeze repeated whitespace
# down to single spaces.
data['Review'] = (
    data['Review']
    .map(clean_text)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

data.head()

Review       0
Sentiment    0
dtype: int64


Unnamed: 0,Review,Sentiment
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['Review'],
    data['Sentiment'],
    test_size=0.25,
    random_state=45,
)

In [12]:
from sklearn.linear_model import LogisticRegression

# The target is binary (1 or 0 for positive/negative sentiment), so logistic
# regression is used. Naive Bayes is a possible later alternative, since the
# input is text.
model = LogisticRegression()

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-count encoder with default settings.
vectorizer = CountVectorizer()

# Fit the vocabulary on the training reviews only, then encode both splits
# with it (left-to-right evaluation keeps the fit ahead of the test transform).
# NOTE: X_train / X_test are overwritten with their encoded form, so re-run
# the split cell before re-running this one.
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [17]:
# Train the classifier on the encoded training reviews and their labels.
model.fit(X=X_train, y=y_train)

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out reviews.
y_pred = model.predict(X_test)

# Print accuracy, the confusion matrix and the per-class report, in that order.
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(y_test, y_pred))

0.772
[[102  30]
 [ 27  91]]
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       132
           1       0.75      0.77      0.76       118

    accuracy                           0.77       250
   macro avg       0.77      0.77      0.77       250
weighted avg       0.77      0.77      0.77       250



----
Classify new, unlabelled reviews with the model trained above

In [23]:
# Read the unlabelled reviews: tab-separated text with no header row.
new_data = pd.read_csv(
    'moreAmazonDatawoLabel.txt',
    delimiter='\t',
    header=None,
)
new_data.head()

Unnamed: 0,0
0,"The build quality feels premium, but it doesn’..."
1,It fits perfectly in my bag and doesn’t take u...
2,"After a month of use, it started acting up."
3,The colors on the screen are vivid and clear.
4,I wasn’t impressed with how the buttons are la...


In [24]:
# Prepare the new reviews with exactly the same cleaning used for the training
# reviews, so the fitted vocabulary sees text tokenized the same way.
new_data.columns = ['Review']

print(new_data.head())

# Same pipeline as the training data: clean_text (punctuation, digits,
# lowercase), then trim the ends and collapse repeated whitespace.
new_data['Review'] = (
    new_data['Review']
    .apply(clean_text)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# Encode with the already-fitted vectorizer (transform only; never re-fit on new data).
X_new = vectorizer.transform(new_data['Review'])


                                              Review
0  The build quality feels premium, but it doesn’...
1  It fits perfectly in my bag and doesn’t take u...
2        After a month of use, it started acting up.
3      The colors on the screen are vivid and clear.
4  I wasn’t impressed with how the buttons are la...


In [25]:
# Label every new review with the trained model.
predictions = model.predict(X_new)

# Store the labels under 'Sentiment', the same column name the training data
# uses (the earlier spelling 'Senntiment' was a typo). This is also the header
# that ends up in any file written from this frame.
new_data['Sentiment'] = predictions
new_data.head()

Unnamed: 0,Review,Senntiment
0,the build quality feels premium but it doesnt ...,0
1,it fits perfectly in my bag and doesnt take up...,0
2,after a month of use it started acting up,0
3,the colors on the screen are vivid and clear,1
4,i wasnt impressed with how the buttons are lai...,0


In [26]:
# Write the classified reviews to disk without the row index.
output_path = 'classfiedAmazonReview.csv'
new_data.to_csv(output_path, index=False)