##### Import the packages

In [2]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

#### Read Train Dataset

In [4]:
# Load the training split. The CSV is read with header=None, so the three
# column names are supplied here and the index is labelled review_id.
train_columns = ['class_index', 'review_title', 'review_text']
df_train = pd.read_csv(
    'D:/data set/amazon_review_polarity_csv/amazon_review_polarity_csv/train.csv',
    header=None,
    names=train_columns,
)
df_train.index.name = 'review_id'
df_train.head(5)

Unnamed: 0_level_0,class_index,review_title,review_text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


#### Read Test Dataset

In [6]:
# Load the test split with the same header-less layout as the training file:
# column names are supplied explicitly and the index is labelled review_id.
test_columns = ['class_index', 'review_title', 'review_text']
df_test = pd.read_csv(
    'D:/data set/amazon_review_polarity_csv/amazon_review_polarity_csv/test.csv',
    header=None,
    names=test_columns,
)
df_test.index.name = 'review_id'

In [7]:
# Preview the first five rows of the test frame.
df_test.head(5)

Unnamed: 0_level_0,class_index,review_title,review_text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


#### Inspect the data

In [8]:
# Dimensions of the training frame as (rows, columns).
df_train.shape

(3600000, 3)

In [9]:
# Dimensions of the test frame as (rows, columns).
df_test.shape

(400000, 3)

In [10]:
# Column names, dtypes and memory usage of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   class_index   int64 
 1   review_title  object
 2   review_text   object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB


In [11]:
# Count missing values per column in the training frame.
df_train.isnull().sum()

class_index       0
review_title    207
review_text       0
dtype: int64

In [12]:
# Count missing values per column in the test frame.
df_test.isnull().sum()

class_index      0
review_title    24
review_text      0
dtype: int64

In [13]:
# Number of rows in the training frame that exactly duplicate an earlier row.
df_train.duplicated().sum()

0

In [14]:
# Number of rows in the test frame that exactly duplicate an earlier row.
df_test.duplicated().sum()

0

In [15]:
# Baseline inputs are the raw review bodies; targets are the class_index labels.
X_train, y_train = df_train['review_text'], df_train['class_index']
X_test, y_test = df_test['review_text'], df_test['class_index']

In [None]:
# Vectorize the review text with TF-IDF. No custom cleaning step is applied to
# the text beforehand, but the vectorizer's own settings still shape the
# vocabulary: English stop words are dropped, terms must appear in at least
# 2 documents (min_df=2) and in at most 95% of them (max_df=0.95), and both
# unigrams and bigrams are generated (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, ngram_range=(1, 2), stop_words='english')

# Learn the vocabulary and IDF weights from the training text only, then apply
# that same fitted mapping to the test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Baseline classifier: logistic regression allowed up to 1000 solver iterations.
# NOTE(review): presumably raised so the solver can converge on the large
# TF-IDF matrix — confirm.
model = LogisticRegression(max_iter=1000)

In [None]:
# Train the model on the TF-IDF features and class labels of the training set
model.fit(X_train_tfidf, y_train)

In [None]:
# Predict a class label for every review in the test matrix
y_pred = model.predict(X_test_tfidf)

# Evaluate the baseline: overall accuracy followed by the per-class
# classification report, both computed against the true test labels
print(f'Initial Accuracy (no cleaning): {accuracy_score(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

In [None]:
# Bar chart of how many training reviews carry each class label
class_axes = sns.countplot(x=y_train)
class_axes.set_title("Class Distribution in Training Set")
class_axes.set_xlabel("Class Index (1 = Negative, 2 = Positive)")
class_axes.set_ylabel("Count")
plt.show()


In [None]:
# Number of whitespace-separated words in each review body
df_train['word_count'] = df_train['review_text'].map(lambda review: len(review.split()))

# Histogram of the word counts (50 bins) with a KDE curve overlaid
word_count_axes = sns.histplot(df_train['word_count'], bins=50, kde=True)
word_count_axes.set_title("Distribution of Word Counts in Reviews")
word_count_axes.set_xlabel("Word Count")
word_count_axes.set_ylabel("Frequency")
plt.show()

In [None]:
from collections import Counter
from wordcloud import WordCloud

def show_wordcloud(text, title, background_color):
    """Build a word cloud from `text`, display it, and return it.

    Parameters
    ----------
    text : str
        The full corpus to summarise, as one string.
    title : str
        Title drawn above the image.
    background_color : str
        Background colour passed to WordCloud.

    Returns
    -------
    WordCloud
        The generated cloud, so the caller can keep or reuse it.
    """
    cloud = WordCloud(width=800, height=400, background_color=background_color).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()
    return cloud


# Generate WordCloud for positive reviews (class_index == 2)
positive_text = " ".join(df_train[df_train['class_index'] == 2]['review_text'])
wordcloud_pos = show_wordcloud(positive_text, "WordCloud for Positive Reviews", 'white')

# Generate WordCloud for negative reviews (class_index == 1)
negative_text = " ".join(df_train[df_train['class_index'] == 1]['review_text'])
wordcloud_neg = show_wordcloud(negative_text, "WordCloud for Negative Reviews", 'black')


In [None]:
# Get feature names and the coefficient the model learned for each one
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_.flatten()

# Sort the coefficients once, then slice both ends: the last `top_n` entries
# are the largest coefficients, the first `top_n` the smallest.
top_n = 20
coef_order = coefs.argsort()
top_positive_indices = coef_order[-top_n:]
top_negative_indices = coef_order[:top_n]


def plot_top_coefficients(indices, color, title):
    """Draw a horizontal bar chart of the coefficients at `indices`.

    Parameters
    ----------
    indices : array-like of int
        Positions into `coefs` / `feature_names` to plot, bottom bar first.
    color : str
        Bar colour.
    title : str
        Figure title.
    """
    # Size the axis from the indices actually supplied so the chart still
    # works when the vocabulary has fewer than `top_n` features.
    positions = range(len(indices))
    plt.figure(figsize=(10, 5))
    plt.barh(positions, coefs[indices], align='center', color=color)
    plt.yticks(positions, [feature_names[i] for i in indices])
    plt.title(title)
    plt.show()


# Plot
plot_top_coefficients(top_positive_indices, 'green', "Top Positive Words (TF-IDF)")
plot_top_coefficients(top_negative_indices, 'red', "Top Negative Words (TF-IDF)")


In [None]:
# Remove the helper column that was only needed for the word-count histogram.
# errors='ignore' keeps this cell re-runnable: executing it a second time is a
# no-op instead of raising KeyError for the already-dropped column.
df_train.drop(columns=['word_count'], inplace=True, errors='ignore')

In [None]:
# Derive a 0/1 `label` column from class_index.
# NOTE(review): class 2 is mapped to 0 and class 1 to 1. The class-distribution
# plot labels 1 = Negative and 2 = Positive, so label 1 marks a negative review
# here — confirm this orientation is intended.
df_train['label'] = df_train.class_index.map({2:0, 1:1})

In [None]:
# Preview the training frame after adding the `label` column.
df_train.head(5)

In [None]:
# Build "<title> <text>" per review; a missing title contributes an empty string.
train_titles = df_train['review_title'].fillna('')
df_train['full_review'] = train_titles.str.cat(df_train['review_text'], sep=" ")

In [None]:
# Preview the training frame after adding the `full_review` column.
df_train.head(5)

In [None]:
# Show the training rows whose review_title is missing.
df_train[df_train.review_title.isnull()]

In [None]:
# Check the average length, in characters, of a full review.
# The length is taken per row and then averaged, so every review contributes
# to the mean (not just the first one).
full_review_avg_length = df_train['full_review'].str.len().mean()
print(full_review_avg_length)

In [None]:
# Check for very short reviews: fewer than `min_words` whitespace-separated words
min_words = 10
short_reviews = df_train[df_train['full_review'].apply(lambda x: len(x.split()) < min_words)]

print(f"Number of short reviews: {len(short_reviews)}")

# Kept as the cell's final expression so the notebook actually displays the
# per-label breakdown; a bare expression in the middle of a cell is discarded.
short_reviews.label.value_counts()

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Replace missing review titles with an empty string in both splits
for frame in (df_train, df_test):
    frame['review_title'] = frame['review_title'].fillna('')

# Build the test-set full_review column as "<title> <text>"
df_test['full_review'] = df_test['review_title'].str.cat(df_test['review_text'], sep=" ")


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from cleantext import clean
from joblib import Parallel, delayed
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from tqdm import tqdm

In [None]:
# Download necessary NLTK datasets: the stop-word lists, the punkt tokenizer
# data, and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Initialize the NLTK WordNet lemmatizer and load the English stop-word list.
# The list is stored as a set so set difference and membership tests can be
# used on it later.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [None]:
# Negation words that must NOT be treated as stop words (e.g., "not", "no", "nor")
stopwords_to_keep = {"not", "no", "nor"}
# Stop words that will actually be removed: the English list minus the negations above
custom_stopwords = stop_words - stopwords_to_keep
# Pre-compile regex patterns for optimization
# Matches any character immediately repeated 3 or more times (a run of 4+ identical characters)
repeated_char_pattern = re.compile(r'(.)\1{3,}')
# Matches a word followed by one or more space-separated repetitions of itself
repeated_word_pattern = re.compile(r'\b(\w+)( \1\b)+')

In [None]:

### Step 1: Basic Cleaning ###
# Compiled once up front, like the notebook's other regex patterns.
# `http\S+` already covers https links, so no separate https alternative is needed.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_EMAIL_PATTERN = re.compile(r"\S+@\S+\.\S+")
# Removing every digit run also removes long phone-number-like sequences,
# so a dedicated phone-number pass would be redundant.
_DIGIT_PATTERN = re.compile(r"\d+")
_PUNCTUATION_PATTERN = re.compile(r"[^\w\s]")


def clean_text_step(text):
    """Lowercase `text` and strip URLs, e-mail addresses, digits and punctuation.

    The steps run in a fixed order: lowercase, remove URLs (tokens starting
    with "http" or "www"), remove e-mail-like tokens, remove all digits, then
    remove every character that is neither a word character nor whitespace.
    Removed spans are deleted, not replaced, so the surrounding whitespace is
    left as it was (double spaces can remain).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    text = text.lower()

    # URLs and e-mails go first, while their punctuation is still present
    # for the patterns to match against.
    text = _URL_PATTERN.sub("", text)
    text = _EMAIL_PATTERN.sub("", text)

    text = _DIGIT_PATTERN.sub("", text)

    # Underscores survive this step because they count as word characters.
    return _PUNCTUATION_PATTERN.sub("", text)


### Step 2: Remove Gibberish ###
def remove_gibberish_step(text):
    """Collapse stretched characters and immediately repeated words.

    First every match of `repeated_char_pattern` is replaced by its single
    captured character, then every match of `repeated_word_pattern` is
    replaced by its single captured word.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The text with both kinds of repetition collapsed.
    """
    chars_collapsed = repeated_char_pattern.sub(r'\1', text)
    return repeated_word_pattern.sub(r'\1', chars_collapsed)

### Step 3: Tokenization ###
def tokenize_step(text):
    """Split `text` into word tokens.

    A token is a maximal run of word characters (letters, digits, underscore)
    bounded by word boundaries; everything else acts as a separator.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty when there are none.
    """
    return [match.group() for match in re.finditer(r'\b\w+\b', text)]


### Step 4: Lemmatization and Stopword Removal ###
def lemmatize_and_remove_stopwords_step(tokens):
    """Drop stop words and non-alphabetic tokens, then lemmatize the rest.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter and lemmatize.

    Returns
    -------
    list of str
        The lemmatized, lowercased form of every kept token, in input order.
    """
    lemmas = []
    for token in tokens:
        lowered = token.lower()
        # Skip anything listed in custom_stopwords or containing non-letters.
        if lowered in custom_stopwords or not token.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return lemmas

### Combined Preprocessing Function ###
def preprocess_text(text):
    """Run the full preprocessing pipeline on one review.

    Applies, in order: clean_text_step, remove_gibberish_step, tokenize_step
    and lemmatize_and_remove_stopwords_step.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    normalized = remove_gibberish_step(clean_text_step(text))
    lemmas = lemmatize_and_remove_stopwords_step(tokenize_step(normalized))
    return " ".join(lemmas)

In [None]:
# Stage 1: apply the basic cleaning step to the full review of both splits
for frame in (df_train, df_test):
    frame['cleaned_text'] = frame['full_review'].apply(clean_text_step)

In [None]:
# Stage 2: collapse repeated characters and words in the cleaned text of both splits
for frame in (df_train, df_test):
    frame['no_gibberish'] = frame['cleaned_text'].apply(remove_gibberish_step)

In [None]:
# Stage 3: tokenize the de-gibberished text of both splits
for frame in (df_train, df_test):
    frame['tokens'] = frame['no_gibberish'].apply(tokenize_step)

In [None]:
def join_lemmas(tokens):
    """Lemmatize/filter one token list and join the result into a single string."""
    return " ".join(lemmatize_and_remove_stopwords_step(tokens))


# Stage 4: lemmatize and drop stop words.
# The `tokens` column already holds full_review after cleaning, gibberish
# removal and tokenization, so only the last stage is applied here. This gives
# the same strings as preprocess_text(full_review) without repeating the three
# earlier regex passes over every review.
df_train['lemmatized_text'] = df_train['tokens'].apply(join_lemmas)
df_test['lemmatized_text'] = df_test['tokens'].apply(join_lemmas)

In [None]:
# Preview the training frame with the preprocessing columns added.
df_train.head(5)

In [None]:
# Preview the test frame with the preprocessing columns added.
df_test.head(5)

In [None]:
# Features for the cleaned experiment come from the `no_gibberish` column;
# targets are still the original class_index labels.
X_train_clean, y_train_clean = df_train['no_gibberish'], df_train['class_index']
X_test_clean, y_test_clean = df_test['no_gibberish'], df_test['class_index']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorize the cleaned text data using TF-IDF.
# Unlike the baseline vectorizer, no ngram_range or stop_words is passed here,
# and the vocabulary is capped with max_features.
vectorizer_clean = TfidfVectorizer(
    min_df=2,      # ignore terms that appear in fewer than 2 documents
    max_df=0.95,   # ignore terms that appear in more than 95% of documents
    max_features=20000  # cap the vocabulary at 20,000 terms
)

# Fit the vectorizer on cleaned training data and transform both train and test sets.
# Both results are converted to CSC sparse format.
# NOTE(review): presumably CSC is wanted for column-oriented access downstream — confirm.
X_train_tfidf_clean = vectorizer_clean.fit_transform(X_train_clean).tocsc()
X_test_tfidf_clean = vectorizer_clean.transform(X_test_clean).tocsc()