In [1]:
!pip install pandas scikit-learn nltk



In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from nltk import download

# Download NLTK resources
download('punkt')
download('stopwords')
download('wordnet')

# Load the dataset and inspect its structure
file_path = "C:/Users/basha/Downloads/NLP Data Set - Dataset 1 .csv"
df = pd.read_csv(file_path, encoding='utf-8')

# Handle any missing values and duplicates
print(df.isnull().sum())

# Preprocess the text data
nltk_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in nltk_stopwords])
    return text

# Apply text preprocessing
df['Cleaned_Text'] = df['Review Text'].apply(preprocess_text)

# Tokenize the cleaned text
df['Tokens'] = df['Cleaned_Text'].apply(word_tokenize)

# Convert tokenized text into numerical vectors using TF-IDF transformation
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf_vectorizer.fit_transform(df['Cleaned_Text'])
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Analyze the most important features using the TF-IDF matrix
feature_names = tfidf_vectorizer.get_feature_names_out()
coef = model.coef_.flatten()

# Display the top words contributing to positive and negative sentiments
top_positive_words = [feature_names[i] for i in coef.argsort()[-10:][::-1]]
top_negative_words = [feature_names[i] for i in coef.argsort()[:10]]

print("Top Positive Words:", top_positive_words)
print("Top Negative Words:", top_negative_words)

# Process each review
for index, row in df.iterrows():
    print("\nReview:", row['Review Text'])
    print("Tokens:", row['Tokens'])
    
    # Stem each token
    stemmed_tokens = [ps.stem(token) for token in row['Tokens']]
    print("Stemmed Tokens:", stemmed_tokens)
    
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in row['Tokens']]
    print("Lemmatized Tokens:", lemmatized_tokens)


Review Text    0
Rating         0
Sentiment      0
dtype: int64
Accuracy: 0.00
Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       2.0
    Positive       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Confusion Matrix:
[[0 2]
 [0 0]]
Top Positive Words: ['product', 'amazing', 'value', 'buy', 'excellent', 'purchase', 'wonderful', 'money', 'love', 'exceeded']
Top Negative Words: ['disappointed', 'terrible', 'buying', 'experience', 'worth', 'customer', 'service', 'price', 'better', 'quality']

Review: This product is amazing!
Tokens: ['product', 'amazing']
Stemmed Tokens: ['product', 'amaz']
Lemmatized Tokens: ['product', 'amazing']

Review: The quality is very poor.
Tokens: ['quality', 'poor']
Stemmed Tokens: ['qualiti', 'poor']
Lemmatized Tokens: ['quality', 'poo

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\basha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\basha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
