In [38]:
import os
import re
from collections import Counter
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nlppreprocess import NLP
from nltk.corpus import stopwords
from nltk.util import ngrams
from textblob import TextBlob
from wordcloud import WordCloud

In [None]:
# Fetch the NLTK resources this notebook asks for.
# 'stopwords' backs the stopwords.words('english') lookup used for stop-word removal.
nltk.download('stopwords')
# NOTE(review): no visible cell uses the punkt tokenizer (text is split with
# str.split) — confirm whether this download is still needed.
nltk.download('punkt')

In [40]:
# Location of the comments CSV. Set the COMMENTS_DATA_PATH environment variable
# to run the notebook on another machine; the fallback is the original author's
# local path, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'COMMENTS_DATA_PATH',
    '/Users/aryan/Coding/Projects/Samsung review project/Data/comments_data.csv',
)
df = pd.read_csv(DATA_PATH)

In [41]:
# Remove every row that has a missing value in any column (modifies df in place).
df.dropna(inplace=True)

In [42]:
# Run each raw comment through the nlppreprocess pipeline (default settings)
# and keep the result next to the original text.
# NOTE(review): presumably nlp.process returns a cleaned string — confirm against nlppreprocess.
nlp = NLP()
raw_comments = df['Comment']
df['C_Comment'] = raw_comments.map(nlp.process)

In [43]:

@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def remove_stop_words(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, every token whose lower-cased form is in
    ``stop_words`` is dropped, and the remaining tokens are re-joined with
    single spaces (original casing of kept tokens is preserved).

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : collection of str, optional
        Lower-case words to remove. Defaults to NLTK's English stop-word list,
        which is cached so it is not re-read for every row when this function
        is used with ``Series.apply``.

    Returns
    -------
    str
        The filtered text; an empty string if nothing is left.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Membership is tested on the lower-cased token so 'The' and 'the' match alike
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [44]:
# Strip stop words from the pre-processed comments
preprocessed_comments = df['C_Comment']
df['cleaned_comment'] = preprocessed_comments.map(remove_stop_words)

In [45]:
# Sentiment analysis
# Score every comment once with TextBlob and reuse the result for both columns
# (running the analysis separately per column doubles the work).
sentiment_scores = [TextBlob(text).sentiment for text in df['cleaned_comment']]

# 'sentiment' holds TextBlob's polarity score.
df['sentiment'] = pd.Series(
    [score.polarity for score in sentiment_scores], index=df.index, dtype=float
)
# NOTE: despite its name, 'Polarity' holds TextBlob's *subjectivity* score.
# The column name is kept because later cells read it.
df['Polarity'] = pd.Series(
    [score.subjectivity for score in sentiment_scores], index=df.index, dtype=float
)
# Label by the sign of the polarity score: > 0 positive, < 0 negative, exactly 0 neutral
df['sentiment_label'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

In [46]:
# Whitespace-tokenise each cleaned comment into a list of words
df['words'] = df['cleaned_comment'].map(str.split)

In [47]:
## Data cleaning: remove hand-picked rows
# Rows whose cleaned comment contains 'ooh', and rows posted by the
# 'samgold9151' account, are excluded from the analysis.
matches_ooh = df['cleaned_comment'].str.contains('ooh')
from_excluded_user = df['Username'].str.contains('samgold9151')
drop_index = df.index[matches_ooh | from_excluded_user]
df.drop(drop_index, inplace=True)

In [None]:
# Word frequencies across all positive / all negative comments
positive_words = Counter(
    word
    for words in df.loc[df['sentiment_label'] == 'positive', 'words']
    for word in words
)
negative_words = Counter(
    word
    for words in df.loc[df['sentiment_label'] == 'negative', 'words']
    for word in words
)

# Visualization
def plot_Wwords(word_counts, title, top_n=30):
    """Draw a word cloud of the most frequent words in ``word_counts``.

    Parameters
    ----------
    word_counts : collections.Counter
        Word -> frequency counts.
    title : str
        Title shown above the word cloud.
    top_n : int, optional
        How many of the most common words to include (default 30).

    Returns
    -------
    None
        The figure is shown as a side effect. Nothing is drawn when
        ``word_counts`` is empty.
    """
    top_words = dict(word_counts.most_common(top_n))
    if not top_words:
        # WordCloud.generate_from_frequencies raises ValueError when given no
        # words (e.g. no comment passed a strict sentiment threshold).
        print(f"No words to plot for: {title}")
        return

    wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(top_words)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

def plot_Bwords(words, title):
    """Draw a horizontal bar chart of label -> count pairs.

    Parameters
    ----------
    words : mapping or iterable of (label, count) pairs
        For example the list returned by ``Counter.most_common(n)``.
    title : str
        Title shown above the chart.

    Returns
    -------
    None
        The figure is shown as a side effect. Nothing is drawn when ``words``
        is empty.
    """
    words = dict(words)
    if not words:
        # Nothing to chart; skip creating a blank figure.
        print(f"No words to plot for: {title}")
        return

    plt.figure(figsize=(10, 5))
    # Pass plain lists rather than dict views so seaborn receives ordinary sequences
    sns.barplot(x=list(words.values()), y=list(words.keys()))
    plt.title(title)
    plt.xlabel('Count')
    plt.show()


# Word clouds for the positive and the negative comments
for cloud_counts, cloud_title in (
    (positive_words, 'Top 30 Positive Words'),
    (negative_words, 'Top 30 Negative Words'),
):
    plot_Wwords(cloud_counts, cloud_title)



In [None]:
# Bar charts of the ten most frequent words per sentiment class
for bar_counts, bar_title in (
    (positive_words, 'Top 10  positive words'),
    (negative_words, 'Top 10  negative words'),
):
    plot_Bwords(bar_counts.most_common(10), bar_title)

In [None]:
# Most common words in strongly positive (polarity > 0.75) and strongly
# negative (polarity < -0.75) comments
high_pos = df.loc[df['sentiment'] > 0.75, 'words'].copy()
high_neg = df.loc[df['sentiment'] < -0.75, 'words'].copy()
high_pos_words = Counter(word for words in high_pos for word in words)
high_neg_words = Counter(word for words in high_neg for word in words)

# Visualise both groups as word clouds
for strong_counts, strong_title in (
    (high_pos_words, 'Top 30 highly positive words'),
    (high_neg_words, 'Top 30 highly negative words'),
):
    plot_Wwords(strong_counts, strong_title)

In [None]:
# Inspect the ten most frequent words among the strongly positive comments
high_pos_words.most_common(10)

In [None]:
# Bar charts of the ten most frequent words in the strongly polarised comments
for strong_bar_counts, strong_bar_title in (
    (high_pos_words, 'Top 10 highly positive words'),
    (high_neg_words, 'Top 10 highly negative words'),
):
    plot_Bwords(strong_bar_counts.most_common(10), strong_bar_title)

In [None]:
positive_bigrams.most_common(10)

In [None]:
# Positive phrases
def get_bigrams(text):
    text = text.lower()
    text = text.split()
    return list(ngrams(text, 2))

df['bigrams'] = df['cleaned_comment'].apply(get_bigrams)

positive_bigrams = Counter([phrase for phrases in df[df['sentiment_label'] == 'positive']['bigrams'] for phrase in phrases])
negative_bigrams = Counter([phrase for phrases in df[df['sentiment_label'] == 'negative']['bigrams'] for phrase in phrases])

#Visualizing
postivive_phrases = {','.join(phrases) : count for phrases,count in positive_bigrams.most_common(10)}
negative_phrases = {','.join(phrases) : count for phrases,count in negative_bigrams.most_common(10)}

def plot_phrases(phrases,title):
  plt.figure(figsize=(10,5))
  sns.barplot(x=phrases.values(),y=phrases.keys())
  plt.title(title)
  plt.xlabel('Count')
  plt.show()

plot_phrases(postivive_phrases,'Top 10 most common positive phraes')
plot_phrases(negative_phrases,'Top 10 most common negative phraes')

In [None]:
# Length vs Sentiment
# Character length of each cleaned comment, then one density curve per
# sentiment label (evaluation range clipped to -10..300).
df['comment_length'] = df['cleaned_comment'].str.len()
sns.kdeplot(data=df, x='comment_length', hue='sentiment_label', clip=(-10, 300))
plt.title('Comment Length vs Sentiment')
plt.show()

In [None]:
# Sentiment Distribution
# Histogram with a KDE overlay; the count axis is logarithmic.
sentiment_hist_ax = sns.histplot(data=df, x='sentiment', kde=True)
sentiment_hist_ax.set_yscale('log')
plt.title('Sentiment Score Distribution')
plt.show()

In [None]:
# Polarity vs subjectivity, coloured by sentiment label.
# The 'sentiment' column holds TextBlob's polarity and the 'Polarity' column
# holds TextBlob's subjectivity, so the axes are labelled explicitly to avoid
# a y-axis that reads "Polarity" while showing subjectivity.
polarity_subjectivity_ax = sns.scatterplot(x='sentiment', y='Polarity', data=df, hue='sentiment_label')
polarity_subjectivity_ax.set(
    xlabel='Polarity (sentiment score)',
    ylabel='Subjectivity',
    title='Polarity vs Subjectivity',
)
plt.show()

In [None]:
# Preview the first rows instead of dumping the whole frame
df.head()

In [None]:
# Keep only the comments labelled negative
negative_df = df.loc[df['sentiment_label'] == 'negative']

# Among those, keep the comments whose cleaned text contains the word 'stupid'
mentions_keyword = negative_df['cleaned_comment'].str.contains('stupid')
filtered_df = negative_df.loc[mentions_keyword]
filtered_df

In [None]:
# Preview the two columns the modeling section uses (text and label)
# rather than dumping the whole frame a second time
df[['cleaned_comment', 'sentiment_label']].head()

### Modeling

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# 1. Inputs: cleaned comment text and the TextBlob-derived sentiment label.
# NOTE: the labels come from TextBlob polarity computed on this same text, so
# the scores below measure how well the model reproduces TextBlob's labelling,
# not agreement with human-annotated sentiment.
comments = df['cleaned_comment'].values
labels = df['sentiment_label'].values

# 2. Split Data
# stratify keeps the class proportions the same in the train and test sets,
# which matters when the sentiment classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.2, random_state=42, stratify=labels
)

# 3. Feature Extraction with TF-IDF
# The vectorizer is fitted on the training split only and then applied to the test split.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')  # You can adjust max_features
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 4. Model Training (Logistic Regression)
model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
model.fit(X_train_vec, y_train)

# 5. Prediction and Evaluation
y_pred = model.predict(X_test_vec)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate on the raw training text with TF-IDF inside the pipeline, so
# the vectorizer is re-fitted on each fold's training part only. Scoring a
# matrix that was vectorized on the whole training set lets every fold's
# held-out comments influence the vocabulary and IDF weights.
cv_model = make_pipeline(
    TfidfVectorizer(max_features=5000, stop_words='english'),
    LogisticRegression(max_iter=1000),
)
cv_scores = cross_val_score(cv_model, X_train, y_train, cv=5)

print(f"Cross-validation scores: {cv_scores}")
print(f"Average CV accuracy: {cv_scores.mean()}")

In [None]:
# Share of each sentiment label, as a percentage of all rows
df['sentiment_label'].value_counts().div(len(df)).mul(100)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE 


# Inputs: cleaned comment text and the TextBlob-derived sentiment label
comments = df['cleaned_comment'].values
labels = df['sentiment_label'].values

# 1. Split Data
# stratify keeps the class proportions the same in the train and test sets,
# so the untouched test set reflects the real class balance.
X_train, X_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.2, random_state=42, stratify=labels
)

# 2. Feature Extraction with TF-IDF (fitted on the training split only)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Handle Class Imbalance (SMOTE)
# Only the training data is resampled; the test set is left as it is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vec, y_train)

# 4. Model Training

# Logistic Regression
# NOTE(review): class_weight='balanced' should have little effect on data
# SMOTE has already balanced — confirm whether both are intended.
model_lr = LogisticRegression(max_iter=1000, class_weight='balanced')
model_lr.fit(X_train_resampled, y_train_resampled)

# Decision Tree Classifier
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train_resampled, y_train_resampled)

# 5. Prediction and Evaluation (on the original, non-resampled test set)

# Logistic Regression
y_pred_lr = model_lr.predict(X_test_vec)
print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr)}")
print(classification_report(y_test, y_pred_lr))

# Decision Tree
y_pred_dt = model_dt.predict(X_test_vec)
print("\nDecision Tree:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt)}")
print(classification_report(y_test, y_pred_dt))