In [14]:
import re
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import sys  # Import the sys module
from pathlib import Path  # Import the Path class from pathlib module

# Ensure NLTK resources are downloaded
nltk.download('vader_lexicon', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Add the project root directory to Python path
project_root = str(Path.cwd().parent) if 'notebooks' in str(Path.cwd()) else str(Path.cwd())
if project_root not in sys.path:
    sys.path.append(project_root)

# Custom modules
from src.preprocessing import load_data, preprocess_data, split_data
from src.model import train_model, evaluate_model, save_model, load_model
from src.visualization import plot_confusion_matrix, visualize_shap


In [16]:
# Define paths
RAW_DATA_PATH = r"C:\Users\ELITEBOOK\OneDrive\Desktop\Projects\post-comments-sentiment-Analysis\data\raw\comments_1st.csv"  # Update with your file path
PROCESSED_DATA_PATH = "data/processed/processed_data.csv"
REPORTS_PATH = "reports"

# Ensure the reports directory exists
os.makedirs(REPORTS_PATH, exist_ok=True)

# Load data with specified encoding
data = pd.read_csv(RAW_DATA_PATH, encoding='latin1')
data.head()


Unnamed: 0,Public Identifier,Profile Link,Full Name,Subtitle,Comment Url,comments,Like Count,Comment Count,Is Reply,Is Author,Comment Time
0,ACoAAAArQoYBpAqYrKxJmm8d24JvmnPZJME8u8I,https://www.linkedin.com/in/ACoAAAArQoYBpAqYrK...,Winnie Sun,#WinnieSun ?? ?? 25+ billion impressions share...,https://www.linkedin.com/feed/update/urn:li:ac...,Meeting Marsha Collier in person = priceless! ...,4,4,False,False,"5/23/2018, 2:22:04 AM"
1,ACoAAAATB9sBQ4Lr1QH_HHcaU7nsv0veqUjG0iI,https://www.linkedin.com/in/ACoAAAATB9sBQ4Lr1Q...,Marsha Collier,"47 books: eCommerce, Social Media, Customer Se...",https://www.linkedin.com/feed/update/urn:li:ac...,I figured you'd be working anyway. Thanks so m...,0,0,True,True,"5/26/2018, 1:09:55 AM"
2,ACoAAAAsJKMBhXw2HY7b6BQcG5onjnxpSQusdaw,https://www.linkedin.com/in/ACoAAAAsJKMBhXw2HY...,Brett Gillilan,B2B Consultant,https://www.linkedin.com/feed/update/urn:li:ac...,Do you have the ability to get a full length f...,0,0,False,False,"5/23/2018, 6:08:14 PM"
3,ACoAAAHNFVQBIa-Ul4NAml-iAqsZTAuZvqcGINw,https://www.linkedin.com/in/ACoAAAHNFVQBIa-Ul4...,Melissa Reyes,Operations Manager at CVS Health,https://www.linkedin.com/feed/update/urn:li:ac...,Bummed I am just seeing this! Hope you had a f...,1,1,False,False,"5/25/2018, 2:27:46 PM"
4,ACoAAANmB6kBj8i-jq9oLr67NuxriLKmpuiH6CI,https://www.linkedin.com/in/ACoAAANmB6kBj8i-jq...,Dr. Ai Addyson-Zhang ?? Education Disruptor,?? I help teens & young adults reclaim their c...,https://www.linkedin.com/feed/update/urn:li:ac...,This is soooo awesome! Wish i could join you a...,1,0,False,False,"5/23/2018, 6:50:21 AM"


In [18]:
# Function to map score to sentiment label
def map_score_to_sentiment(score):
    if score > 0.5:
        return 'positive'
    elif score < 0.0:
        return 'negative'
    else:
        return 'neutral'

# Add a new column 'sentiment' with sentiment labels
comments['sentiment'] = comments['score'].apply(map_score_to_sentiment)
comments.head(5)


NameError: name 'comments' is not defined

In [None]:
# Plotting the data
sns.countplot(x='sentiment', data=comment_column)
plt.title

In [None]:
# Conveting our target column "Sentiment" into numerical format. Positve to 2 , Neutral to 1 and negative to 0
comment_column.sentiment.replace("positive", 2, inplace=True)
comment_column.sentiment.replace("neutral", 1, inplace=True)
comment_column.sentiment.replace("negative", 0, inplace=True)
comment_column.head(5)

In [None]:
# @title sentiment

from matplotlib import pyplot as plt
comment_column['sentiment'].plot(kind='line', figsize=(8, 4), title='sentiment')
plt.gca().spines[['top', 'right']].set_visible(False)

In [None]:
# @title score

from matplotlib import pyplot as plt
comment_column['score'].plot(kind='line', figsize=(8, 4), title='score')
plt.gca().spines[['top', 'right']].set_visible(False)

In [None]:
# Text Preprocessing

def preprocess_text(text):
    """
    Preprocesses a given text by:
    1. Removing URLs
    2. Removing HTML tags
    3. Removing noise texts
    4. Converting text to lowercase
    5. Removing punctuation
    6. Removing numbers
    7. Tokenization
    8. Removing stopwords
    9. Stemming
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove noise texts (you can customize this based on your data)
    noise_list = ['@username', 'RT', '&amp;', '...', 'etc.']
    for noise in noise_list:
        text = text.replace(noise, '')

    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Join the stemmed tokens back into a single string
    preprocessed_text = ' '.join(stemmed_tokens)

    # Ensure the preprocessed text contains only alphabetic characters
    preprocessed_text = re.sub(r'[^a-zA-Z\s]', '', preprocessed_text)

    return preprocessed_text.strip()  # Remove leading and trailing whitespaces

# Preprocess the comments
comment_column['preprocessed_comments'] = comment_column['comments'].apply(preprocess_text)
comment_column.head(5)

In [15]:
# Convert preprocessed comments to TF-IDF representation
# Initialize TfidfVectorizer to create TF-IDF representation
vectorizer = CountVectorizer()

# Fit and transform the preprocessed comments to create TF-IDF representation
tfidf_representation = vectorizer.fit_transform(comment_column['preprocessed_comments'])


In [16]:
# Defining input and target variable
X = vectorizer.fit_transform(comment_column['preprocessed_comments'])
y = comment_column['sentiment']

# Splitting the data into training and testing data set
test_size = 0.2
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)


In [None]:
# Initialize the logistic regression classifier
#classifier = RandomForestClassifier(**best_params)
classifier = XGBClassifier(objective='multi:softmax')

# Train the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)


In [None]:
# Calculate confusion matrix for testing data
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Testing Data')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
