In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os
import sys
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Add project root to path for importing custom modules
project_root = str(Path.cwd().parent) if 'notebooks' in str(Path.cwd()) else str(Path.cwd())
if project_root not in sys.path:
    sys.path.append(project_root)

# Download required NLTK data
import nltk
nltk.download('vader_lexicon', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

In [None]:
# First check if your preprocessing module is accessible
import src.preprocessing
print(dir(src.preprocessing))  # This will show all available functions

In [None]:
# Determine the correct path to the data file
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    data_path = current_dir.parent / 'data' / 'raw' / 'comments_1st.csv'
else:
    data_path = current_dir / 'data' / 'raw' / 'comments_1st.csv'

# Load data
print("Loading data...")
print(f"Attempting to load from: {data_path}")
data = pd.read_csv(str(data_path), encoding='Windows-1252', engine='python', on_bad_lines='skip', encoding_errors='replace')

if data is not None:
    print("\nData loaded successfully!")
    print("Dataset Shape:", data.shape)
    print("\nColumns:", data.columns.tolist())
    print("\nMissing Values:\n", data.isnull().sum())
else:
    print("Failed to load data file.")

# Preview the data
print("\nFirst few rows of the data:")
print(data.head())


In [36]:
# Comment Length Analysis
data['comment_length'] = data['comments'].str.len()
plt.figure(figsize=(10, 6))
sns.histplot(data=data['comment_length'], bins=50)
plt.title('Distribution of Comment Lengths')
plt.xlabel('Comment Length')
plt.ylabel('Count')
reports_dir = os.path.join(project_root, 'reports')
if not os.path.exists(reports_dir):
    os.makedirs(reports_dir)
plt.savefig(os.path.join(reports_dir, 'comment_length_distribution.png'))
plt.close()


In [37]:
# Sentiment Analysis with VADER
sia = SentimentIntensityAnalyzer()

def assign_sentiment_scores(text):
    scores = sia.polarity_scores(text)
    return scores['compound']

def assign_scores(data):
    data['sentiment'] = data['comments'].apply(assign_sentiment_scores)
    return data

def assign_directions(data):
    data['mood'] = data['sentiment'].apply(lambda x: 'negative' if x < 0.0 else ('neutral' if 0.0 <= x < 0.4 else 'positive'))
    data['target'] = data['sentiment'].apply(lambda x: 2 if x < 0.0 else (1 if 0.0 <= x < 0.4 else 0))
    return data

# Assign sentiment scores and directions
data = assign_scores(data)
data = assign_directions(data)  # Call assign_directions before plotting

# Plot histogram of sentiments
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='mood')  # Changed to countplot for categorical data
plt.title('Distribution of Sentiment Categories')
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.savefig(os.path.join(reports_dir, 'sentiment_category_distribution.png'))
plt.close()

# Plot bar chart of sentiment categories
sentiment_counts = data['target'].value_counts()
plt.figure(figsize=(10, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
plt.title('Distribution of Sentiment Categories')
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.savefig(os.path.join(reports_dir, 'sentiment_category_distribution.png'))
plt.close()

In [None]:
# First, let's check the data types and structure
print("Data info:")
print(data.info())

print("\nSample of comments column:")
print(data['comments'].head(2))

In [None]:
from preprocessing import preprocess_data 

# Preprocess the comments
print("Preprocessing comments...")
data = preprocess_data(data)

# Check the results
print("\nFirst few processed comments:")
print(data[['comments', 'processed_comments']].head())

In [None]:
# Save Processed Data
print("\nSaving processed data...")
processed_data_path = os.path.join(project_root, 'data', 'processed')
if not os.path.exists(processed_data_path):
    os.makedirs(processed_data_path)
data.to_csv(os.path.join(processed_data_path, 'processed_data.csv'), index=False)


In [None]:
from preprocessing import prepare_dataset 

# Prepare data for training
print("\nPreparing dataset...")
X_train, X_test, y_train, y_test, vectorizer = prepare_dataset(data)

# Verify dataset sizes
print("\nFirst few lines of training data:")
print(pd.DataFrame(y_train, columns=['target']).head())

print("\nClass distribution in training data:")
print(pd.DataFrame(y_train, columns=['target']).target.value_counts())

In [None]:
# Import the train_model function
from src.model import train_model

# Train the model
print("\nTraining the XGBoost model...")
model = train_model(X_train, y_train)


In [112]:
from src.model import evaluate_model
from src.visualization import plot_confusion_matrix

# Evaluate the model
print("Evaluating the model...")
eval_results = evaluate_model(model, X_test, y_test)

# Check if eval_results is not None
if eval_results is not None:
    # Plot confusion matrix
    plot_confusion_matrix(y_test, eval_results['predictions'])
    plt.savefig(os.path.join(reports_dir, 'confusion_matrix.png'))
    plt.close()
else:
    print("Error: evaluate_model function returned None.")

Evaluating the model...
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.25      1.00      0.40         5
           2       0.00      0.00      0.00         2

    accuracy                           0.25        20
   macro avg       0.08      0.33      0.13        20
weighted avg       0.06      0.25      0.10        20

Accuracy: 0.25
Error: evaluate_model function returned None.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
import joblib

# Save the model for later use
model_dir = os.path.join(project_root, 'models')
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

model_path = os.path.join(model_dir, 'comment_sentiments_model.pkl')
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")


Model saved to c:\Users\ELITEBOOK\OneDrive\Desktop\Projects\post-comments-sentiment-Analysis\models\comment_sentiments_model.pkl
