In [19]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os
import sys
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Add project root to path for importing custom modules.
# The directory *name* is compared instead of searching the whole path string:
# a substring test also matches paths such as ".../my-notebooks-repo/" and
# would then wrongly climb one level above the real project root. This is the
# same check the data-loading cell uses to locate the data directory.
_cwd = Path.cwd()
project_root = str(_cwd.parent if _cwd.name == 'notebooks' else _cwd)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom modules
# NOTE(review): the recorded run of this cell fails on the next statement with
# "ImportError: cannot import name 'prepare_dataset' from 'src.preprocessing'".
# Because the whole statement fails, nothing from this point on in the cell is
# executed on a fresh kernel, and the later prepare_dataset(...) call raises
# NameError. Add prepare_dataset to src/preprocessing.py, or import it from the
# module that actually defines it, then re-run with Restart Kernel -> Run All.
from src.preprocessing import load_data, preprocess_data, prepare_dataset
from src.model import train_model, evaluate_model, save_model
from src.visualization import plot_confusion_matrix, plot_feature_importance

# Download required NLTK data: VADER lexicon, stopwords, punkt and WordNet
import nltk

for resource_name in ('vader_lexicon', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name, quiet=True)

ImportError: cannot import name 'prepare_dataset' from 'src.preprocessing' (c:\Users\ELITEBOOK\OneDrive\Desktop\Projects\post-comments-sentiment-Analysis\src\preprocessing.py)

In [2]:
# Determine the correct path to the data file: the notebook may be started
# either from the project root or from its 'notebooks' sub-directory.
current_dir = Path.cwd()
base_dir = current_dir.parent if current_dir.name == 'notebooks' else current_dir
data_path = base_dir / 'data' / 'raw' / 'comments_1st.csv'

# Load data
print("Loading data...")
print(f"Attempting to load from: {data_path}")

# pd.read_csv never returns None -- it raises on failure -- so a missing file
# is reported explicitly here instead of through an unreachable else-branch.
if not data_path.is_file():
    raise FileNotFoundError(f"Failed to load data file: {data_path}")

# NOTE: on_bad_lines='skip' drops malformed rows without warning and
# encoding_errors='replace' substitutes undecodable bytes, so the loaded frame
# may contain fewer rows / altered characters compared with the raw file.
data = pd.read_csv(
    data_path,
    encoding='Windows-1252',
    engine='python',
    on_bad_lines='skip',
    encoding_errors='replace',
)

# Every later cell reads the 'comments' column; fail early if it is absent.
assert 'comments' in data.columns, "expected a 'comments' column in the raw data"

print("\nData loaded successfully!")
print("Dataset Shape:", data.shape)
print("\nColumns:", data.columns.tolist())
print("\nMissing Values:\n", data.isnull().sum())

# Preview the data
print("\nFirst few rows of the data:")
print(data.head())


Loading data...
Attempting to load from: c:\Users\ELITEBOOK\OneDrive\Desktop\Projects\post-comments-sentiment-Analysis\data\raw\comments_1st.csv

Data loaded successfully!
Dataset Shape: (1032, 11)

Columns: ['Public Identifier', 'Profile Link', 'Full Name', 'Subtitle', 'Comment Url', 'comments', 'Like Count', 'Comment Count', 'Is Reply', 'Is Author', 'Comment Time']

Missing Values:
 Public Identifier    0
Profile Link         0
Full Name            0
Subtitle             1
Comment Url          0
comments             0
Like Count           0
Comment Count        0
Is Reply             0
Is Author            0
Comment Time         0
dtype: int64

First few rows of the data:
                         Public Identifier  \
0  ACoAAAArQoYBpAqYrKxJmm8d24JvmnPZJME8u8I   
1  ACoAAAATB9sBQ4Lr1QH_HHcaU7nsv0veqUjG0iI   
2  ACoAAAAsJKMBhXw2HY7b6BQcG5onjnxpSQusdaw   
3  ACoAAAHNFVQBIa-Ul4NAml-iAqsZTAuZvqcGINw   
4  ACoAAANmB6kBj8i-jq9oLr67NuxriLKmpuiH6CI   

                                        

In [3]:
# Comment Length Analysis
# Character count of every comment, stored as a new 'comment_length' column.
data['comment_length'] = data['comments'].str.len()

# Explicit figure/axes objects keep this plot independent of pyplot's
# "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data['comment_length'], bins=50, ax=ax)
ax.set_title('Distribution of Comment Lengths')
ax.set_xlabel('Comment Length')
ax.set_ylabel('Count')

# reports_dir is reused by later plotting cells.
# exist_ok=True creates the directory only when it is missing, without a
# separate (racy) existence check.
reports_dir = os.path.join(project_root, 'reports')
os.makedirs(reports_dir, exist_ok=True)
fig.savefig(os.path.join(reports_dir, 'comment_length_distribution.png'))
plt.close(fig)


In [4]:
# Sentiment Analysis with VADER
# A single shared analyzer instance, used by assign_sentiment_scores below.
sia = SentimentIntensityAnalyzer()

def assign_sentiment_scores(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``.

    Relies on the module-level ``sia`` analyzer.
    """
    return sia.polarity_scores(text)['compound']

def assign_scores(data):
    """Add a 'sentiment' column with the compound score of each comment.

    The scores are computed by applying ``assign_sentiment_scores`` to the
    'comments' column. The frame is modified in place and also returned.
    """
    compound_scores = data['comments'].apply(assign_sentiment_scores)
    data['sentiment'] = compound_scores
    return data

def assign_directions(data):
    """Label every row from its 'sentiment' score.

    Two columns are added in place and the same frame is returned:

    * 'mood'   -- 'negative' for scores below 0.0, 'neutral' for scores in
      [0.0, 0.4), and 'positive' for everything else.
    * 'target' -- integer code of the same bucket: negative=2, neutral=1,
      positive=0.
    """
    target_by_mood = {'negative': 2, 'neutral': 1, 'positive': 0}

    def mood_of(score):
        if score < 0.0:
            return 'negative'
        # The guard above already handled scores below zero, so only the
        # upper bound of the neutral band is left to check.
        if score < 0.4:
            return 'neutral'
        return 'positive'

    scores = data['sentiment']
    data['mood'] = scores.apply(mood_of)
    data['target'] = scores.apply(lambda score: target_by_mood[mood_of(score)])
    return data

# Assign sentiment scores and directions (the labels must exist before plotting)
data = assign_scores(data)
data = assign_directions(data)

# Count plot of the textual mood labels ('negative' / 'neutral' / 'positive')
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='mood', ax=ax)
ax.set_title('Distribution of Sentiment Categories')
ax.set_xlabel('Sentiment Category')
ax.set_ylabel('Count')
fig.savefig(os.path.join(reports_dir, 'sentiment_category_distribution.png'))
plt.close(fig)

# Bar chart of the numeric target codes.
# This figure gets its own file name and title: previously both figures were
# written to 'sentiment_category_distribution.png', so this one silently
# overwrote the count plot above.
sentiment_counts = data['target'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax)
ax.set_title('Distribution of Sentiment Targets')
ax.set_xlabel('Target (0 = positive, 1 = neutral, 2 = negative)')
ax.set_ylabel('Count')
fig.savefig(os.path.join(reports_dir, 'sentiment_target_distribution.png'))
plt.close(fig)

In [5]:
# First, let's check the data types and structure
print("Data info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()

print("\nSample of comments column:")
print(data['comments'].head(2))

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Public Identifier  1032 non-null   object 
 1   Profile Link       1032 non-null   object 
 2   Full Name          1032 non-null   object 
 3   Subtitle           1031 non-null   object 
 4   Comment Url        1032 non-null   object 
 5   comments           1032 non-null   object 
 6   Like Count         1032 non-null   int64  
 7   Comment Count      1032 non-null   int64  
 8   Is Reply           1032 non-null   bool   
 9   Is Author          1032 non-null   bool   
 10  Comment Time       1032 non-null   object 
 11  comment_length     1032 non-null   int64  
 12  sentiment          1032 non-null   float64
 13  mood               1032 non-null   object 
 14  target             1032 non-null   int64  
dtypes: bool(2), float64(1), int64(4), object(8)
memory usage: 107

In [6]:
# Run the project's preprocessing helper over the frame
print("Preprocessing comments...")
data = preprocess_data(data)

# Show raw and processed text side by side as a quick sanity check
preview_columns = ['comments', 'processed_comments']
print("\nFirst few processed comments:")
print(data[preview_columns].head())

Preprocessing comments...

First few processed comments:
                                            comments  \
0  Meeting Marsha Collier in person = priceless! ...   
1  I figured you'd be working anyway. Thanks so m...   
2  Do you have the ability to get a full length f...   
3  Bummed I am just seeing this! Hope you had a f...   
4  This is soooo awesome! Wish i could join you a...   

                                  processed_comments  
0  meeting marsha collier person priceless excite...  
1             figured working anyway thanks much mel  
2        ability get full length feature film funded  
3                   bummed seeing hope fabulous time  
4     soooo awesome wish could join awesome lady fun  


In [14]:
# Save Processed Data
print("\nSaving processed data...")
processed_data_path = os.path.join(project_root, 'data', 'processed')
# exist_ok=True creates the directory only when it is missing, without a
# separate (racy) existence check.
os.makedirs(processed_data_path, exist_ok=True)
# index=False keeps the frame's row index out of the CSV.
data.to_csv(os.path.join(processed_data_path, 'processed_data.csv'), index=False)



Saving processed data...


In [15]:
# Prepare data for training
print("\nSplitting data...")
X_train, X_test, y_train, y_test, vectorizer = prepare_dataset(data)

# Build the label frame once and reuse it for both printouts below.
train_targets = pd.DataFrame(y_train, columns=['target'])

# Print a few lines of the data
print("\nFirst few lines of training data:")
print(train_targets.head())

# Print class distribution
print("\nClass distribution:")
print(train_targets.target.value_counts())

# No classification report or confusion matrix is produced in this cell.
# The earlier version compared y_train with itself, which always yields a
# perfect score and a purely diagonal matrix and says nothing about a model
# (none has been trained yet). Real metrics require predictions, which only
# exist after the training cell has run.



Splitting data...


NameError: name 'prepare_dataset' is not defined

In [None]:
# Train Model
# Calls the project's train_model helper (imported from src.model) on the
# training split returned by prepare_dataset. The resulting `model` object is
# what the evaluation, plotting and saving cells below operate on.
print("\nTraining model...")
model = train_model(X_train, y_train)


In [None]:
# Evaluate Model
print("\nEvaluating model...")
evaluate_model(model, X_test, y_test)

# Generate Visualizations
print("\nGenerating visualizations...")
# Plot confusion matrix
# assign_directions encodes three classes (0 = positive, 1 = neutral,
# 2 = negative), so three display labels are passed, in class-code order.
# NOTE(review): assumes plot_confusion_matrix treats `labels` as display names
# in ascending class-code order -- confirm against src.visualization.
y_pred = model.predict(X_test)
plot_confusion_matrix(y_test, y_pred, labels=['Positive', 'Neutral', 'Negative'])

# Save next to the other figures under <project_root>/reports instead of a
# path relative to the current working directory, which differs depending on
# whether the notebook is started from the project root or from 'notebooks'.
reports_dir = os.path.join(project_root, 'reports')
os.makedirs(reports_dir, exist_ok=True)
plt.savefig(os.path.join(reports_dir, 'confusion_matrix.png'))
plt.close()

In [None]:
# Plot feature importance
# NOTE(review): these five names look stale. None of the visible cells creates
# 'compound', 'pos', 'neu' or 'neg' columns, and X_test comes from
# prepare_dataset, which also returns a `vectorizer`. If the model's features
# are the vectorizer's vocabulary, the names should presumably be taken from
# it instead -- confirm against src.preprocessing and src.visualization.
feature_names = ['comment_length', 'compound', 'pos', 'neu', 'neg']
plot_feature_importance(model, feature_names, X_test)

In [None]:
# Save Model
print("\nSaving model...")
# Anchor the output directory to project_root, like data/processed and
# reports, so the model lands in the same place whether the notebook is
# started from the project root or from the 'notebooks' sub-directory.
models_dir = os.path.join(project_root, 'models')
# exist_ok=True creates the directory only when it is missing.
os.makedirs(models_dir, exist_ok=True)
save_model(model, os.path.join(models_dir, 'comment_sentiments_model.pkl'))