<a href="https://colab.research.google.com/github/abraham35/NLP-Project---Sentiment-Analysis/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

In [2]:
download('punkt')
download('stopwords')
download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import files
uploaded = files.upload()

Saving data.csv to data.csv


In [5]:
# Load your dataset (replace 'your_dataset.csv' with the path to your data file)
df = pd.read_csv('data.csv')

In [6]:
# 1. Data Preprocessing

# Checking for null values and handling them
df['Sentence'] = df['Sentence'].fillna('')  # Fill null values in Sentence column with empty strings
df['Sentiment'] = df['Sentiment'].fillna('neutral')  # Fill null values in Sentiment column with 'neutral'


In [7]:
# Ensuring the Sentiment column has no other than expected values
df['Sentiment'] = df['Sentiment'].apply(lambda x: x.lower() if x in ['positive', 'negative', 'neutral'] else 'neutral')


In [13]:
# 2. Text Preprocessing Functions

# Function to clean text: remove special characters, digits, etc.
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Removing non-alphabetical characters
    text = text.lower()  # Convert text to lowercase
    return text

    # Tokenization, removing stopwords, stemming, and lemmatization
def preprocess_text(text):
    # Clean the text
    text = clean_text(text)

    # Tokenization
    words = word_tokenize(text)

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming (optional, can be replaced with Lemmatization)
    ps = PorterStemmer()
    words = [ps.stem(word) for word in words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [16]:
# Apply the preprocessing function to the text column
df['Processed_Sentence'] = df['Sentence'].apply(preprocess_text)

In [15]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [17]:
# 3. Feature Extraction: TF-IDF Vectorization
# Converting the text data into numerical vectors using TF-IDF (Term Frequency-Inverse Document Frequency)
tfidf = TfidfVectorizer(max_features=5000)  # You can increase the number of features as needed
X = tfidf.fit_transform(df['Processed_Sentence']).toarray()

In [18]:
# 4. Encoding Labels
# Converting Sentiment labels into numeric values
sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
y = df['Sentiment'].map(sentiment_mapping).values

In [19]:
# 5. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# 6. Model Training using Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [21]:
# 7. Predictions and Evaluation
y_pred = model.predict(X_test)

In [22]:
# Evaluate the model using accuracy and classification report
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.6715141146278871
Classification Report:
               precision    recall  f1-score   support

          -1       0.24      0.15      0.19       175
           0       0.69      0.81      0.75       622
           1       0.78      0.68      0.72       372

    accuracy                           0.67      1169
   macro avg       0.57      0.55      0.55      1169
weighted avg       0.65      0.67      0.66      1169



In [24]:
class_distribution = df['Sentiment'].value_counts()

In [25]:
class_distribution

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
neutral,3130
positive,1852
negative,860


In [None]:
# Handling Class Imbalance:

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [27]:

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Random Forest with class_weight='balanced' to address class imbalance
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

In [29]:
# Predicting on the test set
y_pred = model.predict(X_test)

In [30]:
# Evaluation
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.6723695466210436
Classification Report:
               precision    recall  f1-score   support

          -1       0.29      0.21      0.24       175
           0       0.69      0.81      0.74       622
           1       0.80      0.66      0.72       372

    accuracy                           0.67      1169
   macro avg       0.59      0.56      0.57      1169
weighted avg       0.66      0.67      0.66      1169



In [32]:
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [38]:
# Split data into features and labels
X = df['Sentence']
y = df['Sentiment']

In [39]:
# Convert Sentiments to numerical values
sentiment_map = {'negative': -1, 'neutral': 0, 'positive': 1}
y = y.map(sentiment_map)

In [40]:
# Vectorize the text using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_vec = vectorizer.fit_transform(X)


In [41]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)


In [42]:
# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [43]:
# Calculate the scale_pos_weight value based on class distribution
neg_count = np.sum(y_train == -1)
pos_count = np.sum(y_train == 1)
scale_pos_weight = neg_count / pos_count  # ratio of negative to positive samples

In [44]:
# Define XGBoost model with class weight scaling (scale_pos_weight)
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,  # Three classes: -1 (negative), 0 (neutral), 1 (positive)
    eval_metric='mlogloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight  # Set scale_pos_weight to address class imbalance
)

In [46]:
# Modify the sentiment map to map -1 (negative) to 0, 0 (neutral) to 1, and 1 (positive) to 2
sentiment_map = {-1: 0, 0: 1, 1: 2}
y = y.map(sentiment_map)  # Apply the mapping to the labels

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Calculate the scale_pos_weight value based on class distribution
neg_count = np.sum(y_train == 0)  # Count the number of negative samples (mapped to 0)
pos_count = np.sum(y_train == 2)  # Count the number of positive samples (mapped to 2)
scale_pos_weight = neg_count / pos_count  # Ratio of negative to positive samples

# Define XGBoost model with class weight scaling (scale_pos_weight)
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,  # Three classes: 0 (negative), 1 (neutral), 2 (positive)
    eval_metric='mlogloss',
    random_state=42,
    scale_pos_weight=scale_pos_weight  # Set scale_pos_weight to address class imbalance
)

In [47]:
# Train the XGBoost model
model.fit(X_train_res, y_train_res)

Parameters: { "scale_pos_weight" } are not used.



In [48]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [49]:
# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.6911890504704876
Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.31      0.34       175
           1       0.71      0.83      0.77       622
           2       0.79      0.63      0.70       372

    accuracy                           0.69      1169
   macro avg       0.63      0.59      0.60      1169
weighted avg       0.69      0.69      0.68      1169



In [50]:
# Hyperparameter tuning (optional)
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'scale_pos_weight': [1, scale_pos_weight]  # Use the calculated scale_pos_weight
}