# Sentiment Analysis

In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
dataset = pd.read_csv(r"Test.csv")

In [3]:

dataset.head(3)

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0


In [4]:
dataset.shape

(5000, 2)

In [5]:
dataset.dropna(inplace=True)

In [6]:
dataset.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
from nltk.tokenize.treebank import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

In [8]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [9]:
from nltk.corpus import stopwords

stop_word_english = stopwords.words('english')

from string import punctuation

extra_word = ["``", "''" ]

stop_word = list(punctuation) + stop_word_english + extra_word



In [10]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# and stemmer.stem(word)

In [11]:
import nltk
# nltk.download('wordnet')


In [12]:
def preprocess_text(text):
         
    tokens = tokenizer.tokenize(text.lower())
    
    # remove stopword and apply lemmatize for root word
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_word and word.isalpha()]
    
    return ' '.join(cleaned_tokens)


dataset['cleaned_text'] = dataset['text'].apply(preprocess_text)

print(dataset[['text', 'cleaned_text', 'label']].head())


                                                text  \
0  I always wrote this series off as being a comp...   
1  1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...   
2  This movie was so poorly written and directed ...   
3  The most interesting thing about Miryang (Secr...   
4  when i first read about "berlin am meer" i did...   

                                        cleaned_text  label  
0  always wrote series complete jim belushi invol...      0  
1  watched purcell typical mary kate ashley fare ...      0  
2  movie poorly written directed fell asleep minu...      0  
3  interesting thing miryang secret sunshine jeon...      1  
4  first read berlin meer expect thought right pe...      0  


In [13]:
dataset.head(3)

Unnamed: 0,text,label,cleaned_text
0,I always wrote this series off as being a comp...,0,always wrote series complete jim belushi invol...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0,watched purcell typical mary kate ashley fare ...
2,This movie was so poorly written and directed ...,0,movie poorly written directed fell asleep minu...


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data
x = vectorizer.fit_transform(dataset['cleaned_text'])

In [15]:
dense_matrix = x.toarray()  
print(dense_matrix[:5, :5])


[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [17]:
x.shape

(5000, 32513)

In [16]:
"""Get non-zero elements and their indices"""
# non_zero_elements = x.nonzero()

# # View the non-zero elements and their indices
# for row, col in zip(non_zero_elements[0], non_zero_elements[1]):
#     print(f"Document {row}, Term {col} -> TF-IDF: {x[row, col]}")


'Get non-zero elements and their indices'

In [19]:
y = dataset["label"]

(5000,)

In [20]:
from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=6)

# 5 for svc
# 6 for logisticRegression
# 8 for decisiontreeClassifier
# 17 for KNeighborsClassifier
# 3 for RandomForestClassifier


In [21]:
from itertools import combinations

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [22]:
models = {
    'Linear': LogisticRegression() ,
    'MLP': MLPClassifier(max_iter=500), 
    'KNeighbors': KNeighborsClassifier(n_neighbors= 41),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(n_estimators=10),
    'svc': SVC(kernel="sigmoid")
}


lr = LogisticRegression() 
kn = KNeighborsClassifier(n_neighbors= 41)
sv = SVC(kernel="sigmoid")
rfc = RandomForestClassifier(n_estimators=10)
clf = DecisionTreeClassifier()
mlp = MLPClassifier()

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

def classification_error(y_true, y_pred, y_prob=None, average='binary'):
    """
    Compute common classification error metrics.

    Parameters:
    - y_true: Ground truth target values
    - y_pred: Predicted target values
    - y_prob: Predicted probabilities (used for AUC)
    - average: Defines the type of averaging to be performed on multi-class data.
      'binary' for binary classification, 'micro', 'macro', 'weighted', etc., for multi-class.

    Returns:
    - Dictionary containing various classification metrics
    """
    accuracy = accuracy_score(y_true, y_pred) * 100
    precision = precision_score(y_true, y_pred, average=average) * 100
    recall = recall_score(y_true, y_pred, average=average) * 100
    f1 = f1_score(y_true, y_pred, average=average) * 100
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    # ROC AUC (Only for binary classification or probability scores)
    if y_prob is not None and average == 'binary':
        roc_auc = roc_auc_score(y_true, y_prob) * 100
    else:
        roc_auc = None
    
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix,
        'ROC AUC': roc_auc
    }


In [24]:
# Loop through each model
for name, model in models.items():
    
    # Fit the model to the training data
    model.fit(x_train, y_train)
      
    # Predict on training and test data
    train_predictions = model.predict(x_train)
    test_predictions = model.predict(x_test)

    # Get the classification error metrics for both training and test sets
    train_metrics = classification_error(y_train, train_predictions)
    test_metrics = classification_error(y_test, test_predictions)  

    # Print the metrics
    print(f"{name} - Train Metrics: {train_metrics}")
    print(f"{name} - Test Metrics: {test_metrics}\n")


Linear - Train Metrics: {'Accuracy': 94.77142857142857, 'Precision': 94.43820224719101, 'Recall': 95.24079320113314, 'F1 Score': 94.83779971791255, 'Confusion Matrix': array([[1636,   99],
       [  84, 1681]], dtype=int64), 'ROC AUC': None}
Linear - Test Metrics: {'Accuracy': 85.0, 'Precision': 81.75092478421702, 'Recall': 89.5945945945946, 'F1 Score': 85.49323017408123, 'Confusion Matrix': array([[612, 148],
       [ 77, 663]], dtype=int64), 'ROC AUC': None}

MLP - Train Metrics: {'Accuracy': 100.0, 'Precision': 100.0, 'Recall': 100.0, 'F1 Score': 100.0, 'Confusion Matrix': array([[1735,    0],
       [   0, 1765]], dtype=int64), 'ROC AUC': None}
MLP - Test Metrics: {'Accuracy': 83.86666666666667, 'Precision': 83.0238726790451, 'Recall': 84.5945945945946, 'F1 Score': 83.80187416331995, 'Confusion Matrix': array([[632, 128],
       [114, 626]], dtype=int64), 'ROC AUC': None}

KNeighbors - Train Metrics: {'Accuracy': 75.68571428571428, 'Precision': 85.20801232665639, 'Recall': 62.66288

# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression() 

cv_scores = cross_val_score(lr, x_train, y_train, cv=7)

lr.fit(x_train, y_train)

lr.score(x_test, y_test)*100  , lr.score(x_train, y_train)*100

# overfitting = train > test


(85.0, 94.77142857142857)

In [20]:
cv_scores*100, np.mean(cv_scores) * 100

(array([84.8, 84.8, 85. , 85.6, 78.8, 84. , 85.8]), 84.1142857142857)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
# from sklearn.pipeline import make_pipeline

lr = LogisticRegression() 

# cv_scores = cross_val_score(lr, x_train, y_train, cv=7)

kf = KFold(n_splits=5, shuffle=True, random_state=42)


lr.fit(x_train, y_train)

cv_scores = cross_val_score(lr, x_train, y_train, cv=kf, scoring='accuracy')

cv_scores*100, cv_scores.mean()*100,  cv_scores.std()*100

# overfitting = train > test


(array([85.42857143, 86.        , 81.71428571, 82.85714286, 81.85714286]),
 83.57142857142857,
 1.8024926052099988)

In [22]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm2 = confusion_matrix(y_test, lr.predict(x_test))
cm2

array([[612, 148],
       [ 77, 663]], dtype=int64)

In [23]:
from sklearn.metrics import classification_report

y_pred = lr.predict(x_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.81      0.84       760
           1       0.82      0.90      0.85       740

    accuracy                           0.85      1500
   macro avg       0.85      0.85      0.85      1500
weighted avg       0.85      0.85      0.85      1500



# assemble

In [24]:
from sklearn.ensemble import BaggingClassifier

base_model = LogisticRegression()

bag_model = BaggingClassifier(estimator= base_model, n_estimators= 50, random_state= 42, )

cv_scores = cross_val_score(bag_model, x_train, y_train, cv=7)

bag_model.fit(x_train, y_train)

bag_model.score(x_test, y_test)*100  , bag_model.score(x_train, y_train)*100



(84.46666666666667, 93.82857142857142)

In [25]:
from sklearn.ensemble import AdaBoostClassifier

base_model = lr = LogisticRegression()

bag_model = AdaBoostClassifier(estimator= base_model, n_estimators= 50, random_state= 42, )

cv_scores = cross_val_score(bag_model, x_train, y_train, cv=7)

bag_model.fit(x_train, y_train)

bag_model.score(x_test, y_test)*100  , bag_model.score(x_train, y_train)*100





(73.86666666666667, 78.48571428571428)

In [26]:
from sklearn.ensemble import StackingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 

base_modal = [
    ('decision_tree', DecisionTreeClassifier()),
    ('svc', SVC(probability= True))
]

meta_model = LogisticRegression()

stack_model = StackingClassifier(estimators= base_modal, final_estimator= meta_model)

stack_model.fit(x_train, y_train)

stack_model.score(x_test, y_test)*100  , stack_model.score(x_train, y_train)*100

# 85.0666,  99.771428



(85.0, 99.77142857142857)

In [27]:
# from sklearn.model_selection import GridSearchCV

# param_lr = {
#     'max_iter': [100, 200, 500],  # Number of features to select
#     'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
#     'penalty': ['l1', 'l2', 'elasticnet'],  # Regularization types
#     'solver': ['saga', 'liblinear', 'sag', 'lbfgs'],  # Solvers
#     'multi_class': ['auto', 'ovr', 'multinomial']

# }

# grid_search = GridSearchCV(estimator= lr, param_grid= param_lr, cv=2, verbose=1, n_jobs=-1)

# grid_search.fit(x_train, y_train)


# best_params = grid_search.best_params_
# best_params

In [28]:
# for i in np.arange(0.01, 1.0, 0.01):
    
#     lr = LogisticRegression(C=i)
#     lr.fit(x_train, y_train)
#     knp = lr.score(x_test, y_test)*100
#     knp1 = lr.score(x_train, y_train)*100
    
#     print(knp, knp1, {knp1 - knp}, i)

In [29]:
# for i in range(1, 70):
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=i)
#     lr = LogisticRegression()
#     lr.fit(x_train, y_train)
#     knp = lr.score(x_test, y_test)*100
#     knp1 = lr.score(x_train, y_train)*100
    
#     print(knp, knp1, {knp1 - knp}, i)

# check the model output

vr = VotingClassifier

lr = logisticRegression
rfc = RandomForestClassifier
sv = SVC

In [30]:
# Test on new examples
test_texts = [
    "I enjoy during sunny days",
    "I hate coding errors",
    "This man is bad",
    "your rice is nice",
    "anuj is good boy",
]

test_texts_transformed = vectorizer.transform(test_texts)

# Make predictions
predictions = lr.predict(test_texts_transformed)

# Display the predictions
for text, prediction in zip(test_texts, predictions):
    print(f"Text: '{text}' - Prediction: {'Positive' if prediction == 1 else 'Negative'}")

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.