### 1. Gathering Data

In [None]:
import pandas as pd
# Load the preprocessed dataset (path is relative to the working directory).
df=pd.read_csv("../../data/data_processed.csv")
# Touch the two columns used later; each access raises KeyError if the column
# is absent. In a notebook only the last expression (the "sentiment" column)
# is rendered as the cell output.
df["lemmatized"]
df["sentiment"]

### 2. Convert Labels to Numerical values

In [None]:
# Convert label to sentiment score
def label_to_score(label):
    """Translate a sentiment label into its numeric score.

    'positive' maps to 1, 'neutral' to 0.5 and 'negative' to 0. Any other
    value (unknown string, None, ...) falls back to the neutral score 0.5.
    """
    known_scores = (
        ('positive', 1),
        ('neutral', 0.5),
        ('negative', 0),
    )
    # Walk the table in order so the comparisons mirror an if/elif chain.
    for name, score in known_scores:
        if label == name:
            return score
    return 0.5

# Add a numeric 'sentiment_score' column by running every value of the
# 'sentiment' column through label_to_score.
df['sentiment_score'] = df['sentiment'].apply(label_to_score)
# Bare expression: in a notebook this renders the updated DataFrame.
df


### 3. Define Fusion Model

In [7]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

class LinearLogisticFusion:
    """Text classifier that blends a linear score with its logistic probability.

    Documents are vectorized with TF-IDF. ``fit`` learns a bias and one weight
    per feature by gradient steps on the logistic log-loss. ``predict``
    combines the raw linear score ``z`` and ``sigmoid(z)`` as
    ``alpha * z + (1 - alpha) * sigmoid(z)`` and thresholds that value at 0.5.

    Args:
        alpha: Mixing weight given to the raw linear score ``z``; the
            logistic probability receives ``1 - alpha``.
    """

    def __init__(self, alpha=0.5):
        self.alpha = alpha
        # Parameter vector laid out as [bias, w_1, ..., w_n]; None until fit().
        self.beta = None
        self.vectorizer = TfidfVectorizer()

    def fit(self, X_text, y, lr=0.01, n_iter=500):
        """Fit the TF-IDF vocabulary and train the weights.

        Args:
            X_text: Iterable of raw text documents.
            y: Target value per document (array-like of numbers).
            lr: Gradient step size.
            n_iter: Number of full-batch gradient steps.

        Raises:
            ValueError: If ``y`` does not hold one target per document.
        """
        # Convert text to vectors (TfidfVectorizer yields a SciPy sparse matrix).
        X = self.vectorizer.fit_transform(X_text)
        n_samples, n_features = X.shape
        print("numbers of features", n_features)

        # Work on a plain float array so pandas index alignment can never
        # interfere with the element-wise arithmetic below.
        targets = np.asarray(y, dtype=float).ravel()
        if targets.shape[0] != n_samples:
            raise ValueError(
                f"y has {targets.shape[0]} entries but X_text has "
                f"{n_samples} documents"
            )

        # Initialize weights (index 0 is the bias).
        self.beta = np.zeros(n_features + 1)

        # The transpose is loop-invariant; take it once.
        X_t = X.T
        for _ in range(n_iter):
            # 1. Linear part: z = Xw + b
            z = self.linear_part(X)
            # 2. Sigmoid maps the linear output to a probability.
            p = self.sigmoid(z)
            # 3. error = y - p (negative gradient of the log-loss w.r.t. z).
            error = targets - p
            # 4. Weight gradient: (X^T error) / N. `@` keeps X sparse.
            grad = (X_t @ error) / n_samples
            # 5. w = w + lr * grad
            self.beta[1:] += lr * np.ravel(grad)
            # 6. b = b + lr * mean(error)
            self.beta[0] += lr * np.mean(error)

    def linear_part(self, X):
        """Return z = Xw + b for every row of ``X``.

        ``X`` may be a dense array or a SciPy sparse matrix. The ``@``
        operator is used because ``np.dot`` does not understand sparse
        matrices and would not compute a matrix-vector product for them.
        """
        return X @ self.beta[1:] + self.beta[0]

    def sigmoid(self, z):
        """Logistic function 1 / (1 + e^(-z)); maps z into (0, 1)."""
        return 1 / (1 + np.exp(-z))

    def predict(self, X_text):
        """Classify documents.

        Args:
            X_text: Iterable of raw text documents.

        Returns:
            Tuple ``(labels, fusion)``: ``labels`` holds the class codes
            described below and ``fusion`` the blended score per document.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.beta is None:
            raise RuntimeError("fit() must be called before predict()")

        # Keep the TF-IDF matrix sparse; linear_part handles it directly, so
        # there is no need to materialize a dense documents x vocabulary array.
        X = self.vectorizer.transform(X_text)
        z = self.linear_part(X)
        p = self.sigmoid(z)
        # fusion = alpha * (linear output) + (1 - alpha) * (logistic probability)
        fusion = self.alpha * z + (1 - self.alpha) * p
        # fusion > 0.5  -> class 2
        # fusion < 0.5  -> class 0
        # fusion == 0.5 -> class 1
        # NOTE(review): exact float equality with 0.5 is rare, so class 1 is
        # almost never produced; confirm whether a tolerance band is intended.
        labels = np.where(fusion > 0.5, 2,
                          np.where(fusion < 0.5, 0, 1))

        return labels, fusion


### 4. Test

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['lemmatized'].astype(str),
    df['sentiment_score'],
    test_size=0.2,
    random_state=42
)
model = LinearLogisticFusion()
model.fit(X_train, y_train)
y_pred, fusion_scores = model.predict(X_test)

# predict() returns class codes 0/1/2, while y_test holds the scores
# 0/0.5/1 produced by label_to_score. Passing both as-is makes
# classification_report reject the mix of continuous and multiclass targets,
# so map the scores onto the same codes first (score * 2 -> 0, 1, 2).
y_true = (y_test * 2).astype(int)

print("=== Classification Report ===")
print(classification_report(
    y_true,
    y_pred,
    labels=[0, 1, 2],
    target_names=["negative", "neutral", "positive"],
    zero_division=0,  # a class that is never predicted reports 0, no warning
))

numbers of features 148708
