### 1. Gathering Data

In [3]:
import pandas as pd
# Load the processed review dataset from a relative path (resolved against the
# kernel's current working directory).
df=pd.read_csv("../../data/data_processed.csv")
# Bare column access: nothing is displayed for it (only a cell's last expression
# is rendered), but it raises KeyError early if the column is missing.
df["lemmatized"]
# Last expression of the cell, so this Series is what the notebook displays.
df["sentiment"]

0        negative
1        negative
2        positive
3        positive
4        positive
           ...   
59995     neutral
59996    negative
59997    positive
59998    positive
59999     neutral
Name: sentiment, Length: 60000, dtype: object

### 2. Convert Labels to Numerical Values

In [4]:
# Convert label to sentiment score
def label_to_score(label):
    """Map a sentiment label to its numeric score.

    'positive' -> 1, 'neutral' -> 0.5, 'negative' -> 0.
    Any other value (unknown label, None, NaN, ...) falls back to the
    neutral score 0.5.
    """
    known_scores = (
        ('positive', 1),
        ('neutral', 0.5),
        ('negative', 0),
    )
    for name, score in known_scores:
        if label == name:
            return score
    # Unrecognised labels are treated as neutral.
    return 0.5

# Add a numeric 'sentiment_score' column by passing every label through
# label_to_score. NOTE: this adds the column to the shared `df` in place.
df['sentiment_score'] = df['sentiment'].apply(label_to_score)
# Last expression of the cell: display the resulting frame.
df


Unnamed: 0.1,Unnamed: 0,review,sentiment,clean_text,no_stopwords,tokenized,lemmatized,sentiment_score
0,0,"""Paula, I may be a bitch, but I'll never be a ...",negative,paula i may be a bitch but ill never be a butc...,paula may bitch ill never butch br br hilariou...,"['paula', 'may', 'bitch', 'ill', 'never', 'but...",paula may bitch ill never butch br br hilariou...,0.0
1,1,Many people here say that this show is for kid...,negative,many people here say that this show is for kid...,many people say show kids hm kid approximately...,"['many', 'people', 'say', 'show', 'kids', 'hm'...",many people say show kid hm kid approximately ...,0.0
2,2,This was a well written tale of the Making of ...,positive,this was a well written tale of the making of ...,well written tale making batman sitcom actuall...,"['well', 'written', 'tale', 'making', 'batman'...",well write tale make batman sitcom actually re...,1.0
3,3,I think this movie is absolutely beautiful. An...,positive,i think this movie is absolutely beautiful and...,think movie absolutely beautiful im not referr...,"['think', 'movie', 'absolutely', 'beautiful', ...",think movie absolutely beautiful im not refer ...,1.0
4,4,The film was very outstanding despite the NC-1...,positive,the film was very outstanding despite the nc17...,film outstanding despite nc17 rating disturbin...,"['film', 'outstanding', 'despite', 'nc17', 'ra...",film outstanding despite nc17 rating disturb s...,1.0
...,...,...,...,...,...,...,...,...
59995,59995,Nothing special.,neutral,nothing special,nothing special,"['nothing', 'special']",nothing special,0.5
59996,59996,Avoid this one! It is a terrible movie. So wha...,negative,avoid this one it is a terrible movie so what ...,avoid one terrible movie exciting pointless mu...,"['avoid', 'one', 'terrible', 'movie', 'excitin...",avoid one terrible movie excite pointless murd...,0.0
59997,59997,This production was quite a surprise for me. I...,positive,this production was quite a surprise for me i ...,production quite surprise absolutely love obsc...,"['production', 'quite', 'surprise', 'absolutel...",production quite surprise absolutely love obsc...,1.0
59998,59998,This is a decent movie. Although little bit sh...,positive,this is a decent movie although little bit sho...,decent movie although little bit short time pa...,"['decent', 'movie', 'although', 'little', 'bit...",decent movie although little bit short time pa...,1.0


### 3. Define Fusion Model

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm  # progress bar

class LinearLogisticFusion:
    """Logistic model on TF-IDF (1,2)-gram features with a "fusion" score.

    Training is plain batch gradient ascent on the logistic log-likelihood.
    At prediction time the raw linear output ``z`` and its sigmoid ``p`` are
    blended as ``fusion = alpha * z + (1 - alpha) * p`` and thresholded
    around 0.5 to produce labels in ``{0.0, 0.5, 1.0}``.

    Note that ``z`` is an unbounded logit, so for ``alpha > 0`` the fusion
    score is not a probability.

    Parameters
    ----------
    alpha : float, default 0.5
        Weight of the linear part ``z`` in the fusion score; ``1 - alpha`` is
        the weight of the sigmoid output ``p``.
    epochs : int, default 100
        Number of full-batch gradient steps performed by ``fit``.
    lr : float, default 0.01
        Learning rate of each gradient step.
    neutral_margin : float, default 0.0
        Half-width of the band around 0.5 that is labelled neutral (0.5).
        With the default of 0.0 only a fusion score of exactly 0.5 (or NaN)
        is labelled neutral, which is the original behaviour.

    Attributes
    ----------
    beta : numpy.ndarray or None
        ``beta[0]`` is the bias, ``beta[1:]`` the feature weights. ``None``
        until ``fit`` has been called.
    vectorizer : TfidfVectorizer
        Vectorizer fitted in ``fit`` and reused in ``predict``.
    """

    def __init__(self, alpha=0.5, epochs=100, lr=0.01, neutral_margin=0.0):
        self.alpha = alpha
        self.beta = None
        self.vectorizer = TfidfVectorizer(ngram_range=(1,2))
        self.epochs = epochs
        self.lr = lr
        self.neutral_margin = neutral_margin

    def fit(self, X_text, y):
        """Fit the vectorizer and learn bias and weights by gradient ascent.

        Parameters
        ----------
        X_text : iterable of str
            Training documents.
        y : array-like of numbers
            One target per document. Any array-like (list, Series, column
            vector) is accepted and flattened to a 1-D float array.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of targets differs from the number of documents.
        """
        # Convert text to vectors
        X = self.vectorizer.fit_transform(X_text)
        n_features = X.shape[0 + 1]
        print("Number of features:", n_features)

        # Work on a flat float array so `y - p` is always an elementwise 1-D
        # difference (a nested list or (n, 1) column would otherwise broadcast
        # to a 2-D array).
        y = np.asarray(y, dtype=float).ravel()
        n_samples = y.shape[0]
        if n_samples != X.shape[0]:
            raise ValueError(
                f"X_text has {X.shape[0]} samples but y has {n_samples}"
            )

        # Initialize weights (index 0 is the bias)
        self.beta = np.zeros(n_features + 1)

        # Gradient ascent with progress bar
        for _ in tqdm(range(self.epochs), desc="Training"):
            # Linear part z = Xw + b, squashed through the sigmoid
            p = self.sigmoid(self.linear_part(X))
            # Residual between targets and predicted probabilities
            error = y - p
            # Mean log-likelihood gradient w.r.t. the weights
            grad = X.T.dot(error) / n_samples
            # Update weights, then bias
            self.beta[1:] += self.lr * np.ravel(grad)
            self.beta[0] += self.lr * np.mean(error)

        return self

    def linear_part(self, X):
        """Return the linear score ``X @ weights + bias`` for each row of X."""
        return X @ self.beta[1:] + self.beta[0]

    def sigmoid(self, z):
        """Numerically stable logistic function ``1 / (1 + exp(-z))``.

        ``exp`` is only ever evaluated on non-positive values, so large
        negative logits no longer overflow.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always in (0, 1]
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def predict(self, X_text):
        """Predict labels and fusion scores for the given documents.

        Returns
        -------
        labels : numpy.ndarray
            1.0 where the fusion score is above ``0.5 + neutral_margin``,
            0.0 where it is below ``0.5 - neutral_margin``, 0.5 otherwise.
        fusion : numpy.ndarray
            ``alpha * z + (1 - alpha) * sigmoid(z)`` for each document.
        """
        X = self.vectorizer.transform(X_text)
        z = self.linear_part(X)
        p = self.sigmoid(z)

        # Fusion step
        fusion = self.alpha * z + (1 - self.alpha) * p

        # Instances created before `neutral_margin` existed (e.g. unpickled)
        # fall back to the original exact-equality behaviour.
        margin = getattr(self, "neutral_margin", 0.0)

        # Map fusion -> labels
        labels = np.where(fusion > 0.5 + margin, 1.0,
                 np.where(fusion < 0.5 - margin, 0.0, 0.5))

        return labels, fusion


Number of features: 29


Training: 100%|██████████| 200/200 [00:00<00:00, 6948.87it/s]


Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       2.0
         1.0       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0






### 4. Function for Converting Scores

In [10]:
def score_to_class(score):
    """Translate a sentiment score into an integer class index.

    0 -> 0 (negative), 0.5 -> 1 (neutral), 1 -> 2 (positive).
    Any other score yields None.
    """
    score_classes = (
        (0, 0),    # negative
        (0.5, 1),  # neutral
        (1, 2),    # positive
    )
    return next(
        (class_index for value, class_index in score_classes if score == value),
        None,
    )


### 5. Test

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split the data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df['lemmatized'].astype(str),  # cast so every entry is a string for the vectorizer
    df['sentiment'],   # labels (negative/neutral/positive)
    test_size=0.2,
    random_state=42
)

# Convert the texts into TF-IDF features (unigrams + bigrams, vocabulary capped at 50,000).
# The vectorizer is fitted on the training split only and then applied to the test split.
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Train Logistic Regression (class_weight="balanced" reweights classes by inverse frequency)
clf = LogisticRegression(max_iter=200, class_weight="balanced")
clf.fit(X_train_features, y_train)

# Predict on the held-out test split
y_pred = clf.predict(X_test_features)

# Report
# NOTE(review): target_names are applied positionally to the sorted label set,
# so this list must stay in alphabetical order — confirm against the sklearn docs.
print(classification_report(y_test, y_pred, target_names=["negative","neutral","positive"]))


              precision    recall  f1-score   support

    negative       0.91      0.90      0.90      4933
     neutral       1.00      1.00      1.00      2077
    positive       0.90      0.91      0.91      4990

    accuracy                           0.92     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.92      0.92      0.92     12000

