In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd


# Loading the dataset

In [3]:
df = pd.read_csv("/content/drive/MyDrive/dataset/Uber_Reviews.csv")

  df = pd.read_csv("/content/drive/MyDrive/dataset/Uber_Reviews.csv")


In [4]:
df.head(5)

Unnamed: 0,source,review_id,user_name,review_title,review_description,rating,thumbs_up,review_date,developer_response,developer_response_date,appVersion,laguage_code,country_code
0,Google Play,18d6584c-d0e9-4833-a744-f607058aee97,Milky Way,,"Suddenly, the driver can't have my location an...",1,0.0,2023-08-10 17:48:51,,,,en,in
1,Google Play,50a08f18-cece-4ddf-b617-028844c8aa28,Bradlee Severa,,Very cordial.. And helped with a quick turnaro...,5,0.0,2023-08-10 17:38:35,,,4.485.10000,en,in
2,Google Play,b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7,Amit Aggarwal,,Very good experience,5,0.0,2023-08-10 17:38:17,,,4.486.10002,en,in
3,Google Play,502702a9-25ed-4373-a96c-7fa1f06caacd,Bryant Inman,,All I use,5,0.0,2023-08-10 17:37:45,,,4.467.10008,en,in
4,Google Play,f47a3fb6-23db-49bd-9e63-f33c8d724d07,Addie Whittaker,,I have enjoyed traveling by Uber my drivers ha...,5,0.0,2023-08-10 17:36:56,,,4.486.10002,en,in


# Preprocessing

In [5]:
df.shape

(1069616, 13)

In [6]:
df.columns

Index(['source', 'review_id', 'user_name', 'review_title',
       'review_description', 'rating', 'thumbs_up', 'review_date',
       'developer_response', 'developer_response_date', 'appVersion',
       'laguage_code', 'country_code'],
      dtype='object')

In [7]:
df.dtypes

Unnamed: 0,0
source,object
review_id,object
user_name,object
review_title,object
review_description,object
rating,int64
thumbs_up,float64
review_date,object
developer_response,object
developer_response_date,object


# Checking for null values

In [8]:
df.isnull().sum()

Unnamed: 0,0
source,0
review_id,0
user_name,1
review_title,1067436
review_description,169
rating,0
thumbs_up,2180
review_date,0
developer_response,871352
developer_response_date,872338


In [9]:
# Droping columns with large amounts of null values
columns_to_drop = ['review_title', 'developer_response', 'developer_response_date', 'appVersion']
df_cleaned = df.drop(columns=columns_to_drop)

# Checking the new shape of the dataset
print(df_cleaned.shape)


(1069616, 9)


In [10]:
df_cleaned.isnull().sum()

Unnamed: 0,0
source,0
review_id,0
user_name,1
review_description,169
rating,0
thumbs_up,2180
review_date,0
laguage_code,0
country_code,0


In [11]:
# Filling missing values in 'review_description' column with 'No description provided'
df_cleaned['review_description'].fillna('No description provided', inplace=True)

# Filling missing values in 'thumbs_up' column with 0
df_cleaned['thumbs_up'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['review_description'].fillna('No description provided', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['thumbs_up'].fillna(0, inplace=True)


In [12]:
df_cleaned.head()

Unnamed: 0,source,review_id,user_name,review_description,rating,thumbs_up,review_date,laguage_code,country_code
0,Google Play,18d6584c-d0e9-4833-a744-f607058aee97,Milky Way,"Suddenly, the driver can't have my location an...",1,0.0,2023-08-10 17:48:51,en,in
1,Google Play,50a08f18-cece-4ddf-b617-028844c8aa28,Bradlee Severa,Very cordial.. And helped with a quick turnaro...,5,0.0,2023-08-10 17:38:35,en,in
2,Google Play,b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7,Amit Aggarwal,Very good experience,5,0.0,2023-08-10 17:38:17,en,in
3,Google Play,502702a9-25ed-4373-a96c-7fa1f06caacd,Bryant Inman,All I use,5,0.0,2023-08-10 17:37:45,en,in
4,Google Play,f47a3fb6-23db-49bd-9e63-f33c8d724d07,Addie Whittaker,I have enjoyed traveling by Uber my drivers ha...,5,0.0,2023-08-10 17:36:56,en,in


# Separting rows with emojis

In [13]:
import pandas as pd
import re

# Defining a regular expression pattern to match emojis
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)

# Creating a new dataframe with only the rows that contain emojis
df_with_emojis = df_cleaned[df_cleaned['review_description'].apply(lambda x: bool(emoji_pattern.search(str(x))))]

# Displaying the first few rows of the dataframe with emojis
df_with_emojis.head()


Unnamed: 0,source,review_id,user_name,review_description,rating,thumbs_up,review_date,laguage_code,country_code
14,Google Play,472b106a-4b10-4577-b4dc-44197abc5b98,Lashay Russell,Awesome 💯,5,0.0,2023-08-10 17:17:33,en,in
25,Google Play,655939ab-5a34-4788-a3f3-9e30ed8807aa,R&R Kona,Love it! 😎🤙🏽,5,0.0,2023-08-10 16:56:33,en,in
47,Google Play,c536d8c3-2b05-4177-a5ca-b7f0b4cc89a5,GOURAB PAL,This app is very bad my Ran away with money fr...,1,0.0,2023-08-10 16:03:43,en,in
61,Google Play,05b75417-bf92-459c-97dd-a0b2c9ffa4a1,Reena Chauhan,👍,5,0.0,2023-08-10 15:41:41,en,in
63,Google Play,20704767-086e-48ff-ab1b-500f3bb070e2,Trishant Singh,👌,5,0.0,2023-08-10 15:39:33,en,in


In [14]:
pip install emoji




In [15]:
import re
import emoji

# Function to separate text and emojis
def extract_text_and_emojis(review):
    # Regex pattern to match emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)

    # Extracting emojis from the review
    emoji_list = ''.join(emoji_pattern.findall(review))

    # Extracting text by removing emojis
    text_only = emoji_pattern.sub('', review)

    return text_only.strip(), emoji_list

# Applying the function to the dataframe (use your dataframe with emoji reviews)
df_with_emojis['text'], df_with_emojis['emojis'] = zip(*df_with_emojis['review_description'].apply(extract_text_and_emojis))

# Showing the updated dataframe with text and emojis separated
print(df_with_emojis[['review_description', 'text', 'emojis']].head())


                                   review_description  \
14                                          Awesome 💯   
25                                       Love it! 😎🤙🏽   
47  This app is very bad my Ran away with money fr...   
61                                                  👍   
63                                                  👌   

                                                 text emojis  
14                                            Awesome      💯  
25                                         Love it! 🤙     😎🏽  
47  This app is very bad my Ran away with money fr...         
61                                                         👍  
63                                                         👌  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_emojis['text'], df_with_emojis['emojis'] = zip(*df_with_emojis['review_description'].apply(extract_text_and_emojis))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_emojis['text'], df_with_emojis['emojis'] = zip(*df_with_emojis['review_description'].apply(extract_text_and_emojis))


In [16]:
df_with_emojis.head()

Unnamed: 0,source,review_id,user_name,review_description,rating,thumbs_up,review_date,laguage_code,country_code,text,emojis
14,Google Play,472b106a-4b10-4577-b4dc-44197abc5b98,Lashay Russell,Awesome 💯,5,0.0,2023-08-10 17:17:33,en,in,Awesome,💯
25,Google Play,655939ab-5a34-4788-a3f3-9e30ed8807aa,R&R Kona,Love it! 😎🤙🏽,5,0.0,2023-08-10 16:56:33,en,in,Love it! 🤙,😎🏽
47,Google Play,c536d8c3-2b05-4177-a5ca-b7f0b4cc89a5,GOURAB PAL,This app is very bad my Ran away with money fr...,1,0.0,2023-08-10 16:03:43,en,in,This app is very bad my Ran away with money fr...,
61,Google Play,05b75417-bf92-459c-97dd-a0b2c9ffa4a1,Reena Chauhan,👍,5,0.0,2023-08-10 15:41:41,en,in,,👍
63,Google Play,20704767-086e-48ff-ab1b-500f3bb070e2,Trishant Singh,👌,5,0.0,2023-08-10 15:39:33,en,in,,👌


# Text Preprocessing

In [17]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Loading the necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initializing lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    # Removing emojis using regex
    text = re.sub(r"[^\w\s]", '', text)  # Remove punctuation
    text = re.sub(r"\d+", '', text)      # Remove numbers
    text = text.lower()                   # Lowercase
    words = text.split()                  # Tokenize
    # Remove stopwords and lemmatize
    text = ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)
    return text

# Applying preprocessing to the 'text' column
df_with_emojis['cleaned_text'] = df_with_emojis['text'].apply(preprocess_text)

# Displaying the cleaned text
print("Cleaned Text:")
print(df_with_emojis[['text', 'cleaned_text']].head())

# Checking for any empty cleaned texts
print("Empty cleaned texts:", df_with_emojis[df_with_emojis['cleaned_text'] == ''])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned Text:
                                                 text  \
14                                            Awesome   
25                                         Love it! 🤙   
47  This app is very bad my Ran away with money fr...   
61                                                      
63                                                      

                                         cleaned_text  
14                                            awesome  
25                                               love  
47  app bad ran away money uber man ran away money...  
61                                                     
63                                                     
Empty cleaned texts:               source                             review_id  \
61       Google Play  05b75417-bf92-459c-97dd-a0b2c9ffa4a1   
63       Google Play  20704767-086e-48ff-ab1b-500f3bb070e2   
94       Google Play  3719ddae-4e99-4251-a408-cf9e5ca938d1   
112      Google Play  53091449-ff5d-41

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_emojis['cleaned_text'] = df_with_emojis['text'].apply(preprocess_text)


In [18]:
df_with_emojis.head()

Unnamed: 0,source,review_id,user_name,review_description,rating,thumbs_up,review_date,laguage_code,country_code,text,emojis,cleaned_text
14,Google Play,472b106a-4b10-4577-b4dc-44197abc5b98,Lashay Russell,Awesome 💯,5,0.0,2023-08-10 17:17:33,en,in,Awesome,💯,awesome
25,Google Play,655939ab-5a34-4788-a3f3-9e30ed8807aa,R&R Kona,Love it! 😎🤙🏽,5,0.0,2023-08-10 16:56:33,en,in,Love it! 🤙,😎🏽,love
47,Google Play,c536d8c3-2b05-4177-a5ca-b7f0b4cc89a5,GOURAB PAL,This app is very bad my Ran away with money fr...,1,0.0,2023-08-10 16:03:43,en,in,This app is very bad my Ran away with money fr...,,app bad ran away money uber man ran away money...
61,Google Play,05b75417-bf92-459c-97dd-a0b2c9ffa4a1,Reena Chauhan,👍,5,0.0,2023-08-10 15:41:41,en,in,,👍,
63,Google Play,20704767-086e-48ff-ab1b-500f3bb070e2,Trishant Singh,👌,5,0.0,2023-08-10 15:39:33,en,in,,👌,


In [19]:
# Checking for any empty cleaned texts
empty_cleaned_texts = df_with_emojis[df_with_emojis['cleaned_text'] == '']
print("Empty cleaned texts:")
print(empty_cleaned_texts[['text', 'cleaned_text']])


Empty cleaned texts:
        text cleaned_text
61                       
63                       
94                       
112                      
169                      
...      ...          ...
1066098                  
1066318                  
1066985                  
1066998                  
1067349                  

[7618 rows x 2 columns]


In [20]:
df_with_emojis = df_with_emojis[df_with_emojis['cleaned_text'] != '']


In [21]:
df_with_emojis.shape

(42220, 12)

# Emoji Proprocessing

In [22]:
df_with_emojis['emoji_count'] = df_with_emojis['emojis'].apply(lambda x: len(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_emojis['emoji_count'] = df_with_emojis['emojis'].apply(lambda x: len(x))


In [23]:
df_with_emojis.head()

Unnamed: 0,source,review_id,user_name,review_description,rating,thumbs_up,review_date,laguage_code,country_code,text,emojis,cleaned_text,emoji_count
14,Google Play,472b106a-4b10-4577-b4dc-44197abc5b98,Lashay Russell,Awesome 💯,5,0.0,2023-08-10 17:17:33,en,in,Awesome,💯,awesome,1
25,Google Play,655939ab-5a34-4788-a3f3-9e30ed8807aa,R&R Kona,Love it! 😎🤙🏽,5,0.0,2023-08-10 16:56:33,en,in,Love it! 🤙,😎🏽,love,2
47,Google Play,c536d8c3-2b05-4177-a5ca-b7f0b4cc89a5,GOURAB PAL,This app is very bad my Ran away with money fr...,1,0.0,2023-08-10 16:03:43,en,in,This app is very bad my Ran away with money fr...,,app bad ran away money uber man ran away money...,0
69,Google Play,4ab90844-622e-410b-bed5-903b36e6b947,Shri Khatu Motors,Unnecessarily takes very long route to make ex...,1,0.0,2023-08-10 15:30:24,en,in,Unnecessarily takes very long route to make ex...,😡😡😡,unnecessarily take long route make extra money...,3
77,Google Play,250e459f-d1c5-42f7-9201-74478be3e10c,sai yeswanth,i dont like to review i have paid 271 for 5 km...,1,0.0,2023-08-10 15:17:54,en,in,i dont like to review i have paid 271 for 5 km...,🌟,dont like review paid km customer support well...,1


In [24]:
total_emoji_count = df_with_emojis['emoji_count'].sum()
print("Total emoji count:", total_emoji_count)


Total emoji count: 78180


In [25]:
emoji_frequency = df_with_emojis['emojis'].explode().value_counts()
print(emoji_frequency.head(10))


emojis
👍     7561
      5311
👌     2007
😊     1775
😡     1132
👎      701
😍      576
👍👍     574
🙂      564
😠      449
Name: count, dtype: int64


In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np


# Splitting the dataset

In [27]:
# Assuming 'rating' is the target variable for classification
X = df_with_emojis[['cleaned_text', 'emoji_count']]
y = df_with_emojis['rating']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Early Fusion

In [28]:
# TF-IDF Vectorization of cleaned_text
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train['cleaned_text'])
tfidf_test = tfidf_vectorizer.transform(X_test['cleaned_text'])

# Converting emoji_count to array and reshape
emoji_train = X_train['emoji_count'].values.reshape(-1, 1)
emoji_test = X_test['emoji_count'].values.reshape(-1, 1)

# Combining TF-IDF and emoji_count for early fusion
X_train_early = np.hstack((tfidf_train.toarray(), emoji_train))
X_test_early = np.hstack((tfidf_test.toarray(), emoji_test))

# Training Logistic Regression model
model_early = LogisticRegression()
model_early.fit(X_train_early, y_train)

# Predictions and evaluation
y_pred_early = model_early.predict(X_test_early)
accuracy_early_bf = accuracy_score(y_test, y_pred_early)
print("Early Fusion Accuracy:", accuracy_early_bf)
print(classification_report(y_test, y_pred_early))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Early Fusion Accuracy: 0.8257934628138323
              precision    recall  f1-score   support

           1       0.75      0.86      0.80      2074
           2       0.00      0.00      0.00       236
           3       0.00      0.00      0.00       257
           4       0.43      0.01      0.01       508
           5       0.86      0.96      0.91      5369

    accuracy                           0.83      8444
   macro avg       0.41      0.37      0.34      8444
weighted avg       0.75      0.83      0.77      8444



# Late Fusion

In [29]:
# TF-IDF Vectorization for text only
tfidf_train_text = tfidf_vectorizer.fit_transform(X_train['cleaned_text'])
tfidf_test_text = tfidf_vectorizer.transform(X_test['cleaned_text'])

# Training model for text modality
model_text = LogisticRegression()
model_text.fit(tfidf_train_text, y_train)

# Training model for emoji modality
model_emoji = LogisticRegression()
model_emoji.fit(emoji_train, y_train)

# Predictions
y_pred_text = model_text.predict(tfidf_test_text)
y_pred_emoji = model_emoji.predict(emoji_test)

# Combining predictions (simple averaging)
y_pred_late = np.round((y_pred_text + y_pred_emoji) / 2).astype(int)

# Evaluation
accuracy_late_bf = accuracy_score(y_test, y_pred_late)
print("Late Fusion Accuracy:", accuracy_late_bf)
print(classification_report(y_test, y_pred_late))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Late Fusion Accuracy: 0.6316911416390336
              precision    recall  f1-score   support

           1       0.89      0.02      0.03      2074
           2       0.00      0.00      0.00       236
           3       0.05      0.51      0.10       257
           4       0.50      0.02      0.03       508
           5       0.86      0.96      0.91      5369

    accuracy                           0.63      8444
   macro avg       0.46      0.30      0.21      8444
weighted avg       0.80      0.63      0.59      8444



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Hybrid Fusion

In [30]:
# Training hybrid model on early fusion
model_hybrid = LogisticRegression()
model_hybrid.fit(X_train_early, y_train)

# Predictions
y_pred_hybrid = model_hybrid.predict(X_test_early)

# Evaluation
accuracy_hybrid_bf = accuracy_score(y_test, y_pred_hybrid)
print("Hybrid Fusion Accuracy:", accuracy_hybrid_bf)
print(classification_report(y_test, y_pred_hybrid))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Hybrid Fusion Accuracy: 0.8257934628138323
              precision    recall  f1-score   support

           1       0.75      0.86      0.80      2074
           2       0.00      0.00      0.00       236
           3       0.00      0.00      0.00       257
           4       0.43      0.01      0.01       508
           5       0.86      0.96      0.91      5369

    accuracy                           0.83      8444
   macro avg       0.41      0.37      0.34      8444
weighted avg       0.75      0.83      0.77      8444



# After Fine Tuning

# Early Fusion

In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(df_with_emojis[['cleaned_text', 'emoji_count']],
                                                    df_with_emojis['rating'], test_size=0.2, random_state=42)

# TF-IDF Vectorization for text modality
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.01, ngram_range=(1, 2), max_features=5000)
tfidf_train = tfidf_vectorizer.fit_transform(X_train['cleaned_text'])
tfidf_test = tfidf_vectorizer.transform(X_test['cleaned_text'])

# Emoji feature scaling
scaler = StandardScaler()
emoji_train_scaled = scaler.fit_transform(X_train[['emoji_count']])
emoji_test_scaled = scaler.transform(X_test[['emoji_count']])

# Concatenating TF-IDF and emoji features for early fusion
X_train_early = np.hstack((tfidf_train.toarray(), emoji_train_scaled))
X_test_early = np.hstack((tfidf_test.toarray(), emoji_test_scaled))

# Logistic Regression model with Grid Search
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search_early = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search_early.fit(X_train_early, y_train)

# Evaluating Early Fusion Model
y_pred_early = grid_search_early.predict(X_test_early)
accuracy_early_af = accuracy_score(y_test, y_pred_early)
print(f"Early Fusion Accuracy After Fine Tuning: {accuracy_early_af}")
print(classification_report(y_test, y_pred_early))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Early Fusion Accuracy After Fine Tuning: 0.8049502605400284
              precision    recall  f1-score   support

           1       0.72      0.80      0.76      2074
           2       0.00      0.00      0.00       236
           3       0.00      0.00      0.00       257
           4       0.00      0.00      0.00       508
           5       0.84      0.96      0.89      5369

    accuracy                           0.80      8444
   macro avg       0.31      0.35      0.33      8444
weighted avg       0.71      0.80      0.75      8444



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Late Fusion

In [34]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Assuming tfidf_train and tfidf_test are ready
# Training a model for the text modality (TF-IDF)
model_text = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model_text.fit(tfidf_train, y_train)

# Training a model for the emoji modality (emoji count)
model_emoji = RandomForestClassifier(n_estimators=100, random_state=42)
model_emoji.fit(emoji_train_scaled, y_train)

# Predict probabilities for each class in both models
pred_text_probs = model_text.predict_proba(tfidf_test)
pred_emoji_probs = model_emoji.predict_proba(emoji_test_scaled)

# Ensure both models produce predictions for all classes
# If they differ, this ensures both arrays have columns for each class present in y_train
classes = np.unique(y_train)
pred_text_probs = pd.DataFrame(pred_text_probs, columns=classes)
pred_emoji_probs = pd.DataFrame(pred_emoji_probs, columns=classes)

# Weighted Late Fusion: Calculate weighted average of predictions for each class
# (Adjust weights as needed)
weights_text = 0.7
weights_emoji = 0.3
pred_late_probs = (weights_text * pred_text_probs) + (weights_emoji * pred_emoji_probs)

# For each sample, choose the class with the highest probability
y_pred_late = pred_late_probs.idxmax(axis=1).astype(int)

# Evaluate Late Fusion Model
accuracy_late_af = accuracy_score(y_test, y_pred_late)
print(f"Late Fusion Accuracy After Fine Tuning: {accuracy_late_af}")
print(classification_report(y_test, y_pred_late))




Late Fusion Accuracy After Fine Tuning: 0.8024632875414496
              precision    recall  f1-score   support

           1       0.76      0.75      0.75      2074
           2       0.00      0.00      0.00       236
           3       0.00      0.00      0.00       257
           4       0.00      0.00      0.00       508
           5       0.82      0.97      0.89      5369

    accuracy                           0.80      8444
   macro avg       0.32      0.34      0.33      8444
weighted avg       0.71      0.80      0.75      8444



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Hybrid Fusion

In [36]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Assuming models for text and emoji are already trained:
# `model_text` is the text model, and `model_emoji` is the emoji model

# Late Fusion Predictions for training data
pred_text_train = model_text.predict_proba(tfidf_train)[:, 1]   # Training set text predictions
pred_emoji_train = model_emoji.predict_proba(emoji_train_scaled)[:, 1]   # Training set emoji predictions

# Late Fusion Predictions for test data
pred_text_test = model_text.predict_proba(tfidf_test)[:, 1]   # Test set text predictions
pred_emoji_test = model_emoji.predict_proba(emoji_test_scaled)[:, 1]   # Test set emoji predictions

# Hybrid Fusion: Combine Early Fusion Features with Late Fusion Predictions
# Assuming `X_train_early` and `X_test_early` are prepared with early fusion features

X_train_hybrid = np.hstack((X_train_early, pred_text_train.reshape(-1, 1), pred_emoji_train.reshape(-1, 1)))
X_test_hybrid = np.hstack((X_test_early, pred_text_test.reshape(-1, 1), pred_emoji_test.reshape(-1, 1)))

# Training a Logistic Regression Model on Hybrid Features
model_hybrid = LogisticRegression()
model_hybrid.fit(X_train_hybrid, y_train)

# Hybrid Fusion Predictions
y_pred_hybrid = model_hybrid.predict(X_test_hybrid)

# Evaluating the Hybrid Fusion Model
accuracy_hybrid_af = accuracy_score(y_test, y_pred_hybrid)
print(f"Hybrid Fusion Accuracy After Fine Tuning: {accuracy_hybrid_af}")
print(classification_report(y_test, y_pred_hybrid))


Hybrid Fusion Accuracy After Fine Tuning: 0.8045949786830886
              precision    recall  f1-score   support

           1       0.72      0.80      0.76      2074
           2       0.00      0.00      0.00       236
           3       0.00      0.00      0.00       257
           4       0.00      0.00      0.00       508
           5       0.84      0.96      0.89      5369

    accuracy                           0.80      8444
   macro avg       0.31      0.35      0.33      8444
weighted avg       0.71      0.80      0.75      8444



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Accuracy pre fine tuning

In [37]:
print(f"Early Fusion Accuracy: {accuracy_early_bf}")
print(f"Late Fusion Accuracy: {accuracy_late_bf}")
print(f"Hybrid Fusion Accuracy: {accuracy_hybrid_bf}")


Early Fusion Accuracy: 0.8257934628138323
Late Fusion Accuracy: 0.6316911416390336
Hybrid Fusion Accuracy: 0.8257934628138323


# Accuracy post fine tuning

In [38]:
print(f"Early Fusion Accuracy: {accuracy_early_af}")
print(f"Late Fusion Accuracy: {accuracy_late_af}")
print(f"Hybrid Fusion Accuracy: {accuracy_hybrid_af}")


Early Fusion Accuracy: 0.8049502605400284
Late Fusion Accuracy: 0.8024632875414496
Hybrid Fusion Accuracy: 0.8045949786830886


# Sentiment Prediction

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Defining sentiment labels based on rating
def map_sentiment(rating):
    if rating in [1, 2]:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    elif rating in [4, 5]:
        return 'positive'

# Applying the function to create a sentiment column
df_with_emojis['sentiment'] = df_with_emojis['rating'].apply(map_sentiment)

# Preparing features using early fusion (text + emoji counts)
X = df_with_emojis[['cleaned_text', 'emoji_count']]

# Vectorizing text features using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df_with_emojis['cleaned_text'])

# Combining TF-IDF features with emoji count
X_combined = np.hstack((X_tfidf.toarray(), df_with_emojis[['emoji_count']].values))

# Targetting variable: Sentiment
y = df_with_emojis['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

# Initializing Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predicting sentiment on test data
y_pred = clf.predict(X_test)

# Evaluating the Random Forest model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Logistic Regression for comparison
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

# Evaluating Logistic Regression model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(classification_report(y_test, y_pred_lr))


Random Forest Accuracy: 0.8923495973472289
              precision    recall  f1-score   support

    negative       0.82      0.86      0.84      2407
     neutral       0.09      0.00      0.01       246
    positive       0.93      0.94      0.93      5791

    accuracy                           0.89      8444
   macro avg       0.61      0.60      0.59      8444
weighted avg       0.87      0.89      0.88      8444

Logistic Regression Accuracy: 0.9049028896257698
              precision    recall  f1-score   support

    negative       0.85      0.87      0.86      2407
     neutral       1.00      0.00      0.01       246
    positive       0.93      0.96      0.94      5791

    accuracy                           0.90      8444
   macro avg       0.93      0.61      0.60      8444
weighted avg       0.91      0.90      0.89      8444



In [41]:
# Adding the sentiment predictions from Logistic Regression to the dataframe
df_with_emojis['predicted_sentiment'] = log_reg.predict(X_combined)

# Displaying reviews along with their actual ratings, reviews, and predicted sentiment
reviews_with_sentiment = df_with_emojis[['review_description', 'rating', 'predicted_sentiment']]

# Printing the first few rows
reviews_with_sentiment.head(10)


Unnamed: 0,review_description,rating,predicted_sentiment
14,Awesome 💯,5,positive
25,Love it! 😎🤙🏽,5,positive
47,This app is very bad my Ran away with money fr...,1,negative
69,Unnecessarily takes very long route to make ex...,1,negative
77,i dont like to review i have paid 271 for 5 km...,1,negative
88,Super👍,5,positive
91,The best car service ever👍🏽,5,positive
99,Great 👍,5,positive
109,VERY EFFICIENT & GOT ME THERE ON TIME👍👍❤️😎,5,positive
143,Awesome auto very clean and neet very good beh...,5,positive
