# Introduction
The objective of this project is to analyse the Movie Review dataset available on Zindi and develop an NLP machine learning model which, when given a review, classifies it as expressing positive or negative sentiment (the dataset's labels are binary). 
This objective will be accomplished using the CRISP-DM methodology. 

•	This project builds 4 sentiment analysis models using pre-trained models from huggingface.

•	Huggingface is an AI-powered open-source platform for natural language processing (NLP) that provides access to pre-trained models, datasets, and evaluation metrics. It allows developers to easily train, test and deploy natural language processing models and build applications such as chatbots, language translation, and text summarization.

The models we will be building are:

- [x] Distilbert-Base-uncased: 
- [x] XLNet-Base-cased: 
- [x] RoBERTa-Base:
- [x] CardiffNLP's Twitter RoBERTa Base:


# Installing Dependencies

In [3]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.w

In [4]:
import pandas as pd
import numpy as np
import numpy as np
from datasets import load_metric
import os
import matplotlib.pyplot as plt
import wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from transformers import TrainingArguments

#  Importing Data
The dataset contains movie reviews along with their associated binary sentiment. The core dataset contains 50,000 reviews split evenly into 25k train and 25k test sets. 

The dataset can be downloaded [from Zindi here](https://zindi.africa/competitions/movie-review-sentiment-classification-challenge/data)

The data used in this project is the sole property of Zindi and the competition host. There are restrictions on how to transmit, duplicate, publish, redistribute or otherwise provide or make available any competition data to any party not participating in the Competition (this includes uploading the data to any public site such as Kaggle or GitHub)

In [5]:
df_train = pd.read_csv("Train.csv")
#df_test = pd.read_csv("Test.csv")

FileNotFoundError: ignored

# EDA

In [None]:
df_train.info()

In [None]:
df_train.head()

In [None]:
df_train['sentiment'].value_counts()

In [None]:
df_train.isnull().sum()
#df.isnull()

In [None]:
#positive tweets
df_train[df_train['sentiment'] == "positive"]['content'].values[:1]

In [None]:
#negative reviews
df_train[df_train['sentiment'] == "negative"]['content'].values[:2]

In [None]:
# Let's count the number of tweets by sentiments
sentiment_counts = df_train.groupby(['sentiment']).size()
print(sentiment_counts)

# Let's visualize the sentiments
fig = plt.figure(figsize=(6,6), dpi=100)
ax = plt.subplot(111)
sentiment_counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=270, fontsize=12, label="")


In [None]:
# Using wordcloud to visualize the negative reviews.
# Only the review text is used: stringifying the whole filtered frame would
# also feed the row index and the 'sentiment' column into the cloud, so the
# word "negative" (once per row) would dominate the picture.
negative_mask = df_train['sentiment'] == "negative"
words = " ".join(df_train.loc[negative_mask, 'content'].dropna().astype(str))

stopwords = set(STOPWORDS)
# Named `negative_cloud` so the imported `wordcloud` module is not shadowed here.
negative_cloud = WordCloud(stopwords=stopwords, background_color="white").generate(words)
plt.figure( figsize=(15,10))
plt.imshow(negative_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
#Using wordcloud to visualize the reviews (all sentiments together)
# Render the whole 'content' column as one string for the word-cloud generator.
words = df_train['content'].to_string()

# Exclude the library's built-in stop-word list from the cloud.
stopwords = set(STOPWORDS)
# NOTE: assigning to `wordcloud` shadows the `wordcloud` module imported at the
# top of the notebook.
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(words)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes, only the image is of interest
plt.show()

# Feature Engineering

In [None]:
df_train['label'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
# Add column to replace values for sentiment as a numeric feature
df_train['label1'] = df_train['sentiment'].replace({'positive':1,'negative':0})


In [None]:
df_train

In [None]:
# Split the train data into train, eval
train, eval = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train['label'])

In [None]:
# Save the split subsets
train.to_csv("train_label.csv", index=False)
eval.to_csv("eval_label.csv", index=False)

In [None]:
# Load the two CSV splits as a DatasetDict with 'train' and 'eval' entries.
# The files were written by pandas `to_csv`, which encodes as UTF-8 by default,
# so they must be decoded as UTF-8; reading them as ISO-8859-1 turns every
# non-ASCII character in the reviews into mojibake.
data = load_dataset('csv', data_files={'train': 'train_label.csv','eval': 'eval_label.csv'}, encoding="utf-8")

# Model Creation
We will be using a pre-existing deep learning model that has been trained on a large dataset and adapting it to our dataset. This is known as Fine-tuning. It can be an efficient and effective approach for creating high-performing models for a wide range of natural language processing tasks.

The advantages of fine-tuning a pretrained model include the ability to quickly and efficiently create high-performing models with limited data, as well as the ability to leverage the knowledge and features learned by the pre-existing model, which can lead to better performance on the new task. We will be fine-tuning the following pre-trained models:

- [x] Distilbert-Base-uncased
- [x] XLNet-Base-cased
- [x] RoBERTa-Base
- [x] CardiffNLP Twitter RoBERTa-Base

# MODEL 1.   Fine-tuning a DistilBERT Model 

---
---
DistilBERT is a transformer-based language model trained on the same large-scale corpus as BERT. The model has achieved state-of-the-art results on various benchmark datasets, while requiring significantly less computational resources and time.

In [None]:
# Loading the model tokenizer using AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", num_labels=2)


In [None]:
def transform_labels(label):
    """Build the ``labels`` entry for one dataset row.

    Args:
        label: A row mapping that has a ``'label'`` entry.

    Returns:
        ``{'labels': 1}`` when the row's ``'label'`` equals 1 (positive),
        otherwise ``{'labels': 0}`` - this covers 0 (negative) as well as any
        other value.
    """
    return {'labels': 1 if label['label'] == 1 else 0}

def tokenize_data(example):
    """Tokenize the ``'content'`` field with the module-level ``tokenizer``.

    Args:
        example: A mapping with a ``'content'`` entry holding the text(s).

    Returns:
        Whatever ``tokenizer`` returns for that text when called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = example['content']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Change the tweets to tokens that the models can exploit
dataset = data.map(tokenize_data, batched=True)

# Transform	labels and remove the useless columns
remove_columns = ['label', 'content', 'sentiment']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

In [None]:
dataset

In [None]:
# Defining the training arguments for the DistilBERT fine-tuning run.
# SECURITY: never keep a Hugging Face access token in the notebook, not even in
# a comment - anyone who can read the file can use it. Supply the token from
# the environment at login time instead.
repo_name = "Movie_Review_Sentiment_Analysis"
training_args = TrainingArguments(
    output_dir=repo_name,  # same directory name the positional argument used before
    num_train_epochs=5,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",        # ... and checkpoint on the same schedule
    push_to_hub=True,
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2,id2label={0: 'negative review', 1: 'positive review'})

In [None]:
#defining the evaluation metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of the logits
        (taken over the last axis) against the reference labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

In [None]:
train_dataset = dataset['train'].shuffle(seed=10) 
eval_dataset = dataset['eval'].shuffle(seed=10)

In [None]:
#logging into hugging face account
import os
from huggingface_hub import notebook_login,  login

# Read the access token from the environment instead of hard-coding it: a token
# saved with the notebook is exposed to every reader. Any token that was ever
# committed in this file should be treated as compromised and revoked.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token, add_to_git_credential=True)
else:
    # No token configured: fall back to the interactive prompt.
    notebook_login()


In [None]:
#trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,compute_metrics=compute_metrics)


In [None]:
#converting training data to PyTorch tensors to speed up training and adding padding:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    NOTE(review): same definition as the ``compute_metrics`` in the earlier
    metrics cell of this notebook.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of the logits
        (taken over the last axis) against the reference labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

In [None]:
# NOTE(review): `ASDFA` is not defined anywhere in the visible notebook, so this
# line raises NameError. Presumably a deliberate stop so that "Run all" halts
# before the training cells - confirm, and replace with an explicit guard.
ASD=ASDFA

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
trainer.train()

In [None]:
# Launch the final evaluation 
trainer.evaluate()
#tokenizer.save_pretrained

In [None]:
#saving model to hub
trainer.push_to_hub()

In [None]:
# Launch the final evaluation 
trainer.evaluate()


# MODEL 2.   Fine-tuning an XLNet Model 

---
---
XLNet is a transformer-based language model that uses a permutation language modeling approach. The model has been pre-trained on a large corpus of text and can be fine-tuned on various downstream NLP tasks such as sentiment analysis, text classification


In [None]:
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased", num_labels=2)

def transform_labels(label):
    """Build the ``labels`` entry for one dataset row.

    Args:
        label: A row mapping that has a ``'label'`` entry.

    Returns:
        ``{'labels': 1}`` when the row's ``'label'`` equals 1 (positive),
        otherwise ``{'labels': 0}`` - this covers 0 (negative) as well as any
        other value.
    """
    return {'labels': 1 if label['label'] == 1 else 0}

def tokenize_data(example):
    """Tokenize the ``'content'`` field with the module-level ``tokenizer``.

    Args:
        example: A mapping with a ``'content'`` entry holding the text(s).

    Returns:
        Whatever ``tokenizer`` returns for that text when called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = example['content']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Change the tweets to tokens that the models can exploit
dataset = data.map(tokenize_data, batched=True)

# Transform	labels and remove the useless columns
remove_columns = ['label', 'content', 'sentiment']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)


repo_name = "XLNet_on_Movie_Review_Data"

#defining the training arguments
training_args = TrainingArguments(
    "XLNet_on_Movie_Review_Data", 
    num_train_epochs=4, 
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2,id2label={0: 'negative review', 1: 'positive review'})

#defining the evaluation metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of the logits
        (taken over the last axis) against the reference labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

# Re-derive the splits and the collator from the XLNet-tokenized `dataset`.
# `train_dataset`, `eval_dataset` and `data_collator` were created in the
# DistilBERT section; reusing them would train XLNet on DistilBERT token ids
# and pad batches with the DistilBERT tokenizer.
from transformers import DataCollatorWithPadding

train_dataset = dataset['train'].shuffle(seed=10)
eval_dataset = dataset['eval'].shuffle(seed=10)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the training 
trainer.train()



# MODEL 3.   Fine-tuning a Roberta Model 

---
---
RoBERTa is a transformer-based language model introduced by Facebook AI in 2019. It is a variant of the Bert model that is pre-trained on a massive amount of text data, including BooksCorpus (800 million words) and the English Wikipedia (2.5 billion words), using a modified version of the BERT pre-training procedure. 


In [None]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base", num_labels=2)

def transform_labels(label):
    """Build the ``labels`` entry for one dataset row.

    Args:
        label: A row mapping that has a ``'label'`` entry.

    Returns:
        ``{'labels': 1}`` when the row's ``'label'`` equals 1 (positive),
        otherwise ``{'labels': 0}`` - this covers 0 (negative) as well as any
        other value.
    """
    return {'labels': 1 if label['label'] == 1 else 0}

def tokenize_data(example):
    """Tokenize the ``'content'`` field with the module-level ``tokenizer``.

    Args:
        example: A mapping with a ``'content'`` entry holding the text(s).

    Returns:
        Whatever ``tokenizer`` returns for that text when called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = example['content']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Change the tweets to tokens that the models can exploit
dataset = data.map(tokenize_data, batched=True)

# Transform	labels and remove the useless columns
remove_columns = ['label', 'content', 'sentiment']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)


repo_name = "roberta_on_Movie_Review_Data"

#defining the training arguments
training_args = TrainingArguments(
    "roberta_on_Movie_Review_Data", 
    num_train_epochs=4, 
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

#defining the evaluation metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of the logits
        (taken over the last axis) against the reference labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

# Re-derive the splits and the collator from the RoBERTa-tokenized `dataset`.
# `train_dataset`, `eval_dataset` and `data_collator` were created in the
# DistilBERT section; reusing them would train RoBERTa on DistilBERT token ids
# and pad batches with the DistilBERT tokenizer.
from transformers import DataCollatorWithPadding

train_dataset = dataset['train'].shuffle(seed=10)
eval_dataset = dataset['eval'].shuffle(seed=10)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the training 
trainer.train()



# MODEL 4.   Fine-tuning a Cardiffnlp Model 

---
---
CardiffNLP's Twitter RoBERTa Base model is a pre-trained language model for sentiment analysis on Twitter data. It is based on the RoBERTa architecture. The model has also been fine-tuned on several sentiment analysis tasks to improve its performance on Twitter sentiment analysis.


In [None]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=2)

def transform_labels(label):
    """Build the ``labels`` entry for one dataset row.

    Args:
        label: A row mapping that has a ``'label'`` entry.

    Returns:
        ``{'labels': 1}`` when the row's ``'label'`` equals 1 (positive),
        otherwise ``{'labels': 0}`` - this covers 0 (negative) as well as any
        other value.
    """
    return {'labels': 1 if label['label'] == 1 else 0}

def tokenize_data(example):
    """Tokenize the ``'content'`` field with the module-level ``tokenizer``.

    Args:
        example: A mapping with a ``'content'`` entry holding the text(s).

    Returns:
        Whatever ``tokenizer`` returns for that text when called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = example['content']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Change the tweets to tokens that the models can exploit
dataset = data.map(tokenize_data, batched=True)

# Transform	labels and remove the useless columns
remove_columns = ['label', 'content', 'sentiment']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)


repo_name = "robertaBase_on_Movie_Review_Data"

#defining the training arguments
training_args = TrainingArguments(
    "robertaBase_on_Movie_Review_Data", 
    num_train_epochs=4, 
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=2,ignore_mismatched_sizes=True)

#defining the evaluation metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of the logits
        (taken over the last axis) against the reference labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

# Re-derive the splits and the collator from the `dataset` tokenized with the
# CardiffNLP tokenizer. `train_dataset`, `eval_dataset` and `data_collator`
# were created in the DistilBERT section; reusing them would train this model
# on DistilBERT token ids and pad batches with the DistilBERT tokenizer.
from transformers import DataCollatorWithPadding

train_dataset = dataset['train'].shuffle(seed=10)
eval_dataset = dataset['eval'].shuffle(seed=10)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the training 
trainer.train()



# Creating Gradio APPs

In [None]:
!pip install gradio 

# --Using the inference API
The Hugging Face Inference API is a web service provided by Hugging Face that allows developers to access models already saved on the hub via a simple API. The API can be accessed via a RESTful interface, making it easy to integrate with other systems.

In [None]:
import gradio as gr
# Creating a gradio app using the inference API.
# The title, description and example describe what this model actually does
# (movie-review sentiment); the earlier texts were about COVID-19 vaccine tweets.
App = gr.Interface.load("huggingface/allevelly/Movie_Review_Sentiment_Analysis",
  title="Sentiment analysis of movie reviews", description ="Sentiment analysis of movie reviews using a fine-tuned DistilBERT model",
 allow_flagging=False, examples=[["I loved this movie, the acting was superb!"]]
)

App.launch()


# --Using regular inference with pipeline
Using regular inference with pipeline in Gradio allows developers to leverage pre-trained models from the Hugging Face Model Hub and use it to create a web application with a user-friendly interface, where users can input data and get predictions.
Using the Hugging Face Transformers library, we can load a saved model and its corresponding tokenizer, and then create a pipeline that can be used to perform regular inference (i.e., making predictions on new data) using that model. This pipeline can then be used to create a Gradio app, which allows users to interact with the model through an intuitive interface. This allows for a more efficient and convenient way to perform inference, as the pipeline can handle tasks such as tokenization, feature extraction, and output formatting. 

In [None]:
from transformers import pipeline
 
# using pipeline for inference and prediction
# Let the pipeline load the tokenizer stored with the model repository: the
# module-level `tokenizer` was last rebound to the CardiffNLP RoBERTa tokenizer
# in the Model 4 section, which does not match this DistilBERT fine-tune.
sentiment_model = pipeline(model="allevelly/Movie_Review_Sentiment_Analysis")
sentiment_model("Recently shown on cable tv the movie opens with a disclaimer distancing itself from any co-operation of real life persons; that in itself is an eye catcher. Yet the script and acting from the main characters is superb and I found myself engrossed throughout.Due in no small way to the crisp, thoughtful and interesting dialogue.The film is about a meeting on one day between two real life musical 'legends' who formerly composed together then seperated.The film captures the essence of their lives and philosophies, in a story which proffers an explanation for their initial 'split'. What is so impressive is that the actors give such seemingly realistic portrayals of the characters they play,faults and all, that this viewer at least was left believing I was witnessing a true event in almost every detail. The great skill of this play is that with astute writing and fine acting a movie basically about 'two of us' talking can make an excellent picture. Worthy of at least an 8 out of 10")
#sentiment_model("Wow, this film was just bloody horrid. SO bad in fact that even though I didn\'t pay to see it, I still wanted my money back.<br /><br />The film is about nothing intelligible. It\'s a mish-mash of sci-fi cliche\'s that were done better by much more skilled film makers. The performances, especially by the leads were over the top in a less endearing Ed Wood sort of way. Speaking of Ed Wood, he\'d be proud of the character\'s dialogue. It\'s just too taciturn with no hint of irony or sense of humor. On top of that, it doesn\'t make sense, nor does the plot, or lackthereof.<br /><br />The visual effects are okay, but not enough to go "oh wow, that\'s cool" and they just seem to be thrown in to "be cool" rather than be a good plot device.<br /><br />The soundtrack was another mishmash of stuff that really never set any sort of mood. Again, it seemed as if the director was just throwing in songs in the film in an effort to "be cool".<br /><br />Which brings me to my final point. Perhaps if the director actually worried more about plot, story and dialogue instead of trying to "be cool", he wouldn\'t have made such a dorky cliche\' of a short film.")


# --Comparing the outputs of the Models using Gradio Parallel
Gradio offers the ability to run "multiple models in parallel" or run multiple models at the same time, side by side, and to compare their predictions on the same input data. The ability to run multiple models in parallel in Gradio allows developers to easily compare the performance of different models on a given task and make informed decisions about which model to use in their application.

In [None]:
import gradio as gr
from gradio.mix import Parallel, Series
app1 = gr.Interface.load("huggingface/allevelly/Movie_Review_Sentiment_Analysis")
app2 =gr.Interface.load("huggingface/allevelly/XLNet_on_Movie_Review_Data")
#app3= gr.Interface(my_language_model,"text","text")
Parallel(app1,app2).launch()

In [None]:
import gradio as gr
# Creating a gradio app using the inference API.
# This app loads the XLNet repository, so the texts name XLNet and movie
# reviews; the earlier texts described a DistilBERT COVID-19 tweet model.
App = gr.Interface.load("huggingface/allevelly/XLNet_on_Movie_Review_Data",
  title="Sentiment analysis of movie reviews", description ="Sentiment analysis of movie reviews using a fine-tuned XLNet model",
 allow_flagging=False, examples=[["I loved this movie, the acting was superb!"]]
)

App.launch()


# Creating the Streamlit APP

In [None]:
!pip install streamlit transformers torch

import streamlit as st
from transformers import pipeline


@st.cache(allow_output_mutation=True)
def load_model():
    """Create the text-classification pipeline for the Hub model.

    Returns:
        The object returned by ``pipeline`` for the ``'text-classification'``
        task and the ``allevelly/Movie_Review_Sentiment_Analysis`` model.
    """
    return pipeline(task='text-classification', model='allevelly/Movie_Review_Sentiment_Analysis')


def main():
    """Render the Streamlit page: read a review, classify it, show the score.

    The score of the top prediction is shown with ``st.error`` when the
    predicted label is a negative one and with ``st.success`` otherwise.
    """
    st.title("Movie Review Sentiment Analysis using Hugging Face API")
    st.write("Enter a movie review to classify its sentiment:")

    # Load the model
    classifier = load_model()

    # Get user input
    text_input = st.text_input("Input text", value='I loved this movie!')

    # Classify sentiment
    top = classifier(text_input)[0]

    # Display sentiment.
    # The DistilBERT model in this notebook is created with the labels
    # 'negative review' / 'positive review', so an exact comparison with
    # 'NEGATIVE' never matched and every result was shown as a success.
    # Accept any label starting with "negative" (case-insensitive), plus the
    # generic 'LABEL_0' name for class 0.
    label = str(top['label']).lower()
    if label.startswith('negative') or label == 'label_0':
        st.error(top['score'])
    else:
        st.success(top['score'])


# Nothing called main() before, so the page stayed empty; invoke it when the
# file is executed as a script.
if __name__ == "__main__":
    main()

#streamlit run app.py





# Developing an API Endpoint using FastAPI


In [None]:
from fastapi import FastAPI
from transformers import pipeline
import numpy as np

app = FastAPI()


model = pipeline('text-classification', model='allevelly/Movie_Review_Sentiment_Analysis')


@app.post('/predict')
async def predict_sentiment(text: str):
    """Classify the sentiment of ``text`` with the module-level ``model``.

    Args:
        text: The review text to classify.

    Returns:
        A dict with the top prediction's ``'sentiment'`` (its label) and
        ``'score'``.
    """
    from fastapi.concurrency import run_in_threadpool

    # The pipeline call is blocking; running it directly inside an async
    # handler would stall the event loop for the whole inference, so it is
    # handed to the worker thread pool instead.
    result = await run_in_threadpool(model, text)
    top = result[0]
    return {'sentiment': top['label'], 'score': top['score']}




In [6]:
pip freeze

absl-py==1.4.0
aeppl==0.0.33
aesara==2.7.9
aiohttp==3.8.4
aiosignal==1.3.1
alabaster==0.7.13
albumentations==1.2.1
altair==4.2.2
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arviz==0.12.1
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
async-timeout==4.0.2
atomicwrites==1.4.1
attrs==22.2.0
audioread==3.0.0
autograd==1.5
Babel==2.12.1
backcall==0.2.0
backports.zoneinfo==0.2.1
beautifulsoup4==4.6.3
bleach==6.0.0
blis==0.7.9
bokeh==2.4.3
branca==0.6.0
bs4==0.0.1
CacheControl==0.12.11
cachetools==5.3.0
catalogue==2.0.8
certifi==2022.12.7
cffi==1.15.1
cftime==1.6.2
chardet==4.0.0
charset-normalizer==3.0.1
click==8.1.3
clikit==0.6.2
cloudpickle==2.2.1
cmake==3.22.6
cmdstanpy==1.1.0
colorcet==3.0.1
colorlover==0.3.0
community==1.0.0b1
confection==0.0.4
cons==0.4.5
contextlib2==0.5.5
convertdate==2.4.0
crashtest==0.3.1
crcmod==1.7
cufflinks==0.17.3
cupy-cuda11x==11.0.0
cvxopt==1.3.0
cvxpy==1.2.3
cycler==0.11.0
cymem==2.0.7
Cython==0.29.33
dask==2022.2.1
datascience==0.17.6
dat